예제 #1
0
    def load_gene_list(self,
                       file_name,
                       filter_havana=True,
                       protein_coding=False,
                       known_only=False):
        """Reads gene annotation in gtf (gencode) format. It populates self.gene_list with such entries.

        Each kept record is appended to self.gene_list as a vector with the layout:
        [GenomicRegion, column 2, column 3, column 8, gene_id, transcript_id, gene_type,
        gene_status, gene_name, transcript_type, transcript_status, transcript_name, level,
        [], []]. Attributes missing from a record are stored as None. Blank lines and
        lines starting with "#" are ignored. If the file cannot be opened, an error is
        printed and the process exits with status 1.

        *Keyword arguments:*

            - file_name -- The gencode .gtf file name.
            - filter_havana -- If True, records whose second column is "HAVANA" are skipped.
            - protein_coding -- If True, keep only records whose gene_type is "protein_coding"
              and whose transcript_type (when present) is also "protein_coding".
            - known_only -- If True, keep only records whose gene_status is "KNOWN" and whose
              transcript_status (when present) is also "KNOWN".
        """
        # Opening GTF file
        try:
            gtf_file = open(file_name, "r")
        except (IOError, OSError):
            print("Error: Cannot find the annotation file: " + file_name)
            print("Please check the path in ~/rgtdata/data.config")
            sys.exit(1)

        # Attributes copied (in this order) into every gene_list entry
        addt_keys = ("gene_id", "transcript_id", "gene_type", "gene_status",
                     "gene_name", "transcript_type", "transcript_status",
                     "transcript_name", "level")

        # Reading GTF file; the context manager closes it even if a record fails to parse
        with gtf_file:
            for line in gtf_file:

                # Processing line (blank lines must be skipped before looking at the first char)
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                line_list = line.split("\t")
                if filter_havana and len(line_list) > 1 and line_list[1] == "HAVANA":
                    continue

                # Processing additional list of options (column 9: 'key "value";' pairs)
                addt_dict = dict()
                for addt_element in line_list[8].split(";"):
                    addt_element_list = [_f for _f in addt_element.split(" ") if _f]
                    if len(addt_element_list) < 2:
                        # Empty chunk (e.g. after the trailing ';') or a key without a value
                        continue
                    # Removing " symbol from string options
                    addt_dict[addt_element_list[0]] = addt_element_list[1].replace("\"", "")

                # filter non-protein-coding sequences, if required
                if protein_coding:
                    if addt_dict.get("gene_type") != "protein_coding":
                        continue
                    if addt_dict.get("transcript_type", "protein_coding") != "protein_coding":
                        continue

                # filter unknown sequences, if required
                if known_only:
                    if addt_dict.get("gene_status") != "KNOWN":
                        continue
                    if addt_dict.get("transcript_status", "KNOWN") != "KNOWN":
                        continue

                # Removing dot (version suffix) from IDs; transcript_id is optional
                addt_dict["gene_id"] = addt_dict["gene_id"].split(".")[0]
                if "transcript_id" in addt_dict:
                    addt_dict["transcript_id"] = addt_dict["transcript_id"].split(".")[0]

                # Creating final version of additional arguments (None when absent)
                final_addt_list = [addt_dict.get(addt_key) for addt_key in addt_keys]

                # Handling score
                current_score = 0
                if AuxiliaryFunctions.string_is_int(line_list[5]):
                    current_score = AuxiliaryFunctions.correct_standard_bed_score(
                        line_list[5])

                # Creating GenomicRegion (start is decremented by one; end is used as is)
                genomic_region = GenomicRegion(chrom=line_list[0],
                                               initial=int(line_list[3]) - 1,
                                               final=int(line_list[4]),
                                               orientation=line_list[6],
                                               data=current_score)

                # Creating final vector
                extra_index_elements = [
                    [], []
                ]  # One list for each: EXACT_GENE_MATCHES, INEXACT_GENE_MATCHES
                final_vector = [
                    genomic_region, line_list[1], line_list[2], line_list[7]
                ] + final_addt_list + extra_index_elements
                self.gene_list.append(final_vector)
예제 #2
0
    def load_gene_list(self, file_name, filter_havana=True):
        """
        Reads gene annotation in gtf (gencode) format. It populates self.gene_list with such entries.

        Each kept record is appended to self.gene_list as a vector with the layout:
        [GenomicRegion, column 2, column 3, column 8, gene_id, transcript_id, gene_type,
        gene_status, gene_name, transcript_type, transcript_status, transcript_name, level,
        [], []]. Attributes missing from a record are stored as None. Blank lines and
        lines starting with "#" are ignored.

        Keyword arguments:
        file_name -- The gencode .gtf file name.
        filter_havana -- If True, records whose second column is "HAVANA" are skipped.

        Raises: IOError/OSError if the file cannot be opened.

        Return: void.
        """
        # Attributes copied (in this order) into every gene_list entry
        addt_keys = ("gene_id", "transcript_id", "gene_type", "gene_status",
                     "gene_name", "transcript_type", "transcript_status",
                     "transcript_name", "level")

        # Opening GTF file; a failure to open propagates to the caller instead of being
        # swallowed, and the context manager closes the file on every exit path
        with open(file_name, "r") as gtf_file:

            # Reading GTF file
            for line in gtf_file:

                # Processing line (blank lines must be skipped before looking at the first char)
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                line_list = line.split("\t")
                if filter_havana and line_list[1] == "HAVANA":
                    continue

                # Processing additional list of options (column 9: 'key "value";' pairs).
                # Lists are built explicitly because filter() is not indexable on Python 3.
                addt_dict = dict()
                for addt_element in line_list[8].split(";"):
                    addt_element_list = [tok for tok in addt_element.split(" ") if tok]
                    if len(addt_element_list) < 2:
                        # Empty chunk (e.g. after the trailing ';') or a key without a value
                        continue
                    addt_dict[addt_element_list[0]] = addt_element_list[1].replace(
                        "\"", "")  # Removing " symbol from string options

                # Removing dot (version suffix) from IDs; transcript_id is optional
                addt_dict["gene_id"] = addt_dict["gene_id"].split(".")[0]
                if "transcript_id" in addt_dict:
                    addt_dict["transcript_id"] = addt_dict["transcript_id"].split(".")[0]

                # Creating final version of additional arguments (None when absent)
                final_addt_list = [addt_dict.get(addt_key) for addt_key in addt_keys]

                # Handling score
                current_score = 0
                if AuxiliaryFunctions.string_is_int(line_list[5]):
                    current_score = AuxiliaryFunctions.correct_standard_bed_score(
                        line_list[5])

                # Creating GenomicRegion (start is decremented by one; end is used as is)
                genomic_region = GenomicRegion(chrom=line_list[0],
                                               initial=int(line_list[3]) - 1,
                                               final=int(line_list[4]),
                                               orientation=line_list[6],
                                               data=current_score)

                # Creating final vector
                extra_index_elements = [
                    [], []
                ]  # One list for each: EXACT_GENE_MATCHES, INEXACT_GENE_MATCHES
                final_vector = [
                    genomic_region, line_list[1], line_list[2], line_list[7]
                ] + final_addt_list + extra_index_elements
                self.gene_list.append(final_vector)
예제 #3
0
    def load_gene_list(self, file_name, filter_havana=True, protein_coding=False, known_only=False):
        """Reads gene annotation in gtf (gencode) format. It populates self.gene_list with such entries.

        Each kept record is appended to self.gene_list as a vector with the layout:
        [GenomicRegion, column 2, column 3, column 8, gene_id, transcript_id, gene_type,
        gene_status, gene_name, transcript_type, transcript_status, transcript_name, level,
        [], []]. Attributes missing from a record are stored as None. Blank lines and
        lines starting with "#" are ignored. If the file cannot be opened, an error is
        printed and the process exits with status 1.

        *Keyword arguments:*

            - file_name -- The gencode .gtf file name.
            - filter_havana -- If True, records whose second column is "HAVANA" are skipped.
            - protein_coding -- If True, keep only records whose gene_type is "protein_coding"
              and whose transcript_type (when present) is also "protein_coding".
            - known_only -- If True, keep only records whose gene_status is "KNOWN" and whose
              transcript_status (when present) is also "KNOWN".
        """
        # Opening GTF file
        try:
            gtf_file = open(file_name, "r")
        except (IOError, OSError):
            print("Error: Cannot find the annotation file: "+file_name)
            print("Please check the path in ~/rgtdata/data.config")
            sys.exit(1)

        # Attributes copied (in this order) into every gene_list entry
        addt_keys = ("gene_id", "transcript_id", "gene_type", "gene_status", "gene_name",
                     "transcript_type", "transcript_status", "transcript_name", "level")

        # Reading GTF file; the context manager closes it even if a record fails to parse
        with gtf_file:
            for line in gtf_file:

                # Processing line (blank lines must be skipped before looking at the first char)
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                line_list = line.split("\t")
                if filter_havana and len(line_list) > 1 and line_list[1] == "HAVANA":
                    continue

                # Processing additional list of options (column 9: 'key "value";' pairs).
                # Lists are built explicitly because filter() is not indexable on Python 3.
                addt_dict = dict()
                for addt_element in line_list[8].split(";"):
                    addt_element_list = [tok for tok in addt_element.split(" ") if tok]
                    if len(addt_element_list) < 2:
                        # Empty chunk (e.g. after the trailing ';') or a key without a value
                        continue
                    # Removing " symbol from string options
                    addt_dict[addt_element_list[0]] = addt_element_list[1].replace("\"", "")

                # filter non-protein-coding sequences, if required
                if protein_coding:
                    if addt_dict.get("gene_type") != "protein_coding":
                        continue
                    if addt_dict.get("transcript_type", "protein_coding") != "protein_coding":
                        continue

                # filter unknown sequences, if required
                if known_only:
                    if addt_dict.get("gene_status") != "KNOWN":
                        continue
                    if addt_dict.get("transcript_status", "KNOWN") != "KNOWN":
                        continue

                # Removing dot (version suffix) from IDs; transcript_id is optional
                addt_dict["gene_id"] = addt_dict["gene_id"].split(".")[0]
                if "transcript_id" in addt_dict:
                    addt_dict["transcript_id"] = addt_dict["transcript_id"].split(".")[0]

                # Creating final version of additional arguments (None when absent)
                final_addt_list = [addt_dict.get(addt_key) for addt_key in addt_keys]

                # Handling score
                current_score = 0
                if AuxiliaryFunctions.string_is_int(line_list[5]):
                    current_score = AuxiliaryFunctions.correct_standard_bed_score(line_list[5])

                # Creating GenomicRegion (start is decremented by one; end is used as is)
                genomic_region = GenomicRegion(chrom=line_list[0],
                                               initial=int(line_list[3])-1,
                                               final=int(line_list[4]),
                                               orientation=line_list[6],
                                               data=current_score)

                # Creating final vector
                extra_index_elements = [[], []]  # One list for each: EXACT_GENE_MATCHES, INEXACT_GENE_MATCHES
                final_vector = [genomic_region, line_list[1], line_list[2], line_list[7]] \
                    + final_addt_list + extra_index_elements
                self.gene_list.append(final_vector)
예제 #4
0
    def load_gene_list(self, file_name, filter_havana=True, protein_coding=False, known_only=False):
        """
        Reads gene annotation in gtf (gencode) format. It populates self.gene_list with such entries.

        Each kept record is appended to self.gene_list as a vector with the layout:
        [GenomicRegion, column 2, column 3, column 8, gene_id, transcript_id, gene_type,
        gene_status, gene_name, transcript_type, transcript_status, transcript_name, level,
        [], []]. Attributes missing from a record are stored as None. Blank lines and
        lines starting with "#" are ignored.

        Keyword arguments:
        file_name -- The gencode .gtf file name.
        filter_havana -- If True, records whose second column is "HAVANA" are skipped.
        protein_coding -- If True, keep only records whose gene_type is "protein_coding" and
                          whose transcript_type (when present) is also "protein_coding".
        known_only -- If True, keep only records whose gene_status is "KNOWN" and whose
                      transcript_status (when present) is also "KNOWN".

        The two filters look attributes up by name, so they do not depend on the order
        in which the attributes appear in the ninth column.

        Raises: IOError/OSError if the file cannot be opened.

        Return: void.
        """
        # Attributes copied (in this order) into every gene_list entry
        addt_keys = ("gene_id", "transcript_id", "gene_type", "gene_status", "gene_name",
                     "transcript_type", "transcript_status", "transcript_name", "level")

        # Opening GTF file; a failure to open propagates to the caller instead of being
        # swallowed, and the context manager closes the file on every exit path
        with open(file_name, "r") as gtf_file:

            # Reading GTF file
            for line in gtf_file:

                # Processing line (blank lines must be skipped before looking at the first char)
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                line_list = line.split("\t")
                if filter_havana and line_list[1] == "HAVANA":
                    continue

                # Processing additional list of options (column 9: 'key "value";' pairs).
                # Lists are built explicitly because filter() is not indexable on Python 3.
                addt_dict = dict()
                for addt_element in line_list[8].split(";"):
                    addt_element_list = [tok for tok in addt_element.split(" ") if tok]
                    if len(addt_element_list) < 2:
                        # Empty chunk (e.g. after the trailing ';') or a key without a value
                        continue
                    # Removing " symbol from string options
                    addt_dict[addt_element_list[0]] = addt_element_list[1].replace("\"", "")

                # Filtering non-protein-coding records, if required
                if protein_coding:
                    if addt_dict.get("gene_type") != "protein_coding":
                        continue
                    if addt_dict.get("transcript_type", "protein_coding") != "protein_coding":
                        continue

                # Filtering records without KNOWN status, if required
                if known_only:
                    if addt_dict.get("gene_status") != "KNOWN":
                        continue
                    if addt_dict.get("transcript_status", "KNOWN") != "KNOWN":
                        continue

                # Removing dot (version suffix) from IDs; transcript_id is optional
                addt_dict["gene_id"] = addt_dict["gene_id"].split(".")[0]
                if "transcript_id" in addt_dict:
                    addt_dict["transcript_id"] = addt_dict["transcript_id"].split(".")[0]

                # Creating final version of additional arguments (None when absent)
                final_addt_list = [addt_dict.get(addt_key) for addt_key in addt_keys]

                # Handling score
                current_score = 0
                if AuxiliaryFunctions.string_is_int(line_list[5]):
                    current_score = AuxiliaryFunctions.correct_standard_bed_score(line_list[5])

                # Creating GenomicRegion (start is decremented by one; end is used as is)
                genomic_region = GenomicRegion(chrom=line_list[0],
                                               initial=int(line_list[3])-1,
                                               final=int(line_list[4]),
                                               orientation=line_list[6],
                                               data=current_score)

                # Creating final vector
                extra_index_elements = [[], []]  # One list for each: EXACT_GENE_MATCHES, INEXACT_GENE_MATCHES
                final_vector = [genomic_region, line_list[1], line_list[2], line_list[7]] \
                    + final_addt_list + extra_index_elements
                self.gene_list.append(final_vector)