def _retrieve_features(adaptor, primary_id): sql = "SELECT seqfeature_id, type.name, rank" \ " FROM seqfeature join term type on (type_term_id = type.term_id)" \ " WHERE bioentry_id = %s" \ " ORDER BY rank" results = adaptor.execute_and_fetchall(sql, (primary_id,)) seq_feature_list = [] for seqfeature_id, seqfeature_type, seqfeature_rank in results: # Get qualifiers [except for db_xref which is stored separately] qvs = adaptor.execute_and_fetchall( "SELECT name, value" " FROM seqfeature_qualifier_value join term using (term_id)" " WHERE seqfeature_id = %s" " ORDER BY rank", (seqfeature_id,)) qualifiers = {} for qv_name, qv_value in qvs: qualifiers.setdefault(qv_name, []).append(qv_value) # Get db_xrefs [special case of qualifiers] qvs = adaptor.execute_and_fetchall( "SELECT dbxref.dbname, dbxref.accession" " FROM dbxref join seqfeature_dbxref using (dbxref_id)" " WHERE seqfeature_dbxref.seqfeature_id = %s" " ORDER BY rank", (seqfeature_id,)) for qv_name, qv_value in qvs: value = "%s:%s" % (qv_name, qv_value) qualifiers.setdefault("db_xref", []).append(value) # Get locations results = adaptor.execute_and_fetchall( "SELECT location_id, start_pos, end_pos, strand" " FROM location" " WHERE seqfeature_id = %s" " ORDER BY rank", (seqfeature_id,)) locations = [] # convert to Python standard form # Convert strand = 0 to strand = None # re: comment in Loader.py: # Biopython uses None when we don't know strand information but # BioSQL requires something (non null) and sets this as zero # So we'll use the strand or 0 if Biopython spits out None for location_id, start, end, strand in results: if start: start -= 1 if strand == 0: strand = None if strand not in (+1, -1, None): raise ValueError("Invalid strand %s found in database for " "seqfeature_id %s" % (strand, seqfeature_id)) if end < start: import warnings from Bio import BiopythonWarning warnings.warn("Inverted location start/end (%i and %i) for " "seqfeature_id %s" % (start, end, seqfeature_id), BiopythonWarning) 
locations.append((location_id, start, end, strand)) # Get possible remote reference information remote_results = adaptor.execute_and_fetchall( "SELECT location_id, dbname, accession, version" " FROM location join dbxref using (dbxref_id)" " WHERE seqfeature_id = %s", (seqfeature_id,)) lookup = {} for location_id, dbname, accession, version in remote_results: if version and version != "0": v = "%s.%s" % (accession, version) else: v = accession # subfeature remote location db_ref are stored as a empty string when # not present if dbname == "": dbname = None lookup[location_id] = (dbname, v) feature = SeqFeature.SeqFeature(type=seqfeature_type) # Store the key as a private property feature._seqfeature_id = seqfeature_id feature.qualifiers = qualifiers if len(locations) == 0: pass elif len(locations) == 1: location_id, start, end, strand = locations[0] # See Bug 2677, we currently don't record the location_operator # For consistency with older versions Biopython, default to "". feature.location_operator = \ _retrieve_location_qualifier_value(adaptor, location_id) dbname, version = lookup.get(location_id, (None, None)) feature.location = SeqFeature.FeatureLocation(start, end) feature.strand = strand feature.ref_db = dbname feature.ref = version else: sub_features = feature.sub_features assert sub_features == [] for location in locations: location_id, start, end, strand = location dbname, version = lookup.get(location_id, (None, None)) subfeature = SeqFeature.SeqFeature() subfeature.type = seqfeature_type subfeature.location = SeqFeature.FeatureLocation(start, end) # subfeature.location_operator = \ # _retrieve_location_qualifier_value(adaptor, location_id) subfeature.strand = strand subfeature.ref_db = dbname subfeature.ref = version sub_features.append(subfeature) # Locations are in order, but because of remote locations for # sub-features they are not necessarily in numerical order: strands = set(sf.strand for sf in sub_features) if len(strands) == 1 and -1 in 
strands: # Evil hack time for backwards compatibility # TODO - Check if BioPerl and (old) Biopython did the same, # we may have an existing incompatibility lurking here... locs = [f.location for f in sub_features[::-1]] else: # All forward, or mixed strands locs = [f.location for f in sub_features] feature.location = SeqFeature.CompoundLocation( locs, seqfeature_type) # TODO - See Bug 2677 - we don't yet record location_operator, # so for consistency with older versions of Biopython default # to assuming its a join. feature.location_operator = "join" seq_feature_list.append(feature) return seq_feature_list
def gff_to_gbk(genome_fasta, prot_fasta, annot_table, gff_file, species_name, gbk_out):
    """Create a GenBank file from a genome, its proteins, annotations and a GFF.

    Args:
        genome_fasta: fasta file with one record per contig of the genome.
        prot_fasta: fasta file of protein sequences; record IDs are looked up
            with the mRNA IDs found in the GFF.
        annot_table: tab-separated table associating gene names with GO
            terms, InterPro and EC annotations (columns are located with
            ``find_column_of_interest``).
        gff_file: GFF file read with gffutils; features of type gene, exon
            and mRNA (plus the RNA types handled by ``search_and_add_RNA``)
            are used.
        species_name: species name given to ``create_taxonomic_data``.
        gbk_out: output path/handle handed to ``SeqIO.write``.

    One SeqRecord is written per contig (contigs in sorted ID order). Each
    gene located on the contig gets a ``gene`` feature, its RNA and
    pseudogene features (via the helper functions), and one ``CDS`` feature
    per mRNA child carrying the translation and the GO / InterPro / EC
    annotations.

    Raises:
        KeyError: if an mRNA has no entry in ``prot_fasta`` or a GO term is
            unknown to both the namespace and alternative-ID tables.
    """
    print('Creating GFF database (gffutils)')
    # Create the gff database file.
    # gffutils use sqlite3 file-based database to access data inside GFF.
    # ':memory:' ask gffutils to keep database in memory instead of writing in a file.
    gff_database = gffutils.create_db(gff_file, ':memory:', force=True,
                                      keep_order=True, merge_strategy='merge',
                                      sort_attribute_values=True)

    print('Formatting fasta and annotation file')
    # Dictionary with scaffold/chromosome id as key and sequence as value.
    contig_seqs = OrderedDict()
    for record in SeqIO.parse(genome_fasta, "fasta"):
        contig_seqs[record.id] = record.seq

    # Dictionary with gene id as key and protein sequence as value.
    gene_protein_seq = {}
    for record in SeqIO.parse(prot_fasta, "fasta"):
        gene_protein_seq[record.id] = record.seq

    # Create a taxonomy dictionary querying the EBI.
    species_informations = create_taxonomic_data(species_name)

    # Read a tsv file containing GO terms, Interpro and EC associated with gene name.
    mapping_data = pa.read_csv(annot_table, sep='\t')
    mapping_data.replace(np.nan, '', inplace=True)
    gene_column, go_column, ec_column, ipr_column = find_column_of_interest(
        mapping_data)
    mapping_data.set_index(gene_column, inplace=True)

    # Dictionary with gene id as key and GO terms/Interpro/EC as value.
    annot_GOs = mapping_data[go_column].to_dict()
    annot_IPRs = mapping_data[ipr_column].to_dict()
    annot_ECs = mapping_data[ec_column].to_dict()

    # Query Gene Ontology to extract namespaces and alternative IDs.
    df_go_namespace, df_go_alternative = create_GO_dataframes()
    # Dictionary GO id as term and GO namespace as value.
    df_go_namespace.set_index('GO', inplace=True)
    go_namespaces = df_go_namespace['namespace'].to_dict()

    # Dictionary GO id as term and GO alternatives id as value.
    df_go_alternative.set_index('GO', inplace=True)
    go_alternatives = df_go_alternative['alternative_GO'].to_dict()

    print('Searching for exons')
    # Search for all exons in gff database and extract start position (have to
    # minus one to get the right position), the end position, the strand (have
    # to change from str to int) and the gene ID.
    # NOTE(review): the parent ID is obtained by dropping the last two
    # characters of the exon ID; an exon ID with a longer suffix would be
    # truncated incorrectly - confirm against the GFF naming scheme.
    exon_rows = [
        {
            'exon_id': exon.id,
            'gene_id': exon.id.replace('exon:', '')[:-2],
            'start': exon.start - 1,
            'end': exon.end,
            'strand': strand_change(exon.strand),
        }
        for exon in gff_database.features_of_type('exon')
    ]
    # Build the dataframe in one go (DataFrame.append was removed in pandas 2).
    # dtype=object keeps positions/strands as plain Python objects when rows
    # are read back for the Biopython locations.
    df_exons = pa.DataFrame(
        exon_rows,
        columns=['exon_id', 'gene_id', 'start', 'end', 'strand'],
        dtype=object)

    # Group genes by contig once (database order is kept inside each group)
    # instead of rescanning every gene for every contig.
    genes_by_contig = {}
    for gene in gff_database.features_of_type('gene'):
        genes_by_contig.setdefault(gene.chrom, []).append(gene)

    # All SeqRecord objects will be stored in a list and then give to the
    # SeqIO writer to create the genbank.
    seq_objects = []

    print('Assembling Genbank informations')
    # Iterate through each contig.
    # Then iterate through gene and through RNA linked with the gene.
    # Then look if protein informations are available.
    for contig_id in sorted(contig_seqs):
        # Data for each contig.
        record = contig_info(contig_id, contig_seqs[contig_id],
                             species_informations)
        for gene in genes_by_contig.get(contig_id, ()):
            id_gene = gene.id
            start_position = gene.start - 1
            end_position = gene.end
            strand = strand_change(gene.strand)
            new_feature_gene = sf.SeqFeature(
                sf.FeatureLocation(start_position, end_position, strand),
                type="gene")
            new_feature_gene.qualifiers['locus_tag'] = id_gene
            # Add gene information to contig record.
            record.features.append(new_feature_gene)

            # Search and add RNAs.
            gene_informations = [
                gene, id_gene, start_position, end_position, strand
            ]
            for rna_type in ('mRNA', 'tRNA', 'ncRNA', 'lncRNA'):
                record = search_and_add_RNA(gff_database, gene_informations,
                                            record, rna_type)

            # Search for pseudogene and add them.
            record = search_and_add_pseudogene(gff_database, gene, record,
                                               df_exons, gene_protein_seq)

            # Use parent mRNA in gff to find CDS.
            # With this we take the isoform of gene.
            for mrna in gff_database.children(gene, featuretype="mRNA",
                                              order_by='start'):
                mrna_id = mrna.id
                # Exon locations are collected per mRNA: starting from an
                # empty list here keeps the exons of a previous isoform out
                # of this isoform's CDS.
                mrna_exons = df_exons[df_exons['gene_id'] == mrna_id]
                location_exons = [
                    sf.FeatureLocation(row['start'], row['end'], row['strand'])
                    for _, row in mrna_exons.iterrows()
                ]
                # Create CDS using exons, if fewer than two exons use gene
                # information.
                if len(location_exons) >= 2:
                    exon_compound_locations = sf.CompoundLocation(
                        location_exons, operator='join')
                    new_feature_cds = sf.SeqFeature(exon_compound_locations,
                                                    type='CDS')
                else:
                    new_feature_cds = sf.SeqFeature(
                        sf.FeatureLocation(start_position, end_position,
                                           strand),
                        type="CDS")

                new_feature_cds.qualifiers['translation'] = \
                    gene_protein_seq[mrna_id]
                new_feature_cds.qualifiers['locus_tag'] = id_gene

                # Add GO annotation according to the namespace.
                if mrna_id in annot_GOs:
                    gene_gos = re.split(';|,', annot_GOs[mrna_id])
                    if gene_gos != [""]:
                        go_components = []
                        go_functions = []
                        go_process = []
                        for go in gene_gos:
                            # Check if GO term is not a deprecated one.
                            # If yes take the corresponding one in alternative GO.
                            if go not in go_namespaces:
                                go_test = go_alternatives[go]
                            else:
                                go_test = go
                            namespace = go_namespaces[go_test]
                            if namespace == 'cellular_component':
                                go_components.append(go)
                            elif namespace == 'molecular_function':
                                go_functions.append(go)
                            elif namespace == 'biological_process':
                                go_process.append(go)
                        new_feature_cds.qualifiers['go_component'] = go_components
                        new_feature_cds.qualifiers['go_function'] = go_functions
                        new_feature_cds.qualifiers['go_process'] = go_process

                # Add InterPro annotation.
                if mrna_id in annot_IPRs:
                    gene_iprs = re.split(';|,', annot_IPRs[mrna_id])
                    if gene_iprs != [""]:
                        new_feature_cds.qualifiers['db_xref'] = [
                            "InterPro:" + interpro for interpro in gene_iprs
                        ]

                # Add EC annotation.
                if mrna_id in annot_ECs:
                    gene_ecs = re.split(';|,', annot_ECs[mrna_id])
                    if gene_ecs != [""]:
                        new_feature_cds.qualifiers['EC_number'] = [
                            ec.replace('ec:', '') for ec in gene_ecs
                        ]

                # Add CDS information to contig record
                record.features.append(new_feature_cds)

        seq_objects.append(record)

    # Create Genbank with the list of SeqRecord.
    SeqIO.write(seq_objects, gbk_out, 'genbank')