def add_aragorn_features(assemblies, features, aragorn_file):
    '''
    Parses an ARAGORN output file and adds the predicted RNA genes to the
    passed collections.  Nothing is returned; 'assemblies' and 'features'
    are both modified in place.

    Lines starting with '>' name the molecule the following predictions belong
    to.  If that ID is not yet a key of 'assemblies' a things.Assembly with
    empty residues is created for it.  Any other line which splits into
    exactly five whitespace-delimited columns is treated as a prediction;
    every remaining line is ignored.

    Of a prediction line, the second column gives the type/product name (it
    must start with tRNA, tmRNA or mtRNA), the third the coordinates in the
    form [start,end] or c[start,end], and the fifth the anticodon.

    For each prediction a Gene and one child RNA feature are created, located
    on the assembly and stored in 'features' under freshly generated (uuid
    based) IDs.  The gene is also added to the assembly.

    Raises an Exception if a header line has no ID, if a prediction appears
    before any header line, if the type column is not recognized or if the
    coordinate column cannot be parsed.
    '''
    header_re = re.compile(r'>(\S+)')
    coords_re = re.compile(r'(c*)\[(\d+),(\d+)\]')
    current_assembly_id = None

    # the context manager makes sure the handle is closed, even on a parse error
    with open(aragorn_file) as aragorn_fh:
        for line in aragorn_fh:
            line = line.rstrip()

            if line.startswith('>'):
                m = header_re.match(line)
                if m is None:
                    raise Exception("ERROR: ARAGORN header line without a molecule ID: {0}".format(line))

                current_assembly_id = m.group(1)

                if current_assembly_id not in assemblies:
                    assemblies[current_assembly_id] = things.Assembly(id=current_assembly_id, residues='')

                continue

            cols = line.split()

            # only five-column lines are predictions
            if len(cols) != 5:
                continue

            if current_assembly_id is None:
                raise Exception("ERROR: ARAGORN prediction found before any '>' header line: {0}".format(line))

            feat_type = next((t for t in ('tRNA', 'tmRNA', 'mtRNA') if cols[1].startswith(t)), None)
            if feat_type is None:
                raise Exception("Unexpected type in ARAGORN, column value: {0}".format(cols[1]))

            feat_base = "{0}_{1}".format(feat_type, uuid.uuid4())
            gene_id = "{0}_gene".format(feat_base)
            RNA_id = "{0}_{1}".format(feat_base, feat_type)

            m = coords_re.match(cols[2])
            if m is None:
                raise Exception("ERROR: unexpected coordinate format: {0}".format(cols[2]))

            # the start coordinate is decremented by one (presumably converting
            # ARAGORN's 1-based start to 0-based coordinates - TODO confirm)
            rfmin = int(m.group(2)) - 1
            rfmax = int(m.group(3))

            # a leading 'c' marks a prediction on the reverse strand
            rstrand = -1 if m.group(1) else 1

            current_assembly = assemblies[current_assembly_id]
            gene = things.Gene(id=gene_id)
            gene.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            features[gene_id] = gene
            current_assembly.add_gene(gene)

            if feat_type == 'tRNA':
                # characters 2-4 of the last column, e.g. '(cat)' gives 'CAT'
                RNA = things.tRNA(id=RNA_id, parent=gene, anticodon=cols[4][1:4].upper())
                gene.add_tRNA(RNA)
            else:
                # NOTE: mtRNA predictions are stored as tmRNA objects as well
                RNA = things.tmRNA(id=RNA_id, parent=gene)
                gene.add_tmRNA(RNA)

            RNA.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            RNA.annotation = annotation.FunctionalAnnotation(product_name=cols[1])
            features[RNA_id] = RNA
def process_gb_gene(gene):
    '''
    Builds and returns an annotation.FunctionalAnnotation for a GenBank gene
    feature.

    The first value of the feature's 'gene' qualifier is expected to be a
    versioned Ensembl ID.  It is stored as-is under the 'ensembl_version'
    attribute, and the text before its last '.<digits>' part is stored under
    'ensembl_id'.  Gene symbol, product name and the 'loc' attribute are
    filled from get_gene_sym(), get_product() and get_coordinates().

    Raises an Exception if the ID carries no '.<digits>' version part.
    '''
    annot = annotation.FunctionalAnnotation()

    ensembl_version = gene.qualifiers['gene'][0]
    annot.other_attributes['ensembl_version'] = ensembl_version

    # raw string: '\.' and '\d' are invalid escapes in a normal string literal
    m = re.match(r'(.+)\.\d+', ensembl_version)
    if m is None:
        raise Exception("Found an ensembl id ({0}) without a .version".format(ensembl_version))

    annot.other_attributes['ensembl_id'] = m.group(1)

    annot.gene_symbol = get_gene_sym(gene)
    annot.product_name = get_product(gene)
    annot.other_attributes['loc'] = get_coordinates(gene)

    return annot
def initialize_polypeptides(log_fh, fasta_file):
    '''
    Builds the starting set of Polypeptide objects from a FASTA file.

    The file is loaded with utils.fasta_dict_from_file() and one
    things.Polypeptide is created per entry, using the entry's 's' value as
    residues and its length as the polypeptide length.  Each polypeptide gets
    an annotation.FunctionalAnnotation whose product name is
    DEFAULT_PRODUCT_NAME, and one INFO line per sequence is written to log_fh.

    Returns a dict of the Polypeptide objects keyed by sequence ID.
    '''
    fasta_records = utils.fasta_dict_from_file(fasta_file)
    polypeptides = {}

    for seq_id in fasta_records:
        residues = fasta_records[seq_id]['s']
        new_polypeptide = things.Polypeptide(id=seq_id, length=len(residues), residues=residues)

        default_annot = annotation.FunctionalAnnotation(product_name=DEFAULT_PRODUCT_NAME)
        log_fh.write("INFO: {0}: Set initial product name to '{1}'\n".format(seq_id, DEFAULT_PRODUCT_NAME))
        new_polypeptide.annotation = default_annot

        polypeptides[seq_id] = new_polypeptide

    return polypeptides
def _write_gff3_from_genbank(gb_fh, ofh, include_fasta):
    '''
    Reads GenBank records from the open handle gb_fh and writes their gene
    models as GFF3 to ofh (the '##gff-version' header is NOT written here).

    Handles gene, mRNA, tRNA, rRNA and CDS features; 'source' features are
    ignored silently and every other feature type is skipped with a warning on
    STDERR.  A gene is printed once the next gene feature is reached (or
    after the last record).  If include_fasta is true and at least one record
    carried sequence, a ##FASTA section with all molecules is appended.

    Raises an Exception for features that are on neither strand and for two
    RNAs ending up with the same generated ID.
    '''
    assemblies = dict()
    current_assembly = None
    current_gene = None
    current_RNA = None

    rna_count_by_gene = defaultdict(int)
    exon_count_by_RNA = defaultdict(int)

    seqs_pending_writes = False
    features_skipped_count = 0

    # each gb_record is a SeqRecord object
    for gb_record in SeqIO.parse(gb_fh, "genbank"):
        mol_id = gb_record.name

        if mol_id not in assemblies:
            assemblies[mol_id] = things.Assembly(id=mol_id)

        residues = str(gb_record.seq)
        if len(residues) > 0:
            seqs_pending_writes = True
            assemblies[mol_id].residues = residues
            assemblies[mol_id].length = len(residues)

        current_assembly = assemblies[mol_id]

        # each feat is a SeqFeature object
        for feat in gb_record.features:
            fmin = int(feat.location.start)
            fmax = int(feat.location.end)

            if feat.location.strand == 1:
                strand = '+'
            elif feat.location.strand == -1:
                strand = '-'
            else:
                raise Exception(
                    "ERROR: unstranded feature encountered: {0}".format(feat))

            if feat.type == 'source':
                continue

            if feat.type == 'gene':
                # print the previous gene (if there is one)
                if current_gene is not None:
                    current_gene.print_as(fh=ofh, source='GenBank', format='gff3')

                locus_tag = feat.qualifiers['locus_tag'][0]
                current_gene = things.Gene(id=locus_tag, locus_tag=locus_tag)
                current_gene.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                current_RNA = None

            elif feat.type == 'mRNA':
                locus_tag = feat.qualifiers['locus_tag'][0]
                rna_count_by_gene[locus_tag] += 1
                feat_id = "{0}.mRNA.{1}".format(locus_tag, rna_count_by_gene[locus_tag])

                mRNA = things.mRNA(id=feat_id, parent=current_gene, locus_tag=locus_tag)
                mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                current_gene.add_mRNA(mRNA)
                current_RNA = mRNA

                if feat_id in exon_count_by_RNA:
                    raise Exception(
                        "ERROR: two different RNAs found with same ID: {0}".format(feat_id))
                else:
                    exon_count_by_RNA[feat_id] = 0

            elif feat.type == 'tRNA':
                locus_tag = feat.qualifiers['locus_tag'][0]
                rna_count_by_gene[locus_tag] += 1
                feat_id = "{0}.tRNA.{1}".format(locus_tag, rna_count_by_gene[locus_tag])

                # NOTE(review): the 'product' qualifier is what gets passed
                # as the anticodon here - confirm this is intended
                if 'product' in feat.qualifiers:
                    anticodon = feat.qualifiers['product'][0]
                else:
                    anticodon = None

                tRNA = things.tRNA(id=feat_id, parent=current_gene, anticodon=anticodon)
                tRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                current_gene.add_tRNA(tRNA)
                current_RNA = tRNA

                if feat_id in exon_count_by_RNA:
                    raise Exception(
                        "ERROR: two different RNAs found with same ID: {0}".format(feat_id))
                else:
                    exon_count_by_RNA[feat_id] = 0

            elif feat.type == 'rRNA':
                locus_tag = feat.qualifiers['locus_tag'][0]
                rna_count_by_gene[locus_tag] += 1
                feat_id = "{0}.rRNA.{1}".format(locus_tag, rna_count_by_gene[locus_tag])

                if 'product' in feat.qualifiers:
                    product = feat.qualifiers['product'][0]
                else:
                    product = None

                annot = annotation.FunctionalAnnotation(product_name=product)

                rRNA = things.rRNA(id=feat_id, parent=current_gene, annotation=annot)
                rRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                current_gene.add_rRNA(rRNA)
                current_RNA = rRNA

                if feat_id in exon_count_by_RNA:
                    raise Exception(
                        "ERROR: two different RNAs found with same ID: {0}".format(feat_id))
                else:
                    exon_count_by_RNA[feat_id] = 0

            elif feat.type == 'CDS':
                locus_tag = feat.qualifiers['locus_tag'][0]

                # If processing a prokaryotic GBK, we'll encounter CDS before mRNA, so we have to
                #  manually make one
                if current_RNA is None:
                    feat_id = "{0}.mRNA.{1}".format(
                        locus_tag, rna_count_by_gene[locus_tag])
                    mRNA = things.mRNA(id=feat_id, parent=current_gene)
                    mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                    current_gene.add_mRNA(mRNA)
                    current_RNA = mRNA

                if 'product' in feat.qualifiers:
                    product = feat.qualifiers['product'][0]
                else:
                    product = None

                if 'gene' in feat.qualifiers:
                    gene_symbol = feat.qualifiers['gene'][0]
                else:
                    gene_symbol = None

                annot = annotation.FunctionalAnnotation(
                    product_name=product, gene_symbol=gene_symbol)

                if 'db_xref' in feat.qualifiers:
                    for dbxref in feat.qualifiers['db_xref']:
                        annot.add_dbxref(dbxref)

                # NOTE(review): the polypeptide is attached to the most
                # recently created mRNA, which is expected to be current_RNA
                polypeptide_id = "{0}.polypeptide.{1}".format(
                    locus_tag, rna_count_by_gene[locus_tag])
                polypeptide = things.Polypeptide(id=polypeptide_id, parent=mRNA, annotation=annot)
                mRNA.add_polypeptide(polypeptide)

                exon_count_by_RNA[current_RNA.id] += 1
                # all segments of this CDS share one ID
                cds_id = "{0}.CDS.{1}".format(
                    current_RNA.id, exon_count_by_RNA[current_RNA.id])
                current_CDS_phase = 0

                for loc in feat.location.parts:
                    subfmin = int(loc.start)
                    subfmax = int(loc.end)

                    CDS = things.CDS(id=cds_id, parent=current_RNA)
                    CDS.locate_on(target=current_assembly, fmin=subfmin, fmax=subfmax,
                                  strand=strand, phase=current_CDS_phase)
                    current_RNA.add_CDS(CDS)

                    # calculate the starting phase for the next CDS feature (in case there is one)
                    # 0 + 6 = 0     TTGCAT
                    # 0 + 7 = 2     TTGCATG
                    # 1 + 6 = 1     TTGCAT
                    # 2 + 7 = 1     TTGCATG
                    # general: 3 - ((length - previous phase) % 3)
                    current_CDS_phase = 3 - (((subfmax - subfmin) - current_CDS_phase) % 3)
                    if current_CDS_phase == 3:
                        current_CDS_phase = 0

                    # every CDS segment gets a matching exon
                    exon_id = "{0}.exon.{1}".format(
                        current_RNA.id, exon_count_by_RNA[current_RNA.id])
                    exon = things.Exon(id=exon_id, parent=current_RNA)
                    exon.locate_on(target=current_assembly, fmin=subfmin, fmax=subfmax, strand=strand)
                    current_RNA.add_exon(exon)

                    exon_count_by_RNA[current_RNA.id] += 1

            else:
                # warnings go to STDERR so they can't end up inside the GFF3
                # stream when the output is written to STDOUT
                print("WARNING: The following feature was skipped:\n{0}".format(feat),
                      file=sys.stderr)
                features_skipped_count += 1

    # don't forget to do the last gene, if there were any
    if current_gene is not None:
        current_gene.print_as(fh=ofh, source='GenBank', format='gff3')

    if include_fasta and seqs_pending_writes:
        ofh.write("##FASTA\n")
        for assembly_id in assemblies:
            ofh.write(">{0}\n".format(assembly_id))
            ofh.write("{0}\n".format(
                utils.wrapped_fasta(assemblies[assembly_id].residues)))

    if features_skipped_count > 0:
        print("Warning: {0} unsupported feature types were skipped".format(
            features_skipped_count), file=sys.stderr)


def main():
    '''
    Command-line entry point: converts a GenBank flat file to GFF3.

    Options:
        -i / --input_file    path to the GenBank file to read (required)
        -o / --output_file   path of the GFF3 file to create; STDOUT if omitted
        --with_fasta         append the ##FASTA section (default)
        --no_fasta           leave the ##FASTA section out

    Both the input handle and (if one was opened) the output file are closed
    when the conversion finishes or fails.
    '''
    parser = argparse.ArgumentParser(
        description='Convert GenBank flat files to GFF3 format')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input GBK file')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output GFF file to be created')
    parser.add_argument(
        '--with_fasta', dest='fasta', action='store_true',
        help='Include the FASTA section with genomic sequence at end of file. (default)')
    parser.add_argument('--no_fasta', dest='fasta', action='store_false')
    parser.set_defaults(fasta=True)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        with open(args.input_file, "r") as gb_fh:
            _write_gff3_from_genbank(gb_fh, ofh, args.fasta)
    finally:
        # never close STDOUT, only a file we opened ourselves
        if ofh is not sys.stdout:
            ofh.close()
def add_aragorn_features(assemblies, features, aragorn_file):
    '''
    Parses an ARAGORN output file and adds the predicted RNA genes to the
    passed collections.  Nothing is returned; 'assemblies' and 'features'
    are both modified in place.

    Lines starting with '>' name the molecule the following predictions belong
    to.  If that ID is not yet a key of 'assemblies' a things.Assembly with
    empty residues is created for it.  Any other line which splits into
    exactly five whitespace-delimited columns is treated as a prediction;
    every remaining line is ignored.

    Of a prediction line, the second column gives the type/product name (it
    must start with tRNA, tmRNA or mtRNA), the third the coordinates in the
    form [start,end] or c[start,end], and the fifth the anticodon.  When the
    end coordinate is smaller than the start, the prediction is taken to span
    the origin of a circular molecule: the assembly is flagged as circular
    and the end coordinate is shifted past the start.

    For each prediction a Gene and one child RNA feature are created, located
    on the assembly and stored in 'features' under freshly generated (uuid
    based) IDs.  The gene is also added to the assembly.

    Raises an Exception if a header line has no ID, if a prediction appears
    before any header line, if the type column is not recognized or if the
    coordinate column cannot be parsed.
    '''
    header_re = re.compile(r'>(\S+)')
    coords_re = re.compile(r'(c*)\[(\d+),(\d+)\]')
    current_assembly_id = None

    # the context manager makes sure the handle is closed, even on a parse error
    with open(aragorn_file) as aragorn_fh:
        for line in aragorn_fh:
            line = line.rstrip()

            if line.startswith('>'):
                m = header_re.match(line)
                if m is None:
                    raise Exception("ERROR: ARAGORN header line without a molecule ID: {0}".format(line))

                current_assembly_id = m.group(1)

                if current_assembly_id not in assemblies:
                    assemblies[current_assembly_id] = things.Assembly(id=current_assembly_id, residues='')

                continue

            cols = line.split()

            # only five-column lines are predictions
            if len(cols) != 5:
                continue

            if current_assembly_id is None:
                raise Exception("ERROR: ARAGORN prediction found before any '>' header line: {0}".format(line))

            feat_type = next((t for t in ('tRNA', 'tmRNA', 'mtRNA') if cols[1].startswith(t)), None)
            if feat_type is None:
                raise Exception("Unexpected type in ARAGORN, column value: {0}".format(cols[1]))

            feat_base = "{0}_{1}".format(feat_type, uuid.uuid4())
            gene_id = "{0}_gene".format(feat_base)
            RNA_id = "{0}_{1}".format(feat_base, feat_type)

            m = coords_re.match(cols[2])
            if m is None:
                raise Exception("ERROR: unexpected coordinate format: {0}".format(cols[2]))

            # the start coordinate is decremented by one (presumably converting
            # ARAGORN's 1-based start to 0-based coordinates - TODO confirm)
            rfmin = int(m.group(2)) - 1
            rfmax = int(m.group(3))

            current_assembly = assemblies[current_assembly_id]

            # For predictions spanning the origin of circular molecules, fmax needs to be adjusted
            # https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
            # Sub-heading: Circular Genomes
            #
            # Example input lines:
            #  62  tRNA-Met   c[2764659,18]   29   (cat)
            #  1   tRNA-Ile   [2697442,47]    35   (gat)
            #
            # NOTE(review): the specification adds the length of the landmark
            #  to the end coordinate.  The molecule length isn't read here, so
            #  the start position is used in its place - confirm this is intended.
            if rfmax < rfmin:
                current_assembly.is_circular = True
                rfmax = rfmin + rfmax + 1

            # a leading 'c' marks a prediction on the reverse strand
            rstrand = -1 if m.group(1) else 1

            gene = things.Gene(id=gene_id)
            gene.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            features[gene_id] = gene
            current_assembly.add_gene(gene)

            if feat_type == 'tRNA':
                # characters 2-4 of the last column, e.g. '(cat)' gives 'CAT'
                RNA = things.tRNA(id=RNA_id, parent=gene, anticodon=cols[4][1:4].upper())
                gene.add_tRNA(RNA)
            else:
                # NOTE: mtRNA predictions are stored as tmRNA objects as well
                RNA = things.tmRNA(id=RNA_id, parent=gene)
                gene.add_tmRNA(RNA)

            RNA.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
            RNA.annotation = annotation.FunctionalAnnotation(product_name=cols[1])
            features[RNA_id] = RNA