def main():
    """Convert a GenBank flat file into GFF3.

    Reads the file given by ``-i/--input_file`` with ``SeqIO.parse`` and writes
    GFF3 to ``-o/--output_file``, or to STDOUT when no output file is given.
    'gene', 'mRNA', 'tRNA', 'rRNA' and 'CDS' features are converted, 'source'
    features are ignored, and every other feature type is reported on STDERR
    and skipped.  Unless ``--no_fasta`` is passed, a ``##FASTA`` section is
    appended when at least one record carried sequence.

    Diagnostics go to STDERR so that they cannot end up inside the GFF3
    stream when it is written to STDOUT.

    Raises:
        Exception: if a feature is neither on the forward nor the reverse
            strand, or if two RNAs end up with the same generated ID.
    """
    parser = argparse.ArgumentParser(
        description='Convert GenBank flat files to GFF3 format')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input GBK file')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output GFF file to be created')
    parser.add_argument(
        '--with_fasta', dest='fasta', action='store_true',
        help='Include the FASTA section with genomic sequence at end of file. (default)')
    parser.add_argument('--no_fasta', dest='fasta', action='store_false')
    parser.set_defaults(fasta=True)
    args = parser.parse_args()

    assemblies = dict()
    current_assembly = None
    current_gene = None
    current_RNA = None
    rna_count_by_gene = defaultdict(int)
    exon_count_by_RNA = defaultdict(int)
    seqs_pending_writes = False
    features_skipped_count = 0

    def register_rna(rna_id):
        """Start the exon counter for a new RNA, refusing duplicate IDs."""
        if rna_id in exon_count_by_RNA:
            raise Exception(
                "ERROR: two different RNAs found with same ID: {0}".format(rna_id))
        exon_count_by_RNA[rna_id] = 0

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        with open(args.input_file, "r") as gbk_fh:
            # each gb_record is a SeqRecord object
            for gb_record in SeqIO.parse(gbk_fh, "genbank"):
                mol_id = gb_record.name

                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)

                residues = str(gb_record.seq)
                if len(residues) > 0:
                    seqs_pending_writes = True
                    assemblies[mol_id].residues = residues
                    assemblies[mol_id].length = len(residues)

                current_assembly = assemblies[mol_id]

                # each feat is a SeqFeature object
                for feat in gb_record.features:
                    fmin = int(feat.location.start)
                    fmax = int(feat.location.end)

                    if feat.location.strand == 1:
                        strand = '+'
                    elif feat.location.strand == -1:
                        strand = '-'
                    else:
                        raise Exception(
                            "ERROR: unstranded feature encountered: {0}".format(feat))

                    if feat.type == 'source':
                        continue

                    if feat.type == 'gene':
                        # print the previous gene (if there is one)
                        if current_gene is not None:
                            current_gene.print_as(fh=ofh, source='GenBank', format='gff3')

                        locus_tag = feat.qualifiers['locus_tag'][0]
                        current_gene = things.Gene(id=locus_tag, locus_tag=locus_tag)
                        current_gene.locate_on(target=current_assembly, fmin=fmin,
                                               fmax=fmax, strand=strand)
                        current_RNA = None

                    elif feat.type == 'mRNA':
                        locus_tag = feat.qualifiers['locus_tag'][0]
                        rna_count_by_gene[locus_tag] += 1
                        feat_id = "{0}.mRNA.{1}".format(locus_tag, rna_count_by_gene[locus_tag])

                        mRNA = things.mRNA(id=feat_id, parent=current_gene, locus_tag=locus_tag)
                        mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                       strand=strand)
                        current_gene.add_mRNA(mRNA)
                        current_RNA = mRNA
                        register_rna(feat_id)

                    elif feat.type == 'tRNA':
                        locus_tag = feat.qualifiers['locus_tag'][0]
                        rna_count_by_gene[locus_tag] += 1
                        feat_id = "{0}.tRNA.{1}".format(locus_tag, rna_count_by_gene[locus_tag])

                        # the 'product' qualifier is what gets stored as the anticodon
                        if 'product' in feat.qualifiers:
                            anticodon = feat.qualifiers['product'][0]
                        else:
                            anticodon = None

                        tRNA = things.tRNA(id=feat_id, parent=current_gene, anticodon=anticodon)
                        tRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                       strand=strand)
                        current_gene.add_tRNA(tRNA)
                        current_RNA = tRNA
                        register_rna(feat_id)

                    elif feat.type == 'rRNA':
                        locus_tag = feat.qualifiers['locus_tag'][0]
                        rna_count_by_gene[locus_tag] += 1
                        feat_id = "{0}.rRNA.{1}".format(locus_tag, rna_count_by_gene[locus_tag])

                        if 'product' in feat.qualifiers:
                            product = feat.qualifiers['product'][0]
                        else:
                            product = None

                        annot = annotation.FunctionalAnnotation(product_name=product)

                        rRNA = things.rRNA(id=feat_id, parent=current_gene, annotation=annot)
                        rRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                       strand=strand)
                        current_gene.add_rRNA(rRNA)
                        current_RNA = rRNA
                        register_rna(feat_id)

                    elif feat.type == 'CDS':
                        locus_tag = feat.qualifiers['locus_tag'][0]

                        # If processing a prokaryotic GBK, we'll encounter CDS before mRNA,
                        #  so we have to manually make one
                        if current_RNA is None:
                            feat_id = "{0}.mRNA.{1}".format(
                                locus_tag, rna_count_by_gene[locus_tag])
                            mRNA = things.mRNA(id=feat_id, parent=current_gene)
                            mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                           strand=strand)
                            current_gene.add_mRNA(mRNA)
                            current_RNA = mRNA

                        if 'product' in feat.qualifiers:
                            product = feat.qualifiers['product'][0]
                        else:
                            product = None

                        if 'gene' in feat.qualifiers:
                            gene_symbol = feat.qualifiers['gene'][0]
                        else:
                            gene_symbol = None

                        annot = annotation.FunctionalAnnotation(
                            product_name=product, gene_symbol=gene_symbol)

                        if 'db_xref' in feat.qualifiers:
                            for dbxref in feat.qualifiers['db_xref']:
                                annot.add_dbxref(dbxref)

                        # NOTE(review): the polypeptide is attached to the most recently
                        #  created mRNA, not to current_RNA -- confirm this is intended.
                        polypeptide_id = "{0}.polypeptide.{1}".format(
                            locus_tag, rna_count_by_gene[locus_tag])
                        polypeptide = things.Polypeptide(id=polypeptide_id, parent=mRNA,
                                                         annotation=annot)
                        mRNA.add_polypeptide(polypeptide)

                        exon_count_by_RNA[current_RNA.id] += 1
                        # every segment of this CDS feature shares one CDS ID
                        cds_id = "{0}.CDS.{1}".format(
                            current_RNA.id, exon_count_by_RNA[current_RNA.id])
                        current_CDS_phase = 0

                        for loc in feat.location.parts:
                            subfmin = int(loc.start)
                            subfmax = int(loc.end)

                            CDS = things.CDS(id=cds_id, parent=current_RNA)
                            CDS.locate_on(target=current_assembly, fmin=subfmin,
                                          fmax=subfmax, strand=strand,
                                          phase=current_CDS_phase)
                            current_RNA.add_CDS(CDS)

                            # calculate the starting phase for the next CDS feature
                            #  (in case there is one)
                            # 0 + 6 = 0     TTGCAT
                            # 0 + 7 = 2     TTGCATG
                            # 1 + 6 = 1     TTGCAT
                            # 2 + 7 = 1     TTGCATG
                            # general: 3 - ((length - previous phase) % 3)
                            current_CDS_phase = 3 - (
                                ((subfmax - subfmin) - current_CDS_phase) % 3)
                            if current_CDS_phase == 3:
                                current_CDS_phase = 0

                            exon_id = "{0}.exon.{1}".format(
                                current_RNA.id, exon_count_by_RNA[current_RNA.id])
                            exon = things.Exon(id=exon_id, parent=current_RNA)
                            exon.locate_on(target=current_assembly, fmin=subfmin,
                                           fmax=subfmax, strand=strand)
                            current_RNA.add_exon(exon)
                            exon_count_by_RNA[current_RNA.id] += 1

                    else:
                        # STDERR, because STDOUT may be carrying the GFF3 itself
                        print("WARNING: The following feature was skipped:\n{0}".format(feat),
                              file=sys.stderr)
                        features_skipped_count += 1

                # don't forget to do the last gene, if there were any
                if current_gene is not None:
                    current_gene.print_as(fh=ofh, source='GenBank', format='gff3')

        if args.fasta is True:
            if seqs_pending_writes is True:
                ofh.write("##FASTA\n")

                for assembly_id in assemblies:
                    ofh.write(">{0}\n".format(assembly_id))
                    ofh.write("{0}\n".format(
                        utils.wrapped_fasta(assemblies[assembly_id].residues)))

        if features_skipped_count > 0:
            print("Warning: {0} unsupported feature types were skipped".format(
                features_skipped_count), file=sys.stderr)
    finally:
        # never close STDOUT, only a file we opened ourselves
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Convert Prodigal GFF output into GFF3 gene models.

    Each 9-column 'CDS' row of the input becomes a gene with one mRNA
    (``<ID>.t1``), one CDS (``<ID>.t1.cds``) and one exon (``<ID>.t1.exon1``),
    all sharing the row's coordinates and strand.  Comment lines, rows that do
    not have 9 tab-separated columns and non-CDS rows are ignored.  Each gene
    is written to the output as soon as it has been built.

    Raises:
        Exception: if two CDS rows would produce the same mRNA ID.
    """
    parser = argparse.ArgumentParser(
        description='Convert GFF output from Prodigal into GFF3 format')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to a GFF file created by Prodigal')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    assemblies = dict()

    ## Used for tracking the exon count for each mRNA (for ID purposes); it also
    #  doubles as the record of which mRNA IDs have already been emitted
    exon_count_by_mRNA = dict()

    with open(args.input) as fin, \
            open(args.output, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        for line in fin:
            if line.startswith("#"):
                continue

            cols = line.split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]
            feat_id = gff.column_9_value(cols[8], 'ID')

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = things.Assembly(id=mol_id)

            current_assembly = assemblies[mol_id]

            if feat_type != "CDS":
                continue

            # GFF is 1-based inclusive; subtract one from the start for fmin
            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]
            mRNA_id = feat_id + '.t1'

            # The counter is keyed by the mRNA ID, so that is what has to be
            #  checked for a collision.
            if mRNA_id in exon_count_by_mRNA:
                raise Exception(
                    "ERROR: two different mRNAs found with same ID: {0}".format(mRNA_id))

            # gene
            gene = things.Gene(id=feat_id)
            gene.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)

            # mRNA
            mRNA = things.mRNA(id=mRNA_id, parent=gene)
            mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
            gene.add_mRNA(mRNA)

            # CDS
            CDS = things.CDS(id=mRNA_id + '.cds', parent=mRNA)
            CDS.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand,
                          phase=int(cols[7]))
            mRNA.add_CDS(CDS)

            # exons weren't explicitly defined in the input file, so we need to
            #  derive new IDs for them
            exon_count_by_mRNA[mRNA_id] = 1
            exon_id = "{0}.exon{1}".format(mRNA_id, exon_count_by_mRNA[mRNA_id])
            exon = things.Exon(id=exon_id, parent=mRNA)
            exon.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
            mRNA.add_exon(exon)

            gene.print_as(fh=fout, source='Prodigal_v2.6.3', format='gff3')
def main():
    """Convert CEGMA GFF output into spec-legal GFF3.

    A row of type 'First' or 'Single' starts a new gene/mRNA pair; rows of
    type 'First', 'Internal', 'Terminal' and 'Single' each add one CDS and one
    exon to the current mRNA.  The gene and mRNA span is grown to cover every
    exon seen, and the pair is located and written when the next gene starts
    or the input ends.  Comment lines and rows without 9 tab-separated columns
    are skipped.
    """
    parser = argparse.ArgumentParser(
        description='Converts CEGMA GFF output to spec-legal GFF3')

    # output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input file to parse')
    parser.add_argument('-o', '--output_file', type=str, required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    def emit_gene(gene, mRNA, assembly, fmin, fmax, strand, fh):
        """Place the gene and its mRNA on the assembly, then write the gene."""
        gene.locate_on(target=assembly, fmin=fmin, fmax=fmax, strand=strand)
        mRNA.locate_on(target=assembly, fmin=fmin, fmax=fmax, strand=strand)
        gene.print_as(fh=fh, source='cegma', format='gff3')

    assemblies = dict()
    current_assembly = None
    current_gene = None
    current_mRNA = None

    current_gene_fmin = None
    current_gene_fmax = None
    current_gene_strand = None

    next_id_nums = {'gene': 1, 'mRNA': 1, 'CDS': 1, 'exon': 1}
    exon_column_types = ['First', 'Internal', 'Terminal', 'Single']

    with open(args.input_file, 'r') as fin, open(args.output_file, 'w') as fout:
        fout.write("##gff-version 3\n")

        for line in fin:
            if line.startswith('#'):
                continue

            cols = line.split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]
            # GFF is 1-based inclusive; subtract one from the start for fmin
            feat_fmin = int(cols[3]) - 1
            feat_fmax = int(cols[4])

            if feat_type in ('Single', 'First'):
                # If there's an existing gene already, print it out
                if current_gene is not None:
                    emit_gene(current_gene, current_mRNA, current_assembly,
                              current_gene_fmin, current_gene_fmax,
                              current_gene_strand, fout)

                # initialize this assembly if we haven't seen it yet
                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)

                current_assembly = assemblies[mol_id]

                feat_id = "cegma.gene.{0}".format(next_id_nums['gene'])
                next_id_nums['gene'] += 1
                current_gene = things.Gene(id=feat_id)
                current_gene_strand = cols[6]
                current_gene_fmin = feat_fmin
                current_gene_fmax = feat_fmax

                mRNA_id = "cegma.mRNA.{0}".format(next_id_nums['mRNA'])
                next_id_nums['mRNA'] += 1
                current_mRNA = things.mRNA(id=mRNA_id, parent=current_gene)
                current_gene.add_mRNA(current_mRNA)

            # CEGMA versions < 2.5 had two rows for each exon.  We don't need to
            #  process both of them, so we skip the Exon one because its phase
            #  information is incorrect.
            if feat_type in exon_column_types:
                CDS_id = "cegma.CDS.{0}".format(next_id_nums['CDS'])
                next_id_nums['CDS'] += 1
                CDS = things.CDS(id=CDS_id, parent=current_mRNA)
                CDS.locate_on(target=current_assembly, fmin=feat_fmin, fmax=feat_fmax,
                              strand=cols[6], phase=cols[7])
                current_mRNA.add_CDS(CDS)

                exon_id = "cegma.exon.{0}".format(next_id_nums['exon'])
                next_id_nums['exon'] += 1
                exon = things.Exon(id=exon_id, parent=current_mRNA)
                exon.locate_on(target=current_assembly, fmin=feat_fmin, fmax=feat_fmax,
                               strand=cols[6])
                current_mRNA.add_exon(exon)

                # widen the gene span so it covers this exon
                if feat_fmin < current_gene_fmin:
                    current_gene_fmin = feat_fmin

                if feat_fmax > current_gene_fmax:
                    current_gene_fmax = feat_fmax

        # don't forget the last gene
        if current_gene is not None:
            emit_gene(current_gene, current_mRNA, current_assembly,
                      current_gene_fmin, current_gene_fmax,
                      current_gene_strand, fout)
def main():
    """Convert Augustus output (native GTF or GFF) into GFF3.

    Only 'gene', 'transcript' and 'CDS' rows are used.  When the 9th column of
    a row does not already start with ``ID`` or ``Parent`` it is treated as
    GTF and rewritten into GFF3 attribute form first.  Every CDS also yields
    an exon with the same coordinates, because exons are not listed
    explicitly.  Comment lines are buffered and written, followed by the gene
    itself, when the ``# end gene`` comment is reached.

    Raises:
        Exception: if a GTF-style 9th column cannot be parsed, if two
            transcripts share an ID, or if a CDS names a parent transcript
            that has not been seen for the current gene.
    """
    parser = argparse.ArgumentParser(
        description='Convert native (GTF) or GFF output from Augustus into GFF3 format')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to a GFF file created by Augustus')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    assemblies = dict()
    gene = None
    mRNAs = dict()
    current_gene_comment_lines = list()

    ## Used for tracking the exon count for each mRNA (for ID purposes)
    exon_count_by_mRNA = dict()

    # GTF 9th-column shapes:
    #   g1                                       -> ID=g1
    #   g1.t1                                    -> ID=g1.t1;Parent=g1
    #   transcript_id "g1.t1"; gene_id "g1";     -> ID=g1.t1.cds;Parent=g1.t1
    gtf_gene_re = re.compile(r'(g\d+)')
    gtf_transcript_re = re.compile(r'((g\d+).t\d+)')
    gtf_cds_re = re.compile(r'transcript_id "(g\d+.t\d+)"; gene_id "g\d+";')

    with open(args.input) as fin, \
            open(args.output, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        for line in fin:
            if line.startswith("#"):
                current_gene_comment_lines.append(line)

                if line.startswith("# end gene "):
                    ## purge the comments, then write the gene
                    fout.write("".join(current_gene_comment_lines))
                    gene.print_as(fh=fout, source='AUGUSTUS', format='gff3')

                    gene = None
                    mRNAs = dict()
                    current_gene_comment_lines = list()

                continue

            cols = line.split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]

            if feat_type not in ['gene', 'transcript', 'CDS']:
                continue

            ## The output format is GTF by default and (mostly) GFF if the --gff
            #  option is used.  If GTF is detected, transform the 9th column into
            #  GFF so the libraries can use it.
            if not cols[8].startswith('ID') and not cols[8].startswith('Parent'):
                if feat_type == 'gene':
                    m_gene = gtf_gene_re.match(cols[8])
                    if m_gene:
                        cols[8] = "ID={0}".format(m_gene.group(1))
                    else:
                        raise Exception("ERROR: GTF detected but gene row has bad 9th column format: {0}".format(cols[8]))
                elif feat_type == 'transcript':
                    m_transcript = gtf_transcript_re.match(cols[8])
                    if m_transcript:
                        cols[8] = "ID={0};Parent={1}".format(m_transcript.group(1),
                                                             m_transcript.group(2))
                    else:
                        raise Exception("ERROR: GTF detected but transcript row has bad 9th column format: {0}".format(cols[8]))
                elif feat_type == 'CDS':
                    m_CDS = gtf_cds_re.match(cols[8])
                    if m_CDS:
                        cols[8] = "ID={0}.cds;Parent={0}".format(m_CDS.group(1))
                    else:
                        raise Exception("ERROR: GTF detected but CDS row has bad 9th column format: {0}".format(cols[8]))

            feat_id = gff.column_9_value(cols[8], 'ID')

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = things.Assembly(id=mol_id)

            current_assembly = assemblies[mol_id]

            # GFF/GTF are 1-based inclusive; subtract one from the start for fmin
            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]

            if feat_type == "gene":
                gene = things.Gene(id=feat_id)
                gene.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)

            elif feat_type == "transcript":
                mRNA = things.mRNA(id=feat_id, parent=gene)
                mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                gene.add_mRNA(mRNA)
                mRNAs[mRNA.id] = mRNA

                if feat_id in exon_count_by_mRNA:
                    raise Exception("ERROR: two different mRNAs found with same ID: {0}".format(feat_id))
                else:
                    exon_count_by_mRNA[feat_id] = 0

            elif feat_type == "CDS":
                parent_id = gff.column_9_value(cols[8], 'Parent')

                ## sanity check that we've seen this parent
                if parent_id not in mRNAs:
                    raise Exception("ERROR: Found CDS column with parent ({0}) mRNA not yet in the file".format(parent_id))

                # Attach to the transcript the row actually names, not merely
                #  to whichever transcript happened to be created last.
                parent_mRNA = mRNAs[parent_id]

                CDS = things.CDS(id=feat_id, parent=parent_mRNA)
                CDS.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand,
                              phase=int(cols[7]))
                parent_mRNA.add_CDS(CDS)

                ## exons weren't explicitly defined in the input file, so we need
                #  to derive new IDs for them
                exon_count_by_mRNA[parent_id] += 1
                exon_id = "{0}.exon{1}".format(parent_id, exon_count_by_mRNA[parent_id])

                exon = things.Exon(id=exon_id, parent=parent_mRNA)
                exon.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                parent_mRNA.add_exon(exon)
def main():
    """Convert StringTie GTF output into GFF3 gene models.

    'transcript' rows open a new mRNA and, whenever the gene_id changes, a new
    gene after the previous one has been written.  'exon' rows add both a CDS
    and an exon with the same coordinates to the current mRNA; the CDS phase
    is carried forward from the lengths of the preceding segments.  Output
    goes to ``-o/--output_file`` or to STDOUT when it is omitted.

    Rows that do not have 9 tab-separated columns are reported on STDERR, so
    the report cannot end up inside GFF3 written to STDOUT, and are skipped.
    """
    parser = argparse.ArgumentParser(
        description='A GTF -> GFF3 conversion script for StringTie output')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input GTF file')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output GFF file to be created')
    args = parser.parse_args()

    assemblies = dict()
    current_gene = None
    current_RNA = None
    exon_count_by_RNA = defaultdict(int)

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        with open(args.input_file, "r") as fin:
            for line in fin:
                cols = line.split("\t")

                if len(cols) != 9:
                    print("SKIPPING: {0}".format(line), file=sys.stderr)
                    continue

                mol_id = cols[0]

                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)

                current_assembly = assemblies[mol_id]
                ftype = cols[2]
                # GTF is 1-based inclusive; subtract one from the start for fmin
                fmin = int(cols[3]) - 1
                fmax = int(cols[4])
                strand = cols[6]

                # this makes it look like GFF column 9 so that
                #  gff.column_9_value(str, key) can be used on it
                col9 = cols[8].replace(' "', '="')
                gene_id = gff.column_9_value(col9, 'gene_id').replace('"', '')
                transcript_id = gff.column_9_value(col9, 'transcript_id').replace('"', '')

                if ftype == 'transcript':
                    if current_gene is not None and current_gene.id != gene_id:
                        current_gene.print_as(fh=ofh, source='StringTie', format='gff3')

                    if current_gene is None or current_gene.id != gene_id:
                        current_gene = things.Gene(id=gene_id)
                        current_gene.locate_on(target=current_assembly, fmin=fmin,
                                               fmax=fmax, strand=strand)

                    mRNA = things.mRNA(id=transcript_id, parent=current_gene)
                    mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                   strand=strand)
                    current_gene.add_mRNA(mRNA)
                    current_RNA = mRNA
                    exon_count_by_RNA[transcript_id] = 0
                    current_CDS_phase = 0

                elif ftype == 'exon':
                    exon_count_by_RNA[transcript_id] += 1

                    cds_id = "{0}.CDS.{1}".format(
                        current_RNA.id, exon_count_by_RNA[current_RNA.id])
                    CDS = things.CDS(id=cds_id, parent=current_RNA)
                    CDS.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                  strand=strand, phase=current_CDS_phase)
                    current_RNA.add_CDS(CDS)

                    # calculate the starting phase for the next CDS feature
                    #  (in case there is one)
                    current_CDS_phase = 3 - (((fmax - fmin) - current_CDS_phase) % 3)
                    if current_CDS_phase == 3:
                        current_CDS_phase = 0

                    exon_id = "{0}.exon.{1}".format(
                        current_RNA.id, exon_count_by_RNA[current_RNA.id])
                    exon = things.Exon(id=exon_id, parent=current_RNA)
                    exon.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                   strand=strand)
                    current_RNA.add_exon(exon)

        # don't forget to do the last gene, if there were any
        if current_gene is not None:
            current_gene.print_as(fh=ofh, source='StringTie', format='gff3')
    finally:
        # never close STDOUT, only a file we opened ourselves
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Convert Metagenemark GFF output into GFF3 gene models.

    Every 9-column row (type 'gene' or 'CDS') becomes a gene with one mRNA,
    CDS, exon and polypeptide, all with IDs of the form
    ``<prefix>.<type>.<gene number>``.  Comment lines seen since the previous
    gene are written to the output right after each gene; the input's
    ``##gff-version`` line is dropped and ``##FASTA`` gets an extra leading
    ``#``.  When ``-pf/--protein_fasta`` is given, the ``##Protein N`` comment
    blocks are additionally written to that file as FASTA records.

    Raises:
        Exception: on an unexpected feature type, a 9th column without a
            ``gene_id``, or a malformed ``##Protein`` line.
    """
    parser = argparse.ArgumentParser(
        description='Metagenemark GFF -> GFF3 conversion script')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to a GFF file created by Metagenemark')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='Path to an output file to be created')
    parser.add_argument('-p', '--prefix', type=str, required=True,
                        help='Prefix to use in ID generation')
    parser.add_argument('-pf', '--protein_fasta', type=str, required=False,
                        help='Optional protein FASTA to be written')
    args = parser.parse_args()

    assemblies = dict()

    # key like 2 = SRS014890.polypeptide.2
    polypeptide_lookup = dict()
    writing_protein = False
    current_gene_comment_lines = list()

    protein_header_re = re.compile(r"##Protein (\d+)")
    mol_id_re = re.compile(r'^(\S+) ')
    gene_num_re = re.compile(r'gene_id[ =](\d+)')

    # Stays None unless --protein_fasta was given; every write below checks
    #  this, so protein comments in the input never hit an undefined handle.
    protein_out = None

    try:
        with open(args.input) as fin, \
                open(args.output, mode='wt', encoding='utf-8') as fout:
            fout.write("##gff-version 3\n")

            if args.protein_fasta is not None:
                protein_out = open(args.protein_fasta, mode='wt', encoding='utf-8')

            for line in fin:
                if line.startswith("#"):
                    if line.startswith("##FASTA"):
                        current_gene_comment_lines.append("#{0}".format(line))

                    elif line.startswith("##end-Protein"):
                        writing_protein = False
                        current_gene_comment_lines.append(line)

                    # since we're already doing our own header, don't duplicate
                    #  the old one
                    elif line.startswith("##gff-version"):
                        continue

                    else:
                        if line.startswith("##Protein "):
                            m = protein_header_re.match(line)
                            if m:
                                writing_protein = True
                                if protein_out is not None:
                                    protein_out.write(">{0}\n".format(
                                        polypeptide_lookup[m.group(1)]))
                            else:
                                raise Exception("ERROR: Expected line to match: ##Protein N")
                        elif writing_protein:
                            # drop the leading '##' to leave the bare residues
                            if protein_out is not None:
                                protein_out.write(line[2:])

                        current_gene_comment_lines.append(line)

                    continue

                cols = line.split("\t")

                if len(cols) != 9:
                    continue

                # keep only the part of the molecule ID before the first space
                mol_id = cols[0]
                mol_id_m = mol_id_re.match(mol_id)
                if mol_id_m:
                    mol_id = mol_id_m.group(1)

                feat_type = cols[2]

                ## we expect only gene types here
                if feat_type not in ['gene', 'CDS']:
                    raise Exception("ERROR: expected only 'gene' or 'CDS' feature types as input (depending on metagenemark version).")

                m_gene = gene_num_re.match(cols[8])
                if m_gene:
                    gene_num = m_gene.group(1)
                else:
                    raise Exception("ERROR: expected 9th column to have gene ids like: gene_id 5")

                ## initialize this assembly if we haven't seen it yet
                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)

                current_assembly = assemblies[mol_id]

                # GFF is 1-based inclusive; subtract one from the start for fmin
                fmin = int(cols[3]) - 1
                fmax = int(cols[4])
                strand = cols[6]

                gene = things.Gene(id="{0}.gene.{1}".format(args.prefix, gene_num))
                gene.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)

                mRNA = things.mRNA(id="{0}.mRNA.{1}".format(args.prefix, gene_num),
                                   parent=gene.id)
                mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                gene.add_mRNA(mRNA)

                CDS = things.CDS(id="{0}.CDS.{1}".format(args.prefix, gene_num),
                                 parent=mRNA.id)
                CDS.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand,
                              phase=int(cols[7]))
                mRNA.add_CDS(CDS)

                exon = things.Exon(id="{0}.exon.{1}".format(args.prefix, gene_num),
                                   parent=mRNA.id)
                exon.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                mRNA.add_exon(exon)

                polypeptide_id = "{0}.polypeptide.{1}".format(args.prefix, gene_num)
                polypeptide = things.Polypeptide(id=polypeptide_id, parent=mRNA.id)
                polypeptide.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                      strand=strand)
                mRNA.add_polypeptide(polypeptide)
                polypeptide_lookup[gene_num] = polypeptide_id

                gene.print_as(fh=fout, source='GeneMark.hmm', format='gff3')
                fout.write("".join(current_gene_comment_lines))
                current_gene_comment_lines = list()

            # NOTE(review): comment lines that follow the final gene row are
            #  still buffered here and are never written to the output --
            #  confirm whether that is intended.
    finally:
        if protein_out is not None:
            protein_out.close()
def parse_annotation_line(line, genes, molecules):
    """Parse one row of a 10-column annotation tab file into gene objects.

    The row's transcript (column 1) gets a new mRNA with one CDS (column 2)
    carrying a functional annotation: the product name comes from column 6,
    or from column 4 when column 6 is empty; EC numbers come from the
    comma-separated column 10 and GO terms from the comma-separated column 9.
    The mRNA is added to the gene returned for the transcript by
    ``get_gene_id_from_transcript``, which is created and stored in ``genes``
    if it is not there yet.

    Args:
        line: one raw line of the tab-delimited file.
        genes: dict of gene ID -> gene object; updated in place.
        molecules: container of known molecule IDs; the transcript ID must be
            in it.

    Returns:
        False when the line is ignored because it does not have exactly 10
        columns, otherwise None.

    Raises:
        Exception: if the transcript ID is not present in ``molecules``.
    """
    cols = line.split("\t")

    if len(cols) != 10:
        print(
            "WARNING: Ignoring the following line because I expected 10 columns:\n{0}"
            .format(line))
        return False

    # only the last column can carry the line terminator
    cols[9] = cols[9].rstrip()

    transcript_id = cols[0]
    CDS_id = cols[1]

    if transcript_id not in molecules:
        raise Exception(
            "ERROR: found molecule {0} in referenced in annotation tab file but not in genomic_fasta file"
            .format(transcript_id))

    gene_id = get_gene_id_from_transcript(transcript_id)

    # str.split never yields None, so "missing" has to be tested as "empty"
    gene_product_name = cols[5] if cols[5] else cols[3]

    if gene_id in genes:
        gene = genes[gene_id]
    else:
        gene = things.Gene(id=gene_id)
        genes[gene_id] = gene

    mRNA = things.mRNA(id=transcript_id)
    gene.add_mRNA(mRNA)

    # Must not be called 'annotation': that would shadow the annotation
    #  module for the whole function and break the constructor calls below.
    annot = annotation.FunctionalAnnotation(product_name=gene_product_name)

    # entries without any digits (including an empty column) are ignored
    ec_num_pattern = re.compile(r'\d+.')
    for ec_num in cols[9].split(','):
        if ec_num_pattern.search(ec_num):
            annot.add_ec_number(annotation.ECAnnotation(number=ec_num))

    go_pattern = re.compile(r'(\d+)')
    for go_term in cols[8].split(','):
        if go_pattern.search(go_term):
            annot.add_go_annotation(annotation.GOAnnotation(go_id=go_term))

    CDS = things.CDS(id=CDS_id, annotation=annot)
    mRNA.add_CDS(CDS)
def main():
    """Convert Cufflinks GTF output into GFF3.

    Two export modes are supported through ``-e/--export_mode``:

    * ``model`` (default): 'transcript' rows open a new mRNA and, whenever
      the gene_id changes, a new gene after the previous one has been
      written.  'exon' rows add both a CDS and an exon with the same
      coordinates to the current mRNA.
    * ``cDNA_match``: every 'transcript' row becomes a cDNA_match and each
      'exon' row a match_part of the current match.

    Output goes to ``-o/--output_file`` or to STDOUT when it is omitted.
    Rows that do not have 9 tab-separated columns are reported on STDERR, so
    the report cannot end up inside GFF3 written to STDOUT, and are skipped.

    Raises:
        Exception: if ``--export_mode`` is neither 'model' nor 'cDNA_match'.
    """
    parser = argparse.ArgumentParser(
        description='A GTF -> GFF3 conversion script for Cufflinks output')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input GTF file')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output GFF file to be created')
    parser.add_argument('-e', '--export_mode', type=str, required=False,
                        default='model',
                        help='Export mode for results (model or cDNA_match)')
    args = parser.parse_args()

    if args.export_mode not in ['model', 'cDNA_match']:
        raise Exception(
            "ERROR: the only valid values for --export_mode are 'model' or 'cDNA_match'"
        )

    assemblies = dict()
    current_gene = None
    current_RNA = None
    current_match = None
    exon_count_by_RNA = defaultdict(int)

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        with open(args.input_file, "r") as fin:
            for line in fin:
                cols = line.split("\t")

                if len(cols) != 9:
                    print("SKIPPING: {0}".format(line), file=sys.stderr)
                    continue

                mol_id = cols[0]

                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)

                current_assembly = assemblies[mol_id]
                ftype = cols[2]
                # GTF is 1-based inclusive; subtract one from the start for fmin
                fmin = int(cols[3]) - 1
                fmax = int(cols[4])
                strand = cols[6]

                # this makes it look like GFF column 9 so that
                #  gff.column_9_value(str, key) can be used on it
                col9 = cols[8].replace(' "', '="')
                gene_id = gff.column_9_value(col9, 'gene_id').replace('"', '')
                transcript_id = gff.column_9_value(col9, 'transcript_id').replace('"', '')

                if ftype == 'transcript':
                    if args.export_mode == 'model':
                        if current_gene is not None and current_gene.id != gene_id:
                            current_gene.print_as(fh=ofh, source='Cufflinks', format='gff3')

                        if current_gene is None or current_gene.id != gene_id:
                            current_gene = things.Gene(id=gene_id)
                            current_gene.locate_on(target=current_assembly, fmin=fmin,
                                                   fmax=fmax, strand=strand)

                        mRNA = things.mRNA(id=transcript_id, parent=current_gene)
                        mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                       strand=strand)
                        current_gene.add_mRNA(mRNA)
                        current_RNA = mRNA
                        exon_count_by_RNA[transcript_id] = 0
                        current_CDS_phase = 0

                    elif args.export_mode == 'cDNA_match':
                        if current_match is not None and current_match.id != transcript_id:
                            current_match.print_as(fh=ofh, source='Cufflinks', format='gff3')

                        current_match = things.Match(id=transcript_id,
                                                     subclass='cDNA_match',
                                                     length=fmax - fmin)
                        current_match.locate_on(target=current_assembly, fmin=fmin,
                                                fmax=fmax, strand=strand)

                elif ftype == 'exon':
                    exon_number = gff.column_9_value(col9, 'exon_number').replace('"', '')

                    if args.export_mode == 'model':
                        exon_count_by_RNA[transcript_id] += 1

                        cds_id = "{0}.CDS.{1}".format(
                            current_RNA.id, exon_count_by_RNA[current_RNA.id])
                        CDS = things.CDS(id=cds_id, parent=current_RNA)
                        CDS.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                      strand=strand, phase=current_CDS_phase)
                        current_RNA.add_CDS(CDS)

                        # calculate the starting phase for the next CDS feature
                        #  (in case there is one)
                        current_CDS_phase = 3 - (((fmax - fmin) - current_CDS_phase) % 3)
                        if current_CDS_phase == 3:
                            current_CDS_phase = 0

                        exon_id = "{0}.exon.{1}".format(
                            current_RNA.id, exon_count_by_RNA[current_RNA.id])
                        exon = things.Exon(id=exon_id, parent=current_RNA)
                        exon.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                       strand=strand)
                        current_RNA.add_exon(exon)

                    elif args.export_mode == 'cDNA_match':
                        mp_id = "{0}.match_part.{1}".format(transcript_id, exon_number)
                        mp = things.MatchPart(id=mp_id, parent=current_match,
                                              length=fmax - fmin)
                        mp.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                     strand=strand)
                        current_match.add_part(mp)

        # don't forget to do the last gene or match, if there were any; it gets
        #  the same source label as every feature written before it
        if args.export_mode == 'model':
            if current_gene is not None:
                current_gene.print_as(fh=ofh, source='Cufflinks', format='gff3')
        elif args.export_mode == 'cDNA_match':
            if current_match is not None:
                current_match.print_as(fh=ofh, source='Cufflinks', format='gff3')
    finally:
        # never close STDOUT, only a file we opened ourselves
        if ofh is not sys.stdout:
            ofh.close()