def main():
    """Report mRNAs whose translated CDS contains internal (non-terminal) stops.

    Parses the command line, loads the GFF3 features (and an external genome
    FASTA when one is passed), translates the CDS of every mRNA and counts
    those whose translation still contains a '*' once trailing stops are
    stripped.  Offending translations are written to the optional output
    FASTA (-o) and the first N of them are echoed to STDOUT (-p).
    """
    parser = argparse.ArgumentParser(
        description='Checks the CDS features against a genome sequence to report non-terminal internal stops.')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-g', '--genome_fasta', type=str, required=False,
                        help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-p', '--print_n_with_stops', type=int, required=False, default=0,
                        help='Optional. Pass the number of sequences with internal stops you want printed (usually for debugging purposes)')
    parser.add_argument('-o', '--output_fasta', type=str, required=False,
                        help='Optional. Writes an output (translated) FASTA file for all those features which had internal stops')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_stops = 0

    # If this is set to the ID of any particular mRNA feature, the CDS and
    # translation will be printed for it.
    debug_mRNA = None

    fasta_out_fh = None
    if args.output_fasta is not None:
        fasta_out_fh = open(args.output_fasta, 'wt')

    # try/finally so the FASTA handle is flushed and closed even if a
    # feature lookup or translation raises part-way through
    try:
        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            for gene in assembly.genes():
                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    total_mRNAs += 1

                    if debug_mRNA is not None and mRNA.id == debug_mRNA:
                        print("CDS:{0}".format(coding_seq))

                    # translate once; the same string serves both the check
                    # and the report
                    translated_seq = biocodeutils.translate(coding_seq)

                    # stops at the very end are expected, so only a '*'
                    # left after stripping them counts as internal
                    if '*' not in translated_seq.rstrip('*'):
                        continue

                    mRNAs_with_stops += 1

                    if fasta_out_fh is not None:
                        loc = mRNA.location_on(assembly)
                        fasta_out_fh.write(">{0} {1} {2}-{3} ({4})\n".format(
                            mRNA.id, assembly_id, loc.fmin + 1, loc.fmax, loc.strand))
                        fasta_out_fh.write("{0}\n".format(biocodeutils.wrapped_fasta(translated_seq)))

                    if debug_mRNA is not None and mRNA.id == debug_mRNA:
                        print("TRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id))

                    # only the first N offenders are echoed
                    if mRNAs_with_stops <= args.print_n_with_stops:
                        print("\nmRNA id: {0}".format(mRNA.id))
                        print("\tCDS:{0}".format(coding_seq))
                        print("\tTRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id))
    finally:
        if fasta_out_fh is not None:
            fasta_out_fh.close()

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with embedded stops: {0}".format(mRNAs_with_stops))
def check_and_update_phase(CDS):
    """Set the phase of ``CDS`` to the reading frame with the fewest internal stops.

    The CDS residues are translated starting at offsets 0, 1 and 2, trailing
    stops are stripped, and the remaining '*' characters are counted.  The
    annotated phase is kept whenever no other frame has strictly fewer stops,
    so an equally good annotation is never rewritten.  Otherwise the
    lowest-numbered best frame is stored on the CDS location and an INFO line
    is printed.

    Args:
        CDS: feature object providing ``location()``, ``get_residues()``,
            ``residues`` and ``id``.
    """
    loc = CDS.location()
    # Return value unused; presumably this populates CDS.residues, which is
    # read below.
    CDS.get_residues()

    stop_counts = {}
    for phase in (0, 1, 2):
        protein_seq = biocodeutils.translate(CDS.residues[phase:]).rstrip('*')
        stop_counts[phase] = protein_seq.count('*')

    # min() returns the first minimum, i.e. the lowest phase among ties
    best_phase = min((0, 1, 2), key=stop_counts.get)
    best_phase_stop_count = stop_counts[best_phase]

    # None when the annotated phase is not one of 0, 1, 2
    orig_phase_stop_count = stop_counts.get(loc.phase)

    # Keep the annotated phase unless another frame is strictly better.
    if orig_phase_stop_count is not None and orig_phase_stop_count <= best_phase_stop_count:
        return

    print("INFO: CDS {0} at coordinate:{1}, phase:{2} had {3} stops.  Updating to phase:{4} which had {5}".format(
        CDS.id, loc.fmin, loc.phase, orig_phase_stop_count, best_phase, best_phase_stop_count))
    loc.phase = best_phase
def check_and_update_phase(CDS):
    """Pick the reading frame of ``CDS`` with the fewest internal stops.

    Each of the phases 0, 1 and 2 is translated (trailing stops stripped) and
    its '*' characters are counted.  If some phase has strictly fewer stops
    than the currently annotated one, or the annotated phase is not one of
    0, 1, 2, the location's phase is replaced by the lowest-numbered best
    phase and an INFO line is printed.  A tie with the annotated phase leaves
    the annotation untouched.

    Args:
        CDS: feature object providing ``location()``, ``get_residues()``,
            ``residues`` and ``id``.
    """
    loc = CDS.location()
    # Return value unused; presumably this populates CDS.residues, which is
    # read below.
    CDS.get_residues()

    best_phase = None
    orig_phase_stop_count = None
    best_phase_stop_count = None

    for phase in [0, 1, 2]:
        protein_seq = biocodeutils.translate(CDS.residues[phase:]).rstrip('*')
        stop_count = protein_seq.count('*')

        if phase == loc.phase:
            orig_phase_stop_count = stop_count

        if best_phase is None or stop_count < best_phase_stop_count:
            best_phase = phase
            best_phase_stop_count = stop_count

    # An annotated phase that is as good as the best one wins the tie;
    # without this a lower-numbered phase with the same stop count would
    # overwrite a perfectly valid annotation.
    if orig_phase_stop_count is not None and orig_phase_stop_count == best_phase_stop_count:
        best_phase = loc.phase

    if best_phase != loc.phase:
        print("INFO: CDS {0} at coordinate:{1}, phase:{2} had {3} stops.  Updating to phase:{4} which had {5}".format(
            CDS.id, loc.fmin, loc.phase, orig_phase_stop_count, best_phase, best_phase_stop_count))
        loc.phase = best_phase
def main():
    """Export the CDS or protein sequence of every mRNA in a GFF3 file as FASTA.

    Parses the command line, loads the GFF3 features (and an external FASTA
    when one is passed) and writes one FASTA record per mRNA to the output
    file, or to STDOUT when no output file is given.  The record ID is the
    mRNA locus_tag when present, otherwise the mRNA id; the first polypeptide
    product name found is appended to the header.  With --check_ends,
    non-canonical start/stop codons are reported on STDERR.
    """
    parser = argparse.ArgumentParser(description='Extracts the protein or CDS seqeunces from a GFF3 file')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input GFF3 file to be read')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output FASTA file to be created')
    parser.add_argument('-t', '--type', type=str, required=False, default='protein',
                        choices=['protein', 'cds'], help='Type of features to export')
    parser.add_argument('-f', '--fasta', type=str, required=False,
                        help='If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option')
    parser.add_argument('--check_ends', dest='check_ends', action='store_true')
    parser.set_defaults(check_ends=False)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    # try/finally so a file we opened is flushed and closed even on error
    try:
        (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

        # only doing the standard codon table for now
        start_codons = ['ATG', 'GTG', 'TTG']
        stop_codons = ['TAG', 'TAA', 'TGA']

        ## add sequence residues from external FASTA file if the user passed one
        if args.fasta is not None:
            biocodeutils.add_assembly_fasta(assemblies, args.fasta)

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    ## initial values of id and header to export (can be overridden by available annotation)
                    export_id = mRNA.id
                    export_header = None

                    if mRNA.locus_tag is not None:
                        export_id = mRNA.locus_tag

                    ## Add the gene product name if there is one (first one found wins)
                    for polypeptide in mRNA.polypeptides():
                        annotation = polypeptide.annotation
                        if annotation is not None and annotation.product_name is not None:
                            export_header = annotation.product_name
                            break

                    if export_header is not None:
                        fout.write(">{0} {1}\n".format(export_id, export_header))
                    else:
                        fout.write(">{0}\n".format(export_id))

                    coding_seq = mRNA.get_CDS_residues()

                    if args.check_ends:
                        # check the starting codon
                        start_codon = coding_seq[0:3].upper()
                        if start_codon not in start_codons:
                            sys.stderr.write("WARN: Non-canonical start codon ({0}) in mRNA {1}\n".format(start_codon, mRNA.id))

                        # ... and the terminal one
                        stop_codon = coding_seq[-3:].upper()
                        if stop_codon not in stop_codons:
                            sys.stderr.write("WARN: Non-canonical stop codon ({0}) in mRNA {1}\n".format(stop_codon, mRNA.id))

                    if args.type == 'cds':
                        fout.write("{0}\n".format(biocodeutils.wrapped_fasta(coding_seq)))
                    else:
                        translated_seq = biocodeutils.translate(coding_seq)
                        fout.write("{0}\n".format(biocodeutils.wrapped_fasta(translated_seq)))
    finally:
        # never close the interpreter's STDOUT, only a file opened here
        if fout is not sys.stdout:
            fout.close()
def main():
    """Write the CDS or protein sequence of each mRNA in a GFF3 file as FASTA.

    Command-line driven: reads the GFF3 (plus an optional external FASTA for
    the assemblies) and emits one record per mRNA to --output_file, or to
    STDOUT when that option is absent.  The record ID prefers the mRNA
    locus_tag over the mRNA id and the header carries the first polypeptide
    product name found.  CDS residues are requested with
    ``for_translation=True``.  With --check_ends, unexpected first/last
    codons are reported on STDERR.
    """
    parser = argparse.ArgumentParser(description='Extracts the protein or CDS seqeunces from a GFF3 file')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input GFF3 file to be read')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output FASTA file to be created')
    parser.add_argument('-t', '--type', type=str, required=False, default='protein',
                        choices=['protein', 'cds'], help='Type of features to export')
    parser.add_argument('-f', '--fasta', type=str, required=False,
                        help='If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option')
    parser.add_argument('--check_ends', dest='check_ends', action='store_true')
    parser.set_defaults(check_ends=False)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    opened_here = False
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')
        opened_here = True

    # The file handle used to be left open; release it on every exit path.
    try:
        (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

        # only doing the standard codon table for now
        start_codons = ['ATG', 'GTG', 'TTG']
        stop_codons = ['TAG', 'TAA', 'TGA']

        ## add sequence residues from external FASTA file if the user passed one
        if args.fasta is not None:
            biocodeutils.add_assembly_fasta(assemblies, args.fasta)

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    ## the locus_tag, when available, replaces the mRNA id in the header
                    export_id = mRNA.id if mRNA.locus_tag is None else mRNA.locus_tag
                    export_header = None

                    ## Add the gene product name if there is one
                    for polypeptide in mRNA.polypeptides():
                        if polypeptide.annotation is None:
                            continue
                        if polypeptide.annotation.product_name is not None:
                            export_header = polypeptide.annotation.product_name
                            break

                    fout.write(">{0}".format(export_id))
                    if export_header is None:
                        fout.write("\n")
                    else:
                        fout.write(" {0}\n".format(export_header))

                    coding_seq = mRNA.get_CDS_residues(for_translation=True)

                    if args.check_ends:
                        # check the starting codon
                        start_codon = coding_seq[0:3].upper()
                        if start_codon not in start_codons:
                            sys.stderr.write("WARN: Non-canonical start codon ({0}) in mRNA {1}\n".format(start_codon, mRNA.id))

                        # check the terminal codon
                        stop_codon = coding_seq[-3:].upper()
                        if stop_codon not in stop_codons:
                            sys.stderr.write("WARN: Non-canonical stop codon ({0}) in mRNA {1}\n".format(stop_codon, mRNA.id))

                    if args.type == 'cds':
                        fout.write("{0}\n".format(biocodeutils.wrapped_fasta(coding_seq)))
                    else:
                        translated_seq = biocodeutils.translate(coding_seq)
                        fout.write("{0}\n".format(biocodeutils.wrapped_fasta(translated_seq)))
    finally:
        # STDOUT belongs to the interpreter; only close what was opened here
        if opened_here:
            fout.close()
def main():
    """Report mRNAs lacking a terminal stop and scan for the next in-frame stop.

    Parses the command line, loads the GFF3 features (and an external genome
    FASTA when one is passed) and translates the CDS of every mRNA.  For each
    translation that does not end in '*', the assembly residues are walked one
    codon at a time from the end of the CDS (downstream in gene orientation)
    until a stop codon is seen or the mRNA boundary is passed; progress is
    printed to STDOUT.

    NOTE(review): --output_gff is parsed but not read anywhere in this
    function, and the argparse description looks copied from another script.
    """
    parser = argparse.ArgumentParser(
        description='Checks the CDS features against a genome sequence to report/correct phase columns.')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-g', '--genome_fasta', type=str, required=False,
                        help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-o', '--output_gff', type=str, required=False,
                        help='Optional. Writes an output GFF3 file with CDS (and containing features) extended to nearest stop')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0

    stop_codons = ['TAG', 'TAA', 'TGA']
    # The scan reads forward-strand assembly residues, so the stop codon of a
    # reverse-strand gene appears there reverse-complemented.
    revcomp_stop_codons = ['CTA', 'TTA', 'TCA']

    for assembly_id in assemblies:
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id, assembly.length))

        for gene in assembly.genes():
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = biocodeutils.translate(coding_seq)

                if translation.endswith('*'):
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(mRNA.id, gene.id))
                mRNA_loc = mRNA.location_on(assembly)
                CDSs = sorted(mRNA.CDSs())

                if mRNA_loc.strand == 1:
                    # forward: continue past the right edge of the last CDS
                    CDS_pos = CDSs[-1].location_on(assembly).fmax
                    mRNA_limit = mRNA_loc.fmax
                    codon_step_size = 3
                    strand_stop_codons = stop_codons
                else:
                    # reverse: continue past the left edge of the first CDS
                    CDS_pos = CDSs[0].location_on(assembly).fmin
                    mRNA_limit = mRNA_loc.fmin
                    codon_step_size = -3
                    strand_stop_codons = revcomp_stop_codons

                print("\tmRNA:{0}-{1}, CDS end: {2}\n\tExtending".format(
                    mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos), end='')

                new_stop_found = False

                # We have to step backwards to start if on the reverse strand
                if codon_step_size < 0:
                    CDS_pos += codon_step_size

                while True:
                    if (codon_step_size < 0 and CDS_pos < mRNA_limit) or \
                       (codon_step_size > 0 and CDS_pos > mRNA_limit):
                        print("  Reached the mRNA limit")
                        break

                    next_codon = assembly.residues[CDS_pos:CDS_pos + 3]
                    print(".{0}({1})".format(next_codon, CDS_pos), end='')

                    # upper() so lower-case (e.g. soft-masked) residues still match
                    if next_codon.upper() in strand_stop_codons:
                        new_stop_found = True
                        print(" Found a stop")
                        break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                else:
                    print("\tCDS_pos:   SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with terminal stops: {0}".format(mRNAs_with_terminal_stops))
def print_biogene(gene=None, fh=None, on=None):
    '''
    This method accepts a Gene object located on an Assembly object (from biothings.py) and prints
    the feature graph for that gene in Genbank flat file format, including the gene, RNA and CDS

    Arguments:
      gene - the Gene to print (required)
      fh   - writable file handle; sys.stdout is used when omitted
      on   - molecule the coordinates are reported on; taken from gene.location().on
             when omitted

    Raises Exception if no gene is passed or if an mRNA has no CDS children.  A gene without
    a locus_tag only produces a warning on STDERR.
    '''
    if gene is None:
        raise Exception("ERROR: The print_biogene() function requires a biogene to be passed via the 'gene' argument")

    # fh defaults to None in the signature; fall back to STDOUT instead of
    # failing on the first write
    if fh is None:
        fh = sys.stdout

    ## we can auto-detect the molecule if the user didn't pass one
    #   and if there's only one.
    if on is None:
        on = gene.location().on

    # fmin is reported as fmin + 1 and fmax as-is throughout this function
    gene_loc = gene.location_on(on)
    gene_start = gene_loc.fmin + 1
    gene_stop = gene_loc.fmax

    if gene_loc.strand == 1:
        fh.write(" gene {0}..{1}\n".format(gene_start, gene_stop))
    else:
        fh.write(" gene complement({0}..{1})\n".format(gene_start, gene_stop))

    if gene.locus_tag is None:
        sys.stderr.write("WARNING: No locus_tag found on gene {0}\n".format(gene.id))
    else:
        fh.write(" /locus_tag=\"{0}\"\n".format(gene.locus_tag))

    for mRNA in sorted(gene.mRNAs()):
        mRNA_loc = mRNA.location_on(on)

        ###########################
        ## write the mRNA feature (made up of exon fragments)
        mRNA_loc_segments = list()
        for exon in sorted(mRNA.exons()):
            exon_loc = exon.location_on(on)
            mRNA_loc_segments.append([exon_loc.fmin + 1, exon_loc.fmax])

        mRNA_loc_string = segments_to_string(mRNA_loc_segments)

        if mRNA_loc.strand == 1:
            fh.write(" mRNA {0}\n".format(mRNA_loc_string))
        else:
            fh.write(" mRNA complement({0})\n".format(mRNA_loc_string))

        # Handle the locus tag, but we've already warned if not present on the gene, so don't
        #  do it again here.
        if gene.locus_tag is not None:
            fh.write(" /locus_tag=\"{0}\"\n".format(gene.locus_tag))

        # Print the product name actually stored on the mRNA annotation.  A
        # leftover debug assignment used to overwrite it with a placeholder
        # (and mutate the caller's object) before printing.
        if mRNA.annotation is not None and mRNA.annotation.product_name is not None:
            fh.write(" /product=\"{0}\"\n".format(mRNA.annotation.product_name))

        ###########################
        ## write the CDS feature (made up of CDS fragments)
        cds_loc_segments = list()

        if len(mRNA.CDSs()) < 1:
            raise Exception("ERROR: Encountered an mRNA ({0}) without an CDS children".format(mRNA.id))

        for cds in sorted(mRNA.CDSs()):
            cds_loc = cds.location_on(on)
            cds_loc_segments.append([cds_loc.fmin + 1, cds_loc.fmax])

        cds_loc_string = segments_to_string(cds_loc_segments)

        # the strand of the last CDS fragment decides the orientation
        if cds_loc.strand == 1:
            fh.write(" CDS {0}\n".format(cds_loc_string))
        else:
            fh.write(" CDS complement({0})\n".format(cds_loc_string))

        # Handle the locus tag, but we've already warned if not present on the gene, so don't
        #  do it again here.
        if gene.locus_tag is not None:
            fh.write(" /locus_tag=\"{0}\"\n".format(gene.locus_tag))

        ## if there is annotation on the polypeptide, include it here
        polypeptides = mRNA.polypeptides()
        if len(polypeptides) == 1 and polypeptides[0].annotation is not None:
            annot = polypeptides[0].annotation

            if annot.product_name is not None:
                fh.write(" /product=\"{0}\"\n".format(annot.product_name))

            for ec_num in annot.ec_numbers:
                fh.write(" /EC_number=\"{0}\"\n".format(ec_num.number))

            for go_annot in annot.go_annotations:
                fh.write(" /db_xref=\"GO:{0}\"\n".format(go_annot.go_id))

        cds_residues = mRNA.get_CDS_residues()
        polypeptide_residues = biocodeutils.translate(cds_residues)

        if len(polypeptide_residues) > 0:
            # This is the easiest case first, where no wrapping is needed.
            # '/translation="' is 14 characters and the closing quote 1, so
            # anything up to WIDTH - 15 residues fits on one line (inclusive).
            if len(polypeptide_residues) <= MAX_FTABLE_CONTENT_WIDTH - 15:
                fh.write(" /translation=\"{0}\"\n".format(polypeptide_residues))
            else:
                # If we get here, we must wrap
                first_chunk = MAX_FTABLE_CONTENT_WIDTH - 14
                fh.write(" /translation=\"{0}\n".format(polypeptide_residues[0:first_chunk]))
                remaining = polypeptide_residues[first_chunk:]
                closing_parens_written = False

                while len(remaining) > 0:
                    if len(remaining) > MAX_FTABLE_CONTENT_WIDTH - 1:
                        # full line, no room left for the closing quote
                        fh.write(" {0}\n".format(remaining[0:MAX_FTABLE_CONTENT_WIDTH]))
                        remaining = remaining[MAX_FTABLE_CONTENT_WIDTH:]
                    else:
                        fh.write(" {0}\"\n".format(remaining))
                        remaining = ""
                        closing_parens_written = True

                # the residues ended exactly at a line boundary, so the
                # closing quote needs a line of its own (seen on G675_02159)
                if not closing_parens_written:
                    fh.write(" \"\n")
def print_biogene(gene=None, fh=None, on=None):
    '''
    This method accepts a Gene object located on an Assembly object (from biothings.py) and prints
    the feature graph for that gene in Genbank flat file format, including the gene, RNA and CDS

    Arguments:
      gene - the Gene to print (required)
      fh   - writable file handle; sys.stdout is used when omitted
      on   - molecule the coordinates are reported on; taken from gene.location().on
             when omitted

    Raises Exception if no gene is passed or if an mRNA has no CDS children.  A gene without
    a locus_tag only produces a warning on STDERR.
    '''
    if gene is None:
        raise Exception("ERROR: The print_biogene() function requires a biogene to be passed via the 'gene' argument")

    # The signature defaults fh to None, which cannot be written to; use
    # STDOUT in that case.
    out = sys.stdout if fh is None else fh

    ## we can auto-detect the molecule if the user didn't pass one
    #   and if there's only one.
    if on is None:
        on = gene.location().on

    # fmin is reported as fmin + 1 and fmax as-is throughout this function
    gene_loc = gene.location_on(on)
    gene_start = gene_loc.fmin + 1
    gene_stop = gene_loc.fmax

    if gene_loc.strand == 1:
        out.write(" gene {0}..{1}\n".format(gene_start, gene_stop))
    else:
        out.write(" gene complement({0}..{1})\n".format(gene_start, gene_stop))

    has_locus_tag = gene.locus_tag is not None
    if has_locus_tag:
        out.write(" /locus_tag=\"{0}\"\n".format(gene.locus_tag))
    else:
        sys.stderr.write("WARNING: No locus_tag found on gene {0}\n".format(gene.id))

    for mRNA in sorted(gene.mRNAs()):
        mRNA_loc = mRNA.location_on(on)

        ###########################
        ## write the mRNA feature (made up of exon fragments)
        mRNA_loc_segments = list()
        for exon in sorted(mRNA.exons()):
            exon_loc = exon.location_on(on)
            mRNA_loc_segments.append([exon_loc.fmin + 1, exon_loc.fmax])

        mRNA_loc_string = segments_to_string(mRNA_loc_segments)

        if mRNA_loc.strand == 1:
            out.write(" mRNA {0}\n".format(mRNA_loc_string))
        else:
            out.write(" mRNA complement({0})\n".format(mRNA_loc_string))

        # The missing-locus_tag warning was already issued for the gene, so
        # it is not repeated per feature.
        if has_locus_tag:
            out.write(" /locus_tag=\"{0}\"\n".format(gene.locus_tag))

        # Emit the product name stored on the mRNA annotation.  This used to
        # be clobbered by a debug default ("Hypothetical protein") that was
        # left enabled and also modified the caller's annotation object.
        mRNA_annot = mRNA.annotation
        if mRNA_annot is not None and mRNA_annot.product_name is not None:
            out.write(" /product=\"{0}\"\n".format(mRNA_annot.product_name))

        ###########################
        ## write the CDS feature (made up of CDS fragments)
        if len(mRNA.CDSs()) < 1:
            raise Exception("ERROR: Encountered an mRNA ({0}) without an CDS children".format(mRNA.id))

        cds_loc_segments = list()
        for cds in sorted(mRNA.CDSs()):
            cds_loc = cds.location_on(on)
            cds_loc_segments.append([cds_loc.fmin + 1, cds_loc.fmax])

        cds_loc_string = segments_to_string(cds_loc_segments)

        # orientation is taken from the last CDS fragment visited above
        if cds_loc.strand == 1:
            out.write(" CDS {0}\n".format(cds_loc_string))
        else:
            out.write(" CDS complement({0})\n".format(cds_loc_string))

        if has_locus_tag:
            out.write(" /locus_tag=\"{0}\"\n".format(gene.locus_tag))

        ## if there is annotation on the polypeptide, include it here
        polypeptides = mRNA.polypeptides()
        if len(polypeptides) == 1 and polypeptides[0].annotation is not None:
            annot = polypeptides[0].annotation

            if annot.product_name is not None:
                out.write(" /product=\"{0}\"\n".format(annot.product_name))

            for ec_num in annot.ec_numbers:
                out.write(" /EC_number=\"{0}\"\n".format(ec_num.number))

            for go_annot in annot.go_annotations:
                out.write(" /db_xref=\"GO:{0}\"\n".format(go_annot.go_id))

        cds_residues = mRNA.get_CDS_residues()
        polypeptide_residues = biocodeutils.translate(cds_residues)

        if len(polypeptide_residues) == 0:
            continue

        # Single-line case: 14 characters of '/translation="' plus the
        # residues plus the closing quote must fit in the content width,
        # hence the inclusive bound of WIDTH - 15.
        if len(polypeptide_residues) <= MAX_FTABLE_CONTENT_WIDTH - 15:
            out.write(" /translation=\"{0}\"\n".format(polypeptide_residues))
            continue

        # If we get here, we must wrap
        first_chunk = MAX_FTABLE_CONTENT_WIDTH - 14
        out.write(" /translation=\"{0}\n".format(polypeptide_residues[0:first_chunk]))
        remaining = polypeptide_residues[first_chunk:]
        closing_parens_written = False

        while len(remaining) > 0:
            if len(remaining) > MAX_FTABLE_CONTENT_WIDTH - 1:
                # a full line of residues leaves no room for the quote
                out.write(" {0}\n".format(remaining[0:MAX_FTABLE_CONTENT_WIDTH]))
                remaining = remaining[MAX_FTABLE_CONTENT_WIDTH:]
            else:
                out.write(" {0}\"\n".format(remaining))
                remaining = ""
                closing_parens_written = True

        # residues ended exactly on a line boundary: the closing quote goes
        # on a line of its own (seen on G675_02159)
        if not closing_parens_written:
            out.write(" \"\n")
def main():
    """Count and report mRNAs whose CDS translation has internal stops.

    Command-line driven: loads the GFF3 features (plus an optional genome
    FASTA), translates the CDS of each mRNA and flags those with a '*' left
    in the translation after trailing stops are stripped.  Flagged
    translations go to the optional --output_fasta file, the first
    --print_n_with_stops of them are printed, and summary counts are printed
    at the end.
    """
    parser = argparse.ArgumentParser(
        description='Checks the CDS features against a genome sequence report non-terminal internal stops.')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-g', '--genome_fasta', type=str, required=False,
                        help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-p', '--print_n_with_stops', type=int, required=False, default=0,
                        help='Optional. Pass the number of sequences with internal stops you want printed (usually for debugging purposes)')
    parser.add_argument('-o', '--output_fasta', type=str, required=False,
                        help='Optional. Writes an output (translated) FASTA file for all those features which had internal stops')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_stops = 0

    # If this is set to the ID of any particular mRNA feature, the CDS and
    # translation will be printed for it.
    debug_mRNA = None

    fasta_out_fh = None
    if args.output_fasta is not None:
        fasta_out_fh = open(args.output_fasta, 'wt')

    # The handle used to be left open for the interpreter to reclaim; close
    # it explicitly, including when processing fails half-way.
    try:
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    is_debug_target = debug_mRNA is not None and mRNA.id == debug_mRNA

                    coding_seq = mRNA.get_CDS_residues()
                    total_mRNAs += 1

                    if is_debug_target:
                        print("CDS:{0}".format(coding_seq))

                    # one translation serves both the test and the report
                    translated_seq = biocodeutils.translate(coding_seq)

                    # terminal stops are fine; anything left is internal
                    if translated_seq.rstrip('*').count('*') > 0:
                        mRNAs_with_stops += 1

                        if fasta_out_fh is not None:
                            loc = mRNA.location_on(assemblies[assembly_id])
                            fasta_out_fh.write(">{0} {1} {2}-{3} ({4})\n".format(
                                mRNA.id, assembly_id, loc.fmin + 1, loc.fmax, loc.strand))
                            fasta_out_fh.write("{0}\n".format(biocodeutils.wrapped_fasta(translated_seq)))

                        if is_debug_target:
                            print("TRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id))

                        # echo only the first N flagged mRNAs
                        if mRNAs_with_stops <= args.print_n_with_stops:
                            print("\nmRNA id: {0}".format(mRNA.id))
                            print("\tCDS:{0}".format(coding_seq))
                            print("\tTRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id))
    finally:
        if fasta_out_fh is not None:
            fasta_out_fh.close()

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with embedded stops: {0}".format(mRNAs_with_stops))