def main():
    """Parse the command line and load the reference and alignment GFF3 files.

    The body shown here only builds the option parser and reads both inputs
    through ``biocodegff.get_gff3_features``; nothing is written to
    ``--output_file`` in this function as it stands.
    """
    cli = argparse.ArgumentParser(
        description='Reports statistics of reference gene coverage and extension by aligned RNA-seq transcript data.')

    # All three options are required strings; only the flags and help differ.
    option_specs = (
        (('-r', '--reference_file'),
         'GFF3 file of a reference annotation'),
        (('-q', '--alignment_file'),
         'GFF3 file with RNA-seq assembly transcript features aligned to the same reference genome. Usually with something like GMAP.'),
        (('-o', '--output_file'),
         'Path to an output file to be created'),
    )
    for flags, help_text in option_specs:
        cli.add_argument(*flags, type=str, required=True, help=help_text)

    args = cli.parse_args()

    ref_assemblies, ref_features = biocodegff.get_gff3_features(args.reference_file)
    qry_assemblies, qry_features = biocodegff.get_gff3_features(args.alignment_file)
def main():
    """Give every mRNA of a multi-mRNA gene its own gene feature.

    The first mRNA stays on the original gene, whose coordinates and strand
    are reset to that mRNA's. Each further mRNA is detached and written under
    a new gene whose ID is the original ID plus the mRNA's ordinal (``_2``,
    ``_3``, ...). GFF3 goes to ``--output_file``, or to STDOUT if omitted.
    """
    parser = argparse.ArgumentParser(
        description='Checks for genes with multiple mRNA children and creates new genes for each.')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    ## output will either be a file or STDOUT
    writing_to_file = args.output_file is not None
    ofh = open(args.output_file, 'wt') if writing_to_file else sys.stdout

    try:
        ofh.write("##gff-version 3\n")

        for assembly_id in assemblies:
            current_assembly = assemblies[assembly_id]

            for gene in current_assembly.genes():
                # Snapshot the children: remove_mRNA() below may edit the
                # gene's own list, which would otherwise skip entries and
                # shrink the length checked after the loop.
                mRNAs = list(gene.mRNAs())

                for rnas_found, mRNA in enumerate(mRNAs, start=1):
                    if rnas_found == 1:
                        # the first mRNA remains with the original gene
                        continue

                    mRNA_loc = mRNA.location_on(current_assembly)
                    gene.remove_mRNA(mRNA)

                    # Diagnostics go to STDERR so they can never be mixed
                    # into GFF3 being written to STDOUT.
                    sys.stderr.write("INFO: splitting mRNA off gene {0}\n".format(gene.id))

                    new_gene = biothings.Gene(id="{0}_{1}".format(gene.id, rnas_found))
                    new_gene.locate_on(target=current_assembly, fmin=mRNA_loc.fmin,
                                       fmax=mRNA_loc.fmax, strand=mRNA_loc.strand)
                    new_gene.add_RNA(mRNA)
                    new_gene.print_as(fh=ofh, format='gff3')

                if len(mRNAs) > 1:
                    # shrink the original gene to its one remaining mRNA
                    gene_loc = gene.location_on(current_assembly)
                    mRNA_loc = mRNAs[0].location_on(current_assembly)
                    gene_loc.fmin = mRNA_loc.fmin
                    gene_loc.fmax = mRNA_loc.fmax
                    gene_loc.strand = mRNA_loc.strand

                gene.print_as(fh=ofh, format='gff3')
    finally:
        if writing_to_file:
            ofh.close()
def main():
    """Run the phase check on every CDS and write the genes back out as GFF3.

    Reads ``--input_file``, optionally attaches sequence from
    ``--genome_fasta``, calls ``check_and_update_phase`` on each CDS of each
    mRNA, and prints every gene to ``--output_file`` with ``--source`` as
    column 2.
    """
    parser = argparse.ArgumentParser(
        description='Checks the CDS features against a genome sequence to report/correct phase columns.')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-o', '--output_file', type=str, required=True,
                        help='Path to an output file to be created')
    parser.add_argument('-g', '--genome_fasta', type=str, required=False,
                        help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-s', '--source', type=str, required=False, default='.',
                        help='Optional. Sets the value for column 2 in all rows. Default = .')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # The context manager guarantees the output is flushed and closed even
    # if a phase check raises part-way through.
    with open(args.output_file, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        # deal with the FASTA file if the user passed one
        if args.genome_fasta is not None:
            process_assembly_fasta(assemblies, args.genome_fasta)

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    for CDS in mRNA.CDSs():
                        check_and_update_phase(CDS)

                gene.print_as(fh=fout, source=args.source, format='gff3')
def main():
    """Write each mRNA isoform of a multi-isoform gene as its own gene model.

    Genes with a single mRNA (or none) are printed unchanged. For genes with
    several mRNAs, one new gene per mRNA is created with the mRNA's
    coordinates and an ID of ``<original gene id>_<n>``, n counting from 1.
    """
    parser = argparse.ArgumentParser(
        description='Splits all GFF3 mRNA isoforms into their own gene models')

    ## Get the variables
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Input GFF3 file')
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Output GFF3 file')
    args = parser.parse_args()

    with open(args.output_file, 'wt') as ofh:
        print("INFO: Parsing GFF3 features\n")
        (assemblies, ref_features) = biocodegff.get_gff3_features(args.input_file)

        print("INFO: Finding genes with isoforms and splitting them\n")
        ofh.write("##gff-version 3\n")

        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            for gene in assembly.genes():
                mRNAs = gene.mRNAs()

                # only changing the gene features with isoforms
                if len(mRNAs) <= 1:
                    gene.print_as(fh=ofh, source='IGS', format='gff3')
                    continue

                # Capture the ID once. The parent-ID assignment below
                # presumably targets this same gene object, which would
                # otherwise make later IDs compound (X_1_2, X_1_2_3, ...).
                base_gene_id = str(gene.id)

                for counter, mRNA in enumerate(mRNAs, start=1):
                    new_gene_id = "{0}_{1}".format(base_gene_id, counter)
                    mRNA_loc = mRNA.location()

                    print("Splitting " + base_gene_id)

                    # create a new gene model, correcting the gene coords to the mRNA coords
                    new_gene = biothings.Gene(id=new_gene_id)
                    new_gene.locate_on(target=assembly, fmin=mRNA_loc.fmin,
                                       fmax=mRNA_loc.fmax, strand=mRNA_loc.strand)

                    mRNA.parent.id = new_gene_id

                    # Now add the mRNA to the gene model
                    new_gene.add_mRNA(mRNA)

                    # print out the new gene model
                    new_gene.print_as(fh=ofh, source='IGS', format='gff3')
def main():
    """Randomly split the mRNAs of a GFF3 annotation into two disjoint sets.

    mRNAs with more exons than ``--max_exon_count`` (when given) are ignored.
    ``--training_set_size`` mRNAs are sampled first, then
    ``--evaluation_set_size`` from the remainder, and each set is handed to
    ``export_mRNAs_to_file``.

    Raises:
        Exception: if ``--retain_composition`` is passed (not implemented),
            or if fewer acceptable mRNAs exist than the two sizes combined.
    """
    parser = argparse.ArgumentParser(
        description='Split an annotation GFF3 into training and evaluation sets')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-ot', '--output_training_file', type=str, required=True,
                        help='GFF3 file to be created with the training genes')
    parser.add_argument('-oe', '--output_evaluation_file', type=str, required=True,
                        help='GFF3 file to be created with the evaluation genes')
    parser.add_argument('-ts', '--training_set_size', type=int, required=False, default=200,
                        help='Number of transcripts to select for training')
    parser.add_argument('-es', '--evaluation_set_size', type=int, required=False, default=100,
                        help='Number of transcripts to select for evaluation')
    parser.add_argument('-me', '--max_exon_count', type=int, required=False,
                        help='Skips any mRNAs with more exons than this')
    parser.add_argument('--retain_composition', dest='retain_composition', action='store_true')
    parser.add_argument('--no_retain_composition', dest='retain_composition', action='store_false')
    parser.set_defaults(retain_composition=False)
    args = parser.parse_args()

    if args.retain_composition is True:
        raise Exception("ERROR: --retain_composition option not yet implemented")

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # key: exon count, value = list of mRNA objects with that count
    # which of these gets used depends on whether --retain_composition is passed
    mRNAs_by_exon_count = defaultdict(list)
    mRNAs = list()
    mRNA_count = 0

    for asm_id in assemblies:
        for gene in assemblies[asm_id].genes():
            for mRNA in gene.mRNAs():
                exon_count = mRNA.exon_count()

                if args.max_exon_count is not None and exon_count > args.max_exon_count:
                    continue

                mRNA_count += 1

                if args.retain_composition is True:
                    mRNAs_by_exon_count[exon_count].append(mRNA)
                else:
                    mRNAs.append(mRNA)

    # (mRNAs_by_exon_count can be dumped here to inspect the exon-count profile)

    # sanity check on the number of available mRNAs
    if (args.training_set_size + args.evaluation_set_size) > mRNA_count:
        raise Exception("ERROR: acceptable mRNA count ({0}) is less than combined training_set_size ({1}) and evaluation_set_size ({2}) options".format(mRNA_count, args.training_set_size, args.evaluation_set_size))

    training_mRNAs = list()
    evaluation_mRNAs = list()

    if args.retain_composition is True:
        print("DEBUG: retaining composition")
    else:
        training_mRNAs = random.sample(mRNAs, args.training_set_size)
        # everything not picked for training; plain set difference is the
        # same set the former `S & (S ^ T)` expression produced
        unselected_mRNAs = list(set(mRNAs) - set(training_mRNAs))
        evaluation_mRNAs = random.sample(unselected_mRNAs, args.evaluation_set_size)

    export_mRNAs_to_file(training_mRNAs, args.output_training_file)
    export_mRNAs_to_file(evaluation_mRNAs, args.output_evaluation_file)
def main():
    """Write one GAF 2.0 row per GO annotation found on GFF3 polypeptides.

    The annotation date defaults to the input file's modification time (UTC,
    YYYYMMDD) and the assigned-by column defaults to ``--database``.

    Raises:
        Exception: if a polypeptide carries a GO ID absent from the GO file.
    """
    parser = argparse.ArgumentParser(
        description='Converts GFF3 files to GO Gene Association Format (GAF)')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-o', '--output_file', type=str, required=True,
                        help='Path to an output file to be created')
    parser.add_argument('-go', '--go_file', type=str, required=True,
                        help='Gene Ontology (GO) file')
    parser.add_argument('-db', '--database', type=str, required=True,
                        help='Database issuing that IDs. Example: UniProtKB')
    parser.add_argument('-dbref', '--db_reference', type=str, required=True,
                        help='DB reference, like PMID:2676709 (column 6)')
    parser.add_argument('-ec', '--evidence_code', type=str, required=False, default='IEA',
                        help='Like IEA (column 7)')
    parser.add_argument('-t', '--taxon_id', type=int, required=True,
                        help='NCBI taxon ID (column 13)')
    parser.add_argument('-ad', '--annotation_date', type=str, required=False,
                        help='Annotation date in YYYYMMDD format. Default = GFF3 file datestamp')
    parser.add_argument('-ab', '--assign_by', type=str, required=False,
                        help='Assign by (column 15) Defaults to --database argument value')
    args = parser.parse_args()

    print("INFO: Parsing GFF3 objects", file=sys.stderr)
    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    print("INFO: Parsing GO file", file=sys.stderr)
    go_lookup = parse_go_file(args.go_file)

    annot_date = args.annotation_date
    if annot_date is None:
        annot_date = time.strftime('%Y%m%d', time.gmtime(os.path.getmtime(args.input_file)))

    assign_by = args.assign_by
    if assign_by is None:
        assign_by = args.database

    with open(args.output_file, 'wt') as ofh:
        ofh.write("!gaf-version: 2.0\n")

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    for polypeptide in mRNA.polypeptides():
                        annotation = polypeptide.annotation

                        # Optional columns must be empty when unknown; a
                        # None here would be written as the text "None".
                        product = annotation.product_name if annotation.product_name is not None else ''
                        gene_sym = annotation.gene_symbol if annotation.gene_symbol is not None else ''

                        for go_annot in annotation.go_annotations:
                            go_id = "GO:{0}".format(go_annot.go_id)

                            if go_id not in go_lookup:
                                raise Exception("ERROR: GO ID {0} not found in provided go.obo file".format(go_id))

                            # Aspect is F, P or C, depending on which component/ontology the term comes from
                            ofh.write("{0}\t{1}\t{1}\t\t{2}\t{3}\t{4}\t\t{5}\t{6}"
                                      "\t{7}\tprotein\ttaxon:{8}\t{9}\t{10}\t"
                                      "\t\n".format(args.database, polypeptide.id, go_id,
                                                    args.db_reference, args.evidence_code,
                                                    go_lookup[go_id], product, gene_sym,
                                                    args.taxon_id, annot_date, assign_by))

    print("INFO: Conversion complete.", file=sys.stderr)
def main():
    """Count mRNAs whose CDS translation contains a non-terminal stop codon.

    Trailing ``*`` characters are stripped before counting, so only internal
    stops qualify. Offending translations are optionally written to
    ``--output_fasta``, and the first ``--print_n_with_stops`` of them are
    printed in full. Totals are printed at the end.
    """
    parser = argparse.ArgumentParser(
        description='Checks the CDS features against a genome sequence to report/correct phase columns.')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-g', '--genome_fasta', type=str, required=False,
                        help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-p', '--print_n_with_stops', type=int, required=False, default=0,
                        help='Optional. Pass the number of sequences with internal stops you want printed (usually for debugging purposes)')
    parser.add_argument('-o', '--output_fasta', type=str, required=False,
                        help='Optional. Writes an output (translated) FASTA file for all those features which had internal stops')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_stops = 0

    # If this is set to the ID of any particular mRNA feature, the CDS and translation will be printed for it.
    debug_mRNA = None

    fasta_out_fh = None
    if args.output_fasta is not None:
        fasta_out_fh = open(args.output_fasta, 'wt')

    try:
        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            for gene in assembly.genes():
                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    total_mRNAs += 1
                    is_debug_target = debug_mRNA is not None and mRNA.id == debug_mRNA

                    if is_debug_target:
                        print("CDS:{0}".format(coding_seq))

                    # translate once and reuse for both the test and the output
                    translated_seq = biocodeutils.translate(coding_seq)

                    if translated_seq.rstrip('*').count('*') == 0:
                        continue

                    mRNAs_with_stops += 1

                    if fasta_out_fh is not None:
                        loc = mRNA.location_on(assembly)
                        fasta_out_fh.write(">{0} {1} {2}-{3} ({4})\n".format(
                            mRNA.id, assembly_id, loc.fmin + 1, loc.fmax, loc.strand))
                        fasta_out_fh.write("{0}\n".format(biocodeutils.wrapped_fasta(translated_seq)))

                    if is_debug_target:
                        print("TRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id))

                    if mRNAs_with_stops <= args.print_n_with_stops:
                        print("\nmRNA id: {0}".format(mRNA.id))
                        print("\tCDS:{0}".format(coding_seq))
                        print("\tTRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id))
    finally:
        if fasta_out_fh is not None:
            fasta_out_fh.close()

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with embedded stops: {0}".format(mRNAs_with_stops))
def main():
    """Trim or drop CDS features so they fit their polypeptide's range.

    Uses hard-coded paths: gene models come from ``canonical.flawed.gff3``,
    polypeptide rows from ``Theileria-all-Theileria1_ourids.gff``, and the
    corrected models are written to ``canonical.corrected.gff3`` with source
    ``GenBank``. mRNAs with no matching polypeptide are reported and left
    untouched.
    """
    flawed_gff_file = 'canonical.flawed.gff3'
    ilri_gff = 'Theileria-all-Theileria1_ourids.gff'
    source = 'GenBank'
    out_gff = 'canonical.corrected.gff3'

    with open(out_gff, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        (assemblies, features) = biocodegff.get_gff3_features(flawed_gff_file)
        print("INFO: loaded {0} assemblies and {1} features".format(len(assemblies), len(features)))

        # polypeptides keyed by the ID of their Parent feature
        polypeptides = dict()

        with open(ilri_gff) as ilri_fh:
            for line in ilri_fh:
                cols = line.split("\t")

                if len(cols) != 9 or cols[2] != 'polypeptide':
                    continue

                polypeptide_id = biocodegff.column_9_value(cols[8], 'ID')
                parent = biocodegff.column_9_value(cols[8], 'Parent')

                polypeptides[parent] = biothings.Polypeptide(id=polypeptide_id, parent=parent)
                # GFF3 start is 1-based; fmin is stored 0-based
                polypeptides[parent].locate_on(target=assemblies[cols[0]], fmin=int(cols[3]) - 1,
                                               fmax=int(cols[4]), strand=cols[6])

        print("DEBUG: loaded {0} polypeptides from ILRI file".format(len(polypeptides)))

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    if mRNA.id not in polypeptides:
                        print("DEBUG: {0} not found as a parent to any polypeptide".format(mRNA.id))
                        continue

                    polypeptide = polypeptides[mRNA.id]

                    # Iterate over a snapshot since delete_CDS() may remove
                    # entries from the mRNA's own list.
                    for CDS in list(mRNA.CDSs()):
                        # NOTE(review): the comparison operators are defined
                        # by biothings; presumably "<" / ">" mean entirely
                        # before / after, and "<=" / ">=" mean overhanging
                        # that end. Confirm against the library.
                        if CDS < polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS <= polypeptide:
                            CDS.location().fmin = polypeptide.location().fmin

                        if CDS > polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS >= polypeptide:
                            CDS.location().fmax = polypeptide.location().fmax

                gene.print_as(fh=fout, source=source, format='gff3')
def main():
    """Report which reference genes have a structurally identical query gene.

    Two genes match when they share coordinates, exon structure and CDS
    structure. Writes ``<output_base>.matches`` (reference ID, query ID per
    row) and ``<output_base>.summary`` (file names and counts), and prints
    the match ratios.
    """
    parser = argparse.ArgumentParser(description='Basic comparison of two GFF3 files')

    parser.add_argument('-r', '--ref', type=str, required=True,
                        help='Path to the reference GFF3 file')
    parser.add_argument('-q', '--qry', type=str, required=True,
                        help='Path to the query GFF3 file')
    parser.add_argument('-o', '--output_base', type=str, required=True,
                        help='Base name/path of the output files to be created')
    args = parser.parse_args()

    (assemblies, ref_features) = biocodegff.get_gff3_features(args.ref)
    ref_genes = get_genes_from_dict(ref_features)

    # the query is loaded onto the assemblies already created for the reference
    (assemblies, qry_features) = biocodegff.get_gff3_features(args.qry, assemblies=assemblies)
    qry_genes = get_genes_from_dict(qry_features)

    ref_matches_found = dict()
    qry_matches_found = dict()

    for ref_gene in ref_genes:
        for qry_gene in qry_genes:
            if ref_gene.has_same_coordinates_as(thing=qry_gene) and \
               ref_gene.shares_exon_structure_with(thing=qry_gene) and \
               ref_gene.shares_CDS_structure_with(thing=qry_gene):
                ref_matches_found[ref_gene.id] = qry_gene.id
                qry_matches_found[qry_gene.id] = ref_gene.id

    print("INFO: {0}/{1} reference genes had a match to a qry gene".format(
        len(ref_matches_found), len(ref_genes)))
    print("INFO: {0}/{1} qry genes had a match to a reference gene".format(
        len(qry_matches_found), len(qry_genes)))

    with open("{0}.matches".format(args.output_base), 'wt') as out_matches:
        for ref_gene_id in ref_matches_found:
            out_matches.write("{0}\t{1}\n".format(ref_gene_id, ref_matches_found[ref_gene_id]))

    with open("{0}.summary".format(args.output_base), 'wt') as out_summary:
        out_summary.write("Reference\t{0}\n".format(args.ref))
        # this row previously repeated the reference path
        out_summary.write("Query\t{0}\n".format(args.qry))
        out_summary.write("Total identical models (with respect to reference)\t{0}\n".format(len(ref_matches_found)))
        out_summary.write("Models in REF not in QRY\t{0}\n".format(len(ref_genes) - len(ref_matches_found)))
        out_summary.write("Models in QRY not in REF\t{0}\n".format(len(qry_genes) - len(qry_matches_found)))
def main():
    """Tag reference genes with the IDs of the query genes they overlap.

    For each reference gene that has a polypeptide, every overlapping query
    gene on the same assembly adds an ``overlaps_old_locusTagID:<id>`` dbxref
    to the first polypeptide's annotation. The annotated reference set is
    then written to ``--output_file`` as GFF3.
    """
    parser = argparse.ArgumentParser(
        description='Script for reporting of possible polycistronic genes transcripts based on a reference annotation and RNA-seq transcript assemblies')

    parser.add_argument('-r', '--reference_file', type=str, required=True,
                        help='GFF3 file of a reference annotation')
    parser.add_argument('-q', '--query_file', type=str, required=True,
                        help='GFF3 file with alternative annotation (such as an RNA-seq assemby)')
    parser.add_argument('-o', '--output_file', type=str, required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    (ref_assemblies, ref_feats) = biocodegff.get_gff3_features(args.reference_file)
    (qry_assemblies, qry_genes) = biocodegff.get_gff3_features(args.query_file)

    for assembly_id in ref_assemblies:
        # we expect to find this assembly ID in the qry set too
        if assembly_id not in qry_assemblies:
            print("WARN: expected to find assembly_id {0} in both reference and query sets".format(assembly_id))
            continue

        for ref_gene in ref_assemblies[assembly_id].genes():
            overlaps = list()
            polypeptides = ref_gene.polypeptides()

            if len(polypeptides) == 0:
                print("WARN: skipped gene {0} because it has no polypeptides".format(ref_gene.id))
                continue

            ref_annot = polypeptides[0].annotation

            for qry_gene in qry_assemblies[assembly_id].genes():
                overlap = ref_gene.overlaps_with(qry_gene)

                if overlap:
                    overlaps.append(overlap)
                    # add a dbxref to the gene
                    ref_annot.add_dbxref("overlaps_old_locusTagID:{0}".format(qry_gene.id))

            if len(overlaps) > 0:
                print("INFO: ref_gene {0} had {1} overlaps".format(ref_gene.id, len(overlaps)))

    # hold the handle in a context manager so it is flushed and closed
    with open(args.output_file, 'w') as ofh:
        biocodegff.print_gff3_from_assemblies(assemblies=ref_assemblies, ofh=ofh)
def main():
    """Export a GFF3 annotation as an NCBI TBL file plus a genomic FASTA.

    Writes ``<output_base>.tbl`` and ``<output_base>.fna``. Unless an
    assembly ID already starts with ``gnl|WGS:``, every assembly is renamed
    to ``gnl|WGS:<prefix>|SeqID|gb|<prefix>01NNNNNN``, numbered from 1 in
    iteration order.
    """
    parser = argparse.ArgumentParser(
        description='Create a TBL file for submission to NCBI from GFF3')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-o', '--output_base', type=str, required=True,
                        help='Base name of output files to be created')
    parser.add_argument('-ln', '--lab_name', type=str, required=True,
                        help='Required by NCBI to identify the submitting group')
    parser.add_argument('-nap', '--ncbi_acc_prefix', type=str, required=True,
                        help='Required and assigned by NCBI')
    parser.add_argument('-gf', '--genomic_fasta', type=str, required=False,
                        help='FASTA file of genomic sequence, if not embedded in GFF')
    parser.add_argument('-go', '--go_obo', type=str, required=False,
                        help='GO terms will not be exported unless you pass the path to a GO OBO file')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    if args.genomic_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genomic_fasta)

    ## We need to first check the ID format
    # pre-formatted IDs are like this: gnl|WGS:XXXX|SeqID|gb|XXXX01xxxxxx
    # Decide before touching any assembly, so a pre-formatted ID found late
    # in the iteration cannot leave earlier assemblies half-renamed.
    reformat_IDs = not any(asm_id.startswith('gnl|WGS:') for asm_id in assemblies)

    if reformat_IDs:
        ## maps old IDs (like tp.assembly.567468735.1) to new ones (like AAGK01000001)
        new_assemblies = dict()

        for asm_num, asm_id in enumerate(assemblies, start=1):
            new_id = "gnl|WGS:{0}|SeqID|gb|{0}01{1:06d}".format(args.ncbi_acc_prefix, asm_num)
            new_assemblies[new_id] = assemblies[asm_id]
            new_assemblies[new_id].id = new_id

        assemblies = new_assemblies

    with open("{0}.tbl".format(args.output_base), 'wt') as ofh:
        biocodetbl.print_tbl_from_assemblies(assemblies=assemblies, ofh=ofh,
                                             go_obo=args.go_obo, lab_name=args.lab_name)

    mset = biothings.AssemblySet()
    mset.load_from_dict(assemblies)
    mset.write_fasta(path="{0}.fna".format(args.output_base))
def main():
    """Read a GFF3 file and write it out in TBL format.

    The assemblies parsed from ``--input_file`` are passed to
    ``biocodetbl.print_tbl_from_assemblies`` together with the optional GO
    OBO path and the lab name; the result is written to ``--output_file``.
    """
    # the description was previously the unedited template placeholder
    parser = argparse.ArgumentParser(description='Converts a GFF3 file to TBL format')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-o', '--output_file', type=str, required=True,
                        help='Path to an output file to be created')
    parser.add_argument('-go', '--go_obo', type=str, required=False,
                        help='GO terms will not be exported unless you pass the path to a GO OBO file')
    parser.add_argument('-ln', '--lab_name', type=str, required=True,
                        help='Required by NCBI to identify the submitting group')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    with open(args.output_file, 'wt') as ofh:
        biocodetbl.print_tbl_from_assemblies(assemblies=assemblies, ofh=ofh,
                                             go_obo=args.go_obo, lab_name=args.lab_name)
def main():
    """Drop genes whose coding sequence is mostly masked (``N``) bases.

    A gene is removed when any of its mRNAs has a CDS in which the share of
    ``N`` characters reaches ``--percent_repeat_coverage_cutoff``. Kept genes
    go to ``--output_gff3``; removed genes go to ``--removed_gff3`` when that
    option is given. A kept/total summary is printed at the end.
    """
    parser = argparse.ArgumentParser(
        description='Removes gene models whose sequence has been masked.')

    parser.add_argument('-i', '--input_gff3', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-m', '--masked_fasta', type=str, required=True,
                        help='FASTA with sequence masked with N characters')
    parser.add_argument('-p', '--percent_repeat_coverage_cutoff', type=int, required=True,
                        help='Genes with an mRNA covered by this percentage of repeats will be excluded')
    parser.add_argument('-o', '--output_gff3', type=str, required=False,
                        help='Path to GFF3 output file to be created')
    parser.add_argument('-r', '--removed_gff3', type=str, required=False,
                        help='If passed, writes the deleted genes to this file')
    args = parser.parse_args()

    # The output path is opened unconditionally below, so fail with a usage
    # message rather than a TypeError from open(None).
    if args.output_gff3 is None:
        parser.error("--output_gff3 is required")

    (assemblies, features) = biocodegff.get_gff3_features(args.input_gff3)
    biocodeutils.add_assembly_fasta(assemblies, args.masked_fasta)

    gene_count = 0
    kept_count = 0

    gff_out = open(args.output_gff3, 'wt')
    rem_out = None

    try:
        gff_out.write("##gff-version 3\n")

        if args.removed_gff3 is not None:
            rem_out = open(args.removed_gff3, 'wt')
            rem_out.write("##gff-version 3\n")

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                keep = True
                gene_count += 1

                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()

                    # an mRNA with no CDS residues has nothing masked
                    if len(coding_seq) == 0:
                        continue

                    n_count = coding_seq.count('N')
                    perc_repeat = (n_count / len(coding_seq)) * 100

                    if perc_repeat >= args.percent_repeat_coverage_cutoff:
                        keep = False

                if keep:
                    kept_count += 1
                    gene.print_as(fh=gff_out, source='IGS', format='gff3')
                elif rem_out is not None:
                    gene.print_as(fh=rem_out, source='IGS', format='gff3')
    finally:
        gff_out.close()
        if rem_out is not None:
            rem_out.close()

    # avoid dividing by zero when the input contained no genes
    kept_percent = ((kept_count / gene_count) * 100) if gene_count else 0.0
    print("INFO: {0} genes kept out of {1} ({2:.1f}%)".format(kept_count, gene_count, kept_percent))
def main():
    """Write each mRNA isoform of a multi-isoform gene as its own gene model.

    Genes with a single mRNA (or none) are printed unchanged. For genes with
    several mRNAs, one new gene per mRNA is created with the mRNA's
    coordinates and an ID of ``<original gene id>_<n>``, n counting from 1.
    """
    parser = argparse.ArgumentParser(
        description='Splits all GFF3 mRNA isoforms into their own gene models')

    ## Get the variables
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Input GFF3 file')
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Output GFF3 file')
    args = parser.parse_args()

    with open(args.output_file, 'wt') as ofh:
        print("INFO: Parsing GFF3 features\n")
        (assemblies, ref_features) = biocodegff.get_gff3_features(args.input_file)

        print("INFO: Finding genes with isoforms and splitting them\n")
        ofh.write("##gff-version 3\n")

        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            for gene in assembly.genes():
                mRNAs = gene.mRNAs()

                # only changing the gene features with isoforms
                if len(mRNAs) <= 1:
                    gene.print_as(fh=ofh, source='IGS', format='gff3')
                    continue

                # Capture the ID once. The parent-ID assignment below
                # presumably targets this same gene object, which would
                # otherwise make later IDs compound (X_1_2, X_1_2_3, ...).
                base_gene_id = str(gene.id)

                for counter, mRNA in enumerate(mRNAs, start=1):
                    new_gene_id = "{0}_{1}".format(base_gene_id, counter)
                    mRNA_loc = mRNA.location()

                    print("Splitting " + base_gene_id)

                    # create a new gene model, correcting the gene coords to the mRNA coords
                    new_gene = biothings.Gene(id=new_gene_id)
                    new_gene.locate_on(target=assembly, fmin=mRNA_loc.fmin,
                                       fmax=mRNA_loc.fmax, strand=mRNA_loc.strand)

                    mRNA.parent.id = new_gene_id

                    # Now add the mRNA to the gene model
                    new_gene.add_mRNA(mRNA)

                    # print out the new gene model
                    new_gene.print_as(fh=ofh, source='IGS', format='gff3')
def main():
    """Give every mRNA of a multi-mRNA gene its own gene feature.

    The first mRNA stays on the original gene, whose coordinates and strand
    are reset to that mRNA's. Each further mRNA is detached and written under
    a new gene whose ID is the original ID plus the mRNA's ordinal (``_2``,
    ``_3``, ...). GFF3 goes to ``--output_file``, or to STDOUT if omitted.
    """
    parser = argparse.ArgumentParser(
        description='Checks for genes with multiple mRNA children and creates new genes for each.')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    ## output will either be a file or STDOUT
    writing_to_file = args.output_file is not None
    ofh = open(args.output_file, 'wt') if writing_to_file else sys.stdout

    try:
        ofh.write("##gff-version 3\n")

        for assembly_id in assemblies:
            current_assembly = assemblies[assembly_id]

            for gene in current_assembly.genes():
                # Snapshot the children: remove_mRNA() below may edit the
                # gene's own list, which would otherwise skip entries and
                # shrink the length checked after the loop.
                mRNAs = list(gene.mRNAs())

                for rnas_found, mRNA in enumerate(mRNAs, start=1):
                    if rnas_found == 1:
                        # the first mRNA remains with the original gene
                        continue

                    mRNA_loc = mRNA.location_on(current_assembly)
                    gene.remove_mRNA(mRNA)

                    # Diagnostics go to STDERR so they can never be mixed
                    # into GFF3 being written to STDOUT.
                    sys.stderr.write("INFO: splitting mRNA off gene {0}\n".format(gene.id))

                    new_gene = biothings.Gene(id="{0}_{1}".format(gene.id, rnas_found))
                    new_gene.locate_on(target=current_assembly, fmin=mRNA_loc.fmin,
                                       fmax=mRNA_loc.fmax, strand=mRNA_loc.strand)
                    new_gene.add_RNA(mRNA)
                    new_gene.print_as(fh=ofh, format='gff3')

                if len(mRNAs) > 1:
                    # shrink the original gene to its one remaining mRNA
                    gene_loc = gene.location_on(current_assembly)
                    mRNA_loc = mRNAs[0].location_on(current_assembly)
                    gene_loc.fmin = mRNA_loc.fmin
                    gene_loc.fmax = mRNA_loc.fmax
                    gene_loc.strand = mRNA_loc.strand

                gene.print_as(fh=ofh, format='gff3')
    finally:
        if writing_to_file:
            ofh.close()
def main():
    """Reset each gene's coordinates to the outer bounds of its mRNAs.

    For a gene with at least one mRNA, fmin becomes the smallest mRNA fmin
    and fmax the largest mRNA fmax; a DEBUG line is printed when that changes
    anything. Genes without mRNAs are written unchanged. All genes are
    printed to ``--output_gff3`` with source ``IGS``.
    """
    parser = argparse.ArgumentParser(
        description='Shortens gene feature coordinates to their longest child mRNA')

    parser.add_argument('-i', '--input_gff3', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-o', '--output_gff3', type=str, required=True,
                        help='Path to GFF3 output file to be created')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_gff3)

    with open(args.output_gff3, 'wt') as gff_out:
        gff_out.write("##gff-version 3\n")

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                gene_loc = gene.location()
                mRNAs = gene.mRNAs()

                # only genes with children have bounds to adopt; this also
                # keeps a childless gene's coordinates from being set to None
                if len(mRNAs) >= 1:
                    mRNA_locs = [mRNA.location() for mRNA in mRNAs]
                    min_coord = min(loc.fmin for loc in mRNA_locs)
                    max_coord = max(loc.fmax for loc in mRNA_locs)

                    if min_coord != gene_loc.fmin or max_coord != gene_loc.fmax:
                        print("DEBUG: Changed gene {0} from {1}-{2} to {3}-{4}".format(
                            gene.id, gene_loc.fmin, gene_loc.fmax, min_coord, max_coord))
                        gene_loc.fmin = min_coord
                        gene_loc.fmax = max_coord

                gene.print_as(fh=gff_out, source='IGS', format='gff3')
def main():
    """Count mRNAs whose CDS translation contains a non-terminal stop codon.

    Trailing ``*`` characters are stripped before counting, so only internal
    stops qualify. Offending translations are optionally written to
    ``--output_fasta``, and the first ``--print_n_with_stops`` of them are
    printed in full. Totals are printed at the end.
    """
    parser = argparse.ArgumentParser(
        description='Checks the CDS features against a genome sequence report non-terminal internal stops.')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-g', '--genome_fasta', type=str, required=False,
                        help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-p', '--print_n_with_stops', type=int, required=False, default=0,
                        help='Optional. Pass the number of sequences with internal stops you want printed (usually for debugging purposes)')
    parser.add_argument('-o', '--output_fasta', type=str, required=False,
                        help='Optional. Writes an output (translated) FASTA file for all those features which had internal stops')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_stops = 0

    # If this is set to the ID of any particular mRNA feature, the CDS and translation will be printed for it.
    debug_mRNA = None

    fasta_out_fh = None
    if args.output_fasta is not None:
        fasta_out_fh = open(args.output_fasta, 'wt')

    try:
        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            for gene in assembly.genes():
                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    total_mRNAs += 1
                    is_debug_target = debug_mRNA is not None and mRNA.id == debug_mRNA

                    if is_debug_target:
                        print("CDS:{0}".format(coding_seq))

                    # translate once and reuse for both the test and the output
                    translated_seq = biocodeutils.translate(coding_seq)

                    if translated_seq.rstrip('*').count('*') == 0:
                        continue

                    mRNAs_with_stops += 1

                    if fasta_out_fh is not None:
                        loc = mRNA.location_on(assembly)
                        fasta_out_fh.write(">{0} {1} {2}-{3} ({4})\n".format(
                            mRNA.id, assembly_id, loc.fmin + 1, loc.fmax, loc.strand))
                        fasta_out_fh.write("{0}\n".format(biocodeutils.wrapped_fasta(translated_seq)))

                    if is_debug_target:
                        print("TRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id))

                    if mRNAs_with_stops <= args.print_n_with_stops:
                        print("\nmRNA id: {0}".format(mRNA.id))
                        print("\tCDS:{0}".format(coding_seq))
                        print("\tTRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id))
    finally:
        if fasta_out_fh is not None:
            fasta_out_fh.close()

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with embedded stops: {0}".format(mRNAs_with_stops))
# main(): cross-compares gene models from three hard-coded GFF3 files
# (Genemark-ES, CEGMA and AAT), all loaded onto one shared set of assemblies.
#
# It writes gene IDs, one per line, to five files in the working directory:
#   gmes_cegma.shared.ids        Genemark-ES genes with the same coordinates and
#                                exon structure as a CEGMA gene
#   gmes_aat.shared.ids          Genemark-ES genes sharing exon structure with an
#                                AAT gene (stop_tolerant=True)
#   gmes_aat_cegma.shared.ids    Genemark-ES genes that match a CEGMA gene and are
#                                also in the Genemark-ES/AAT list
#   training_gene.ids            the same genes as the previous file
#   cegma_aat_nogmes.shared.ids  CEGMA genes with no Genemark-ES match that share
#                                exon structure with an AAT gene
# A count is printed after each comparison. None of the file handles opened here
# is explicitly closed.
#
# NOTE(review): this function's original line breaks and indentation are lost,
# so the nesting of the `break` statements and of the innermost `if` in the
# three-way section cannot be determined from this text. Restore the formatting
# from the upstream script before changing the logic.
def main(): gm_es_file = 'genemark_hmm.gff3' cegma_file = 'output.cegma.gff3' #aat_file = 'bail_training_genes.aat.1500maxintron.80percid.gff3' aat_file = 'aat.bail_hominis_filtered_training.gff3' #aat_file = 'aat.merged.gff3' print("INFO: parsing Genemark-ES data") (assemblies, gm_es_features) = biocodegff.get_gff3_features( gm_es_file ) gm_es_genes = get_genes_from_dict(gm_es_features) print("\tINFO: Got {0} Genemark-ES genes".format(len(gm_es_genes))) print("INFO: parsing CEGMA data") (assemblies, cegma_features) = biocodegff.get_gff3_features( cegma_file, assemblies=assemblies ) cegma_genes = get_genes_from_dict(cegma_features) print("\tINFO: Got {0} CEGMA genes".format(len(cegma_genes))) print("INFO: parsing AAT results") (assemblies, aat_muris_features) = biocodegff.get_gff3_features( aat_file, assemblies=assemblies) aat_genes = get_genes_from_dict(aat_muris_features) print("\tINFO: Got {0} AAT 'genes'".format(len(aat_genes))) genemark_cegma_shared_genes = list() gmes_cegma_fh = open('gmes_cegma.shared.ids', 'wt') for gm_es_gene in gm_es_genes: for cegma_gene in cegma_genes: if gm_es_gene.has_same_coordinates_as( thing=cegma_gene ): if gm_es_gene.shares_exon_structure_with( thing=cegma_gene ) == True: genemark_cegma_shared_genes.append(gm_es_gene) gmes_cegma_fh.write("{0}\n".format(gm_es_gene.id)) break print("\n{0} genes were shared perfectly between Genemark-ES and CEGMA".format(len(genemark_cegma_shared_genes)) ) ############################################################################# genemark_aat_shared_genes = list() gmes_aat_fh = open('gmes_aat.shared.ids', 'wt') for gm_es_gene in gm_es_genes: for aat_gene in aat_genes: if gm_es_gene.shares_exon_structure_with( thing=aat_gene, stop_tolerant=True ) == True: genemark_aat_shared_genes.append(gm_es_gene) gmes_aat_fh.write("{0}\n".format(gm_es_gene.id)) break print("{0} Genemark-ES genes had an exact AAT match".format(len(genemark_aat_shared_genes)) ) 
############################################################################## cegma_matching_gm_es = list() genemark_aat_cegma_shared_genes = list() gmes_aat_cegma_fh = open('gmes_aat_cegma.shared.ids', 'wt') for cegma_gene in cegma_genes: match_found = False for gm_es_gene in gm_es_genes: if cegma_gene.has_same_coordinates_as( thing=gm_es_gene ): if cegma_gene.shares_exon_structure_with( thing=gm_es_gene ) == True: match_found = True if gm_es_gene in genemark_aat_shared_genes and gm_es_gene not in genemark_aat_cegma_shared_genes: genemark_aat_cegma_shared_genes.append(gm_es_gene) gmes_aat_cegma_fh.write("{0}\n".format(gm_es_gene.id)) break if match_found == True: cegma_matching_gm_es.append(cegma_gene) print("{0} genes with GeneMark-ES, CEGMA and AAT agreement".format(len(genemark_aat_cegma_shared_genes)) ) training_fh = open('training_gene.ids', 'wt') for gene in genemark_aat_cegma_shared_genes: training_fh.write("{0}\n".format(gene.id) ) ############################################################################## cegma_with_aat_not_gm_es = list() cegma_aat_nogmes_fh = open('cegma_aat_nogmes.shared.ids', 'wt') for cegma_gene in cegma_genes: if cegma_gene in cegma_matching_gm_es: continue for aat_gene in aat_genes: if cegma_gene.shares_exon_structure_with( thing=aat_gene, stop_tolerant=True ) == True: cegma_with_aat_not_gm_es.append(cegma_gene) cegma_aat_nogmes_fh.write("{0}\n".format(cegma_gene.id)) break print("{0} CEGMA genes had no GeneMark-ES match but did have an AAT one".format(len(cegma_with_aat_not_gm_es)) )
def main():
    """Export one FASTA record per gene with intron bases lower-cased.

    Options: -i input GFF3 (required), -o output FASTA (default STDOUT),
    -f FASTA for the assemblies when not embedded in the GFF3, and -t
    (``mRNA`` or ``CDS``) to optionally trim the sequence to the CDS range.

    Genes with more than one mRNA are skipped.  Diagnostics about skipped
    genes go to STDERR so they cannot end up inside FASTA written to STDOUT.
    """
    parser = argparse.ArgumentParser(
        description='Exports gene sequences in FASTA format with intron residues lower-cased')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output file to be created')
    parser.add_argument('-f', '--fasta', type=str, required=False,
                        help='Required if you don\'t have GFF3 with embedded FASTA')
    parser.add_argument('-t', '--type', type=str, required=False, default='mRNA',
                        choices=['mRNA', 'CDS'], help='Feature type to export (mRNA or CDS)')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # set this to a gene ID to get debug print statements for that gene only
    debugging_gene = None

    if args.fasta is not None:
        seqs = biocodeutils.fasta_dict_from_file(args.fasta)

        for seq_id in seqs:
            if seq_id in assemblies:
                assemblies[seq_id].residues = seqs[seq_id]['s']
                assemblies[seq_id].length = len(assemblies[seq_id].residues)

    ## output will either be a file or STDOUT
    writing_to_file = args.output_file is not None
    ofh = open(args.output_file, 'wt') if writing_to_file else sys.stdout

    try:
        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            for gene in assembly.genes():
                debug_mode = debugging_gene is not None
                if debug_mode and gene.id != debugging_gene:
                    continue

                gene_label = gene.id if gene.locus_tag is None else gene.locus_tag

                gene_seq = gene.get_residues().upper()
                gene_loc = gene.location_on(assembly)

                ## we have to do this here because of the coordinates: intron
                #  and CDS positions below are offsets from the gene's fmin
                if gene_loc.strand == -1:
                    gene_seq = "".join(reversed(gene_seq))

                if debug_mode:
                    print("INFO: Processing gene with length {0} at {1}-{2}".format(
                        len(gene_seq), gene_loc.fmin, gene_loc.fmax))

                if len(gene.mRNAs()) > 1:
                    sys.stderr.write(
                        "ERROR: skipping gene {0} because it appears to have multiple isoforms (not currently supported)\n".format(gene.id))
                    continue

                for mRNA in gene.mRNAs():
                    introns = mRNA.introns(on=assembly)

                    # this helps us get where the intron is on the gene
                    offset = gene_loc.fmin

                    for intron in introns:
                        intron_loc = intron.location_on(assembly)
                        rel_start = intron_loc.fmin - offset
                        rel_end = intron_loc.fmax - offset

                        lower_mid = gene_seq[rel_start:rel_end].lower()
                        gene_seq = gene_seq[0:rel_start] + lower_mid + gene_seq[rel_end:]

                        if debug_mode:
                            print("INFO:\tfound intron at {0}-{1}".format(intron_loc.fmin, intron_loc.fmax))
                            print("INFO:\tlower-casing offset adjusted coordinates: {0}-{1}".format(rel_start, rel_end))
                            print("INFO:\tgenerating lower case seq of length: {0}\n".format(len(lower_mid)))

                    if debug_mode:
                        print("INFO: seq length before CDS processing is: {0}".format(len(gene_seq)))

                    ## do we need to trim down to the CDS range?
                    if args.type == 'CDS':
                        CDSs = sorted(mRNA.CDSs())
                        CDS_min = CDSs[0].location_on(assembly).fmin
                        CDS_max = CDSs[-1].location_on(assembly).fmax

                        if debug_mode:
                            print("INFO: Calculated CDS range, with introns, should be: {0}-{1}={2}".format(
                                CDS_max, CDS_min, CDS_max - CDS_min))

                        if gene_loc.fmin != CDS_min or gene_loc.fmax != CDS_max:
                            # number of bases to drop from each end of the gene sequence
                            fmin_chomp = CDS_min - offset
                            fmax_chomp = gene_loc.fmax - CDS_max

                            if debug_mode:
                                print("gene:{0} coords:{1}-{2} ({3}), CDS coords: {4}-{5}".format(
                                    gene.id, gene_loc.fmin, gene_loc.fmax, gene_loc.strand,
                                    CDS_min, CDS_max))
                                print("\tfmin_chomp:{0}, fmax_chomp:{1}".format(fmin_chomp, fmax_chomp))
                                print("\tpulling range: gene_seq[{0} : {1}]".format(
                                    fmin_chomp, len(gene_seq) - fmax_chomp))

                            gene_seq = gene_seq[fmin_chomp : len(gene_seq) - fmax_chomp]

                            if debug_mode:
                                print("\tGene {0} CDS seq: {1}".format(gene.id, gene_seq))

                ## make sure to switch it back
                # NOTE(review): written once per gene, after the mRNA loop;
                # nesting inferred from the collapsed source - confirm
                if gene_loc.strand == -1:
                    gene_seq = "".join(reversed(gene_seq))

                ofh.write(">{0}\n{1}\n".format(gene_label, biocodeutils.wrapped_fasta(gene_seq)))
    finally:
        # never close STDOUT, only a file we opened ourselves
        if writing_to_file:
            ofh.close()
def main():
    """Annotate reference genes with the query genes that overlap them.

    For every reference gene which has at least one polypeptide, each
    overlapping query gene on the same assembly is recorded as a dbxref
    (``overlaps_old_locusTagID:<query gene id>``) on the annotation of the
    gene's first polypeptide.  The annotated reference set is then written
    as GFF3 to --output_file.
    """
    parser = argparse.ArgumentParser(
        description='Script for reporting of possible polycistronic genes transcripts based on a reference annotation and RNA-seq transcript assemblies')

    ## output file to be written
    parser.add_argument('-r', '--reference_file', type=str, required=True,
                        help='GFF3 file of a reference annotation')
    parser.add_argument('-q', '--query_file', type=str, required=True,
                        help='GFF3 file with alternative annotation (such as an RNA-seq assemby)')
    parser.add_argument('-o', '--output_file', type=str, required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    (ref_assemblies, ref_feats) = biocodegff.get_gff3_features(args.reference_file)
    (qry_assemblies, qry_genes) = biocodegff.get_gff3_features(args.query_file)

    for assembly_id in ref_assemblies:
        # we expect to find this assembly ID in the qry set too
        if assembly_id not in qry_assemblies:
            print("WARN: expected to find assembly_id {0} in both reference and query sets".format(assembly_id))
            continue

        for ref_gene in ref_assemblies[assembly_id].genes():
            overlaps = list()
            polypeptides = ref_gene.polypeptides()

            if len(polypeptides) == 0:
                print("WARN: skipped gene {0} because it has no polypeptides".format(ref_gene.id))
                continue

            # dbxrefs are attached to the first polypeptide's annotation
            ref_annot = polypeptides[0].annotation

            for qry_gene in qry_assemblies[assembly_id].genes():
                overlap = ref_gene.overlaps_with(qry_gene)

                if overlap:
                    overlaps.append(overlap)
                    # add a dbxref to the gene
                    ref_annot.add_dbxref("overlaps_old_locusTagID:{0}".format(qry_gene.id))

            if len(overlaps) > 0:
                print("INFO: ref_gene {0} had {1} overlaps".format(ref_gene.id, len(overlaps)))

    # the with-block guarantees the GFF3 is flushed and the handle released
    with open(args.output_file, 'w') as ofh:
        biocodegff.print_gff3_from_assemblies(assemblies=ref_assemblies, ofh=ofh)
def main():
    """Report, per reference molecule, how many bases are covered by evidence.

    Positional arguments are one or more evidence files (only ``pileup`` and
    ``sam`` are actually parsed; other allowed extensions are skipped with a
    notice).  ``-r`` is the reference GFF3 given as ``FILE:TYPE``, ``-f`` the
    reference FASTA and ``-o`` an optional output path (default STDOUT).

    Output is tab-delimited: molecule id, molecule length, covered bases.

    Raises:
        Exception: for an unsupported evidence extension, a --reference value
            without ``:``, or a reference file that is not ``.gff3``.
    """
    parser = argparse.ArgumentParser(
        description='Provides coverage information for features in a GFF3 file')

    ## output file to be written
    parser.add_argument('evidence_files', metavar='N', type=str, nargs='+',
                        help='Path to one or more evidence files, separated by spaces')
    parser.add_argument('-r', '--reference', type=str, required=True,
                        help='Input path to the reference GFF3 file. So we know what feature type to report on, format should be like FILE:TYPE')
    parser.add_argument('-f', '--fasta', type=str, required=True,
                        help='Input path to the reference FASTA file.')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Optional path to an output file to be created, else prints on STDOUT')
    args = parser.parse_args()

    ## parse the fasta
    fasta = biocodeutils.fasta_dict_from_file(args.fasta)

    ## open the output file
    writing_to_file = args.output_file is not None
    if writing_to_file:
        fout = open(args.output_file, "w")
    else:
        fout = codecs.getwriter('utf8')(sys.stdout.buffer)

    try:
        ####################################################
        ## Sanity checks
        allowed_extensions = ['bed', 'gff3', 'pileup', 'sam']
        for ev_file in args.evidence_files:
            if not ev_file.endswith(tuple(allowed_extensions)):
                raise Exception(
                    "ERROR: Evidence file passed with unsupported file extension: {0}. Supported extensions are {1}".format(
                        ev_file, allowed_extensions))

        ## The reference should be defined as $path:$feattype
        if ':' not in args.reference:
            raise Exception("ERROR: --reference must be like /path/to/some.gff3:mRNA")

        ref_file_parts = args.reference.split(':')
        # diagnostics go to STDERR so they can't mix with the report on STDOUT
        sys.stderr.write("DEBUG: part count: {0}\n".format(len(ref_file_parts)))

        if ref_file_parts[0].endswith('.gff3'):
            (ref_assemblies, ref_features) = biocodegff.get_gff3_features(ref_file_parts[0])
        else:
            raise Exception(
                "ERROR: Expected reference file (-r) to have a gff3 extension, got {0}".format(ref_file_parts[0]))

        ####################################################
        ## Initialize the coverage arrays
        fasta_cov = dict()
        for seq_id in fasta:
            # create a list of 0s the length of the molecule
            fasta_cov[seq_id] = [0] * len(fasta[seq_id]['s'])

        ####################################################
        ## Now parse the evidence files
        for ev_file in args.evidence_files:
            if ev_file.endswith('pileup'):
                parse_pileup(fasta_cov, ev_file)
            elif ev_file.endswith('sam'):
                parse_sam(fasta_cov, ev_file)
            else:
                print("INFO: ignoring evidence file {0} because code to handle its file type isn't currently implemented".format(ev_file))

        for seq_id in fasta_cov:
            # Iterate the per-base depth values directly.  The previous code
            # used each depth value as an index back into the list, which
            # counted the wrong positions (and could raise IndexError).
            covered_bases = sum(1 for depth in fasta_cov[seq_id] if depth > 0)

            fout.write("{0}\t{1}\t{2}\n".format(seq_id, len(fasta[seq_id]['s']), covered_bases))
    finally:
        if writing_to_file:
            fout.close()
        else:
            fout.flush()
def main():
    """Convert a GFF3 annotation into one or more GenBank flat files.

    Output goes to STDOUT by default, to --output_file if given, or to one
    ``<locus_id>.gbk`` file per assembly inside --output_dir.  When both -o
    and -od are passed, --output_dir wins and a warning is written to STDERR.

    The header is rendered from the ``genbank_flat_file_header.template``
    template; the source feature, genes and (optionally) the sequence are
    written afterwards and every record is terminated with ``//``.
    """
    parser = argparse.ArgumentParser(description='Converts GFF3 into a GenBank flat file')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input GFF3 file to be read')
    # help text corrected: the code gives --output_dir precedence, not -o
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to a Genbank flat file to be created. Ignored if --output_dir is also specified.')
    parser.add_argument('-od', '--output_dir', type=str, required=False,
                        help='Path to an output directory. If this option is specified then each input assembly will be written to a separate GenBank output file, named with the assembly_id.')
    parser.add_argument('-g', '--genome_fasta', type=str, required=False,
                        help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-mt', '--molecule_type', type=str, required=False, default='DNA',
                        help='Molecule type')
    parser.add_argument('-gbd', '--genbank_division', type=str, required=False, default='.',
                        help='GenBank Division (3-letter abbreviation)')
    parser.add_argument('-md', '--modification_date', type=str, required=False, default='DD-MMM-YYYY',
                        help='The modification date for header in format like 21-JUN-1999')
    parser.add_argument('-org', '--organism', type=str, required=False, default='.',
                        help='Full organism name (including strain)')
    parser.add_argument('-str', '--strain', type=str, required=False,
                        help="Only the strain designation, which is written to the FEATURES.source element")
    parser.add_argument('-d', '--definition', type=str, required=False, default='.',
                        help='Brief description of sequence; includes information such as source organism, gene name/protein name, or some description of the sequence\'s function.')
    parser.add_argument('-s', '--source', type=str, required=False, default='.',
                        help='Free-format information including an abbreviated form of the organism name, sometimes followed by a molecule type.')
    parser.add_argument('-t', '--taxon_id', type=int, required=False,
                        help='NCBI taxon ID, if known')
    parser.add_argument('-l', '--lineage', type=str, required=False, default='Unknown',
                        help='Semicolon-delimited lineage of the organism e.g., "Eukaryota; Alveolata; Apicomplexa; Aconoidasida; Piroplasmida; Theileriidae; Theileria"')
    parser.add_argument('-seq', '--include_sequence', action='store_true',
                        help='Include sequence (if present) in the output GenBank flat file(s).')
    parser.add_argument('-p', '--locus_id_prefix', required=False, default='',
                        help='Prefix to add to the GenBank LOCUS id in the output GenBank flat file(s).')
    args = parser.parse_args()

    per_assembly_files = args.output_dir is not None

    # check that output directory exists
    if per_assembly_files and not os.path.isdir(args.output_dir):
        sys.stderr.write("FATAL: the specified output directory (" + args.output_dir + ") does not exist\n")
        sys.exit(1)

    # line-wrap lineage to stay below 79 character GenBank flat file width
    lineage = biocodegenbank.line_wrap_lineage_string(args.lineage)

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    ofh = sys.stdout
    opened_single_file = False
    if args.output_file is not None:
        if per_assembly_files:
            sys.stderr.write("WARN: both -o/--output_file and -od/--output_dir were passed so the former will be ignored\n")
        else:
            ofh = open(args.output_file, 'wt')
            opened_single_file = True

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        process_assembly_fasta(assemblies, args.genome_fasta)

    # GenBank feature table layout: feature keys start in column 6 and
    # locations/qualifiers in column 22.  Built explicitly so the padding
    # can't be lost to whitespace normalization of the source.
    key_indent = " " * 5
    qual_indent = " " * 21

    try:
        for assembly_id in assemblies:
            locus_id = args.locus_id_prefix + assembly_id

            if per_assembly_files:
                ofh = open(os.path.join(args.output_dir, locus_id + ".gbk"), 'wt')

            try:
                assembly = assemblies[assembly_id]

                context = {
                    'locus': locus_id,
                    'molecule_size': assembly.length,
                    'molecule_type': args.molecule_type,
                    'division': args.genbank_division,
                    'modification_date': args.modification_date,
                    'accession': '.',
                    'version': '.',
                    'source': args.source,
                    'definition': args.definition,
                    'organism': args.organism,
                    'lineage': lineage,
                }
                header = TEMPLATE_ENVIRONMENT.get_template('genbank_flat_file_header.template').render(context)
                ofh.write(header)

                ofh.write("\n{0:<21}Location/Qualifiers\n".format("FEATURES"))
                ofh.write("{0}{1:<16}1..{2}\n".format(key_indent, "source", assembly.length))
                ofh.write("{0}/organism=\"{1}\"\n".format(qual_indent, args.organism))
                ofh.write("{0}/mol_type=\"genomic DNA\"\n".format(qual_indent))

                if args.strain is not None:
                    ofh.write("{0}/strain=\"{1}\"\n".format(qual_indent, args.strain))

                if args.taxon_id is not None:
                    ofh.write("{0}/db_xref=\"taxon:{1}\"\n".format(qual_indent, args.taxon_id))

                for gene in assembly.genes():
                    biocodegenbank.print_biogene(gene=gene, fh=ofh, on=assembly)

                if args.include_sequence:
                    ofh.write("ORIGIN\n")
                    biocodegenbank.print_sequence(seq=assembly.residues, fh=ofh)

                # record terminator
                ofh.write("//\n")
            finally:
                # there may be multiple output files
                if per_assembly_files:
                    ofh.close()
    finally:
        # there is only one output file; leave STDOUT open for the interpreter
        if opened_single_file:
            ofh.close()
def main():
    """Drop gene models whose coding sequence is mostly masked (``N``).

    A gene is removed when any of its mRNAs has a CDS in which the percentage
    of ``N`` residues is at least --percent_repeat_coverage_cutoff.  Kept
    genes are written as GFF3 to --output_gff3 (STDOUT when omitted) and,
    if --removed_gff3 is passed, removed genes are written there.

    mRNAs with no CDS residues cannot be assessed and never cause removal.
    """
    import sys

    parser = argparse.ArgumentParser(
        description='Removes gene models whose sequence has been masked.')

    ## output file to be written
    parser.add_argument('-i', '--input_gff3', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-m', '--masked_fasta', type=str, required=True,
                        help='FASTA with sequence masked with N characters')
    parser.add_argument('-p', '--percent_repeat_coverage_cutoff', type=int, required=True,
                        help='Genes with an mRNA covered by this percentage of repeats will be excluded')
    parser.add_argument('-o', '--output_gff3', type=str, required=False,
                        help='Path to GFF3 output file to be created')
    parser.add_argument('-r', '--removed_gff3', type=str, required=False,
                        help='If passed, writes the deleted genes to this file')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_gff3)
    biocodeutils.add_assembly_fasta(assemblies, args.masked_fasta)

    # --output_gff3 is optional, so fall back to STDOUT instead of crashing
    # on open(None)
    writing_to_file = args.output_gff3 is not None
    gff_out = open(args.output_gff3, 'wt') if writing_to_file else sys.stdout
    rem_out = None

    gene_count = 0
    kept_count = 0

    try:
        gff_out.write("##gff-version 3\n")

        if args.removed_gff3 is not None:
            rem_out = open(args.removed_gff3, 'wt')
            rem_out.write("##gff-version 3\n")

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                keep = True
                gene_count += 1

                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()

                    # avoid dividing by zero for mRNAs without CDS residues
                    if not coding_seq:
                        continue

                    n_count = coding_seq.count('N')
                    perc_repeat = (n_count / len(coding_seq)) * 100

                    if perc_repeat >= args.percent_repeat_coverage_cutoff:
                        keep = False

                if keep:
                    kept_count += 1
                    gene.print_as(fh=gff_out, source='IGS', format='gff3')
                elif rem_out is not None:
                    gene.print_as(fh=rem_out, source='IGS', format='gff3')
    finally:
        if rem_out is not None:
            rem_out.close()
        if writing_to_file:
            gff_out.close()

    kept_percent = (kept_count / gene_count) * 100 if gene_count else 0.0
    # keep the summary out of the GFF3 stream when that stream is STDOUT
    summary_fh = sys.stdout if writing_to_file else sys.stderr
    print("INFO: {0} genes kept out of {1} ({2:.1f}%)".format(kept_count, gene_count, kept_percent),
          file=summary_fh)
def main():
    """Check/correct the phase of every CDS and re-export the annotation.

    Each CDS of each mRNA is passed to ``check_and_update_phase``; every gene
    is then written as GFF3 to --output_file with column 2 set to --source.
    Assemblies with a positive length are appended in a ``##FASTA`` section.
    """
    parser = argparse.ArgumentParser(
        description='Checks the CDS features against a genome sequence to report/correct phase columns.')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-o', '--output_file', type=str, required=True,
                        help='Path to an output file to be created')
    parser.add_argument('-g', '--genome_fasta', type=str, required=False,
                        help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-s', '--source', type=str, required=False, default='.',
                        help='Optional. Sets the value for column 2 in all rows. Default = .')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # the with-block guarantees the output is flushed and closed on any exit path
    with open(args.output_file, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        # deal with the FASTA file if the user passed one
        if args.genome_fasta is not None:
            process_assembly_fasta(assemblies, args.genome_fasta)

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    for CDS in mRNA.CDSs():
                        check_and_update_phase(CDS)

                gene.print_as(fh=fout, source=args.source, format='gff3')

        fasta_header_written = False

        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            # an undefined (None) length means there is no sequence to export;
            # comparing None > 0 would raise TypeError
            if assembly.length is None or not assembly.length > 0:
                continue

            # the directive is only written once, and only if there is sequence
            if not fasta_header_written:
                fout.write("##FASTA\n")
                fasta_header_written = True

            fout.write(">{0}\n".format(assembly.id))
            fout.write("{0}\n".format(biocodeutils.wrapped_fasta(assembly.residues)))
def main():
    """Export the CDS or translated protein of every mRNA as FASTA.

    The record id is the mRNA's locus_tag when set, otherwise its id.  The
    product name of the first polypeptide annotation that has one is appended
    to the header line.  With --check_ends, non-canonical start/stop codons
    are reported on STDERR.  Output goes to --output_file or STDOUT.
    """
    parser = argparse.ArgumentParser(
        description='Extracts the protein or CDS sequences from a GFF3 file')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input GFF3 file to be read')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output FASTA file to be created')
    parser.add_argument('-t', '--type', type=str, required=False, default='protein',
                        choices=['protein', 'cds'], help='Type of features to export')
    parser.add_argument('-f', '--fasta', type=str, required=False,
                        help='If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option')
    parser.add_argument('--check_ends', dest='check_ends', action='store_true',
                        help='Warn on STDERR about non-canonical start and stop codons')
    parser.set_defaults(check_ends=False)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    writing_to_file = args.output_file is not None
    fout = open(args.output_file, 'wt') if writing_to_file else sys.stdout

    try:
        (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

        # only doing the standard codon table for now
        start_codons = ['ATG', 'GTG', 'TTG']
        stop_codons = ['TAG', 'TAA', 'TGA']

        ## add sequence residues from external FASTA file if the user passed one
        if args.fasta is not None:
            biocodeutils.add_assembly_fasta(assemblies, args.fasta)

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    ## initial values of id and header to export (can be overridden by available annotation)
                    export_id = mRNA.id if mRNA.locus_tag is None else mRNA.locus_tag
                    export_header = None

                    ## Add the gene product name if there is one
                    for polypeptide in mRNA.polypeptides():
                        annotation = polypeptide.annotation
                        if annotation is not None and annotation.product_name is not None:
                            export_header = annotation.product_name
                            break

                    fout.write(">{0}".format(export_id))
                    if export_header is not None:
                        fout.write(" {0}\n".format(export_header))
                    else:
                        fout.write("\n")

                    coding_seq = mRNA.get_CDS_residues()

                    if args.check_ends:
                        # check the starting codon
                        start_codon = coding_seq[0:3].upper()
                        if start_codon not in start_codons:
                            sys.stderr.write("WARN: Non-canonical start codon ({0}) in mRNA {1}\n".format(start_codon, mRNA.id))

                        stop_codon = coding_seq[-3:].upper()
                        if stop_codon not in stop_codons:
                            sys.stderr.write("WARN: Non-canonical stop codon ({0}) in mRNA {1}\n".format(stop_codon, mRNA.id))

                    if args.type == 'cds':
                        export_seq = coding_seq
                    else:
                        export_seq = biocodeutils.translate(coding_seq)

                    fout.write("{0}\n".format(biocodeutils.wrapped_fasta(export_seq)))
    finally:
        # only close a file we opened; STDOUT is left to the interpreter
        if writing_to_file:
            fout.close()
def main():
    '''
    This script reports statistics on the areas of a genome where features aren't - introns
    and intergenic space.  Pass a valid GFF3 file (along with FASTA data) and get a report
    like this:

        Molecule count: 9
        Gene count: 4171

        Intergenic space count: 4061
        Average intergenic space distance: 361.7 bp
        Median intergenic space distance: 245 bp
        Minimum intergenic space distance: 0 bp
        Maximum intergenic space distance: 6272 bp

        Intron count: 10533
        Intron space count: 989024
        Average intron size: 93.9 bp
        Median intron size: 63 bp
        Minimum intron size: 2 bp
        Maximum intron size: 1676 bp

    Optionally, you can pass the path to a PNG file to be created using the --histogram
    parameter, which will generate a size distribution histogram with two overlaying plots -
    one representing the distribution of intergenic region sizes and the other the intron
    lengths.  Because these can often have long tails, you can limit both the Y- and X-axes
    values with the --ylimit and --xlimit options, respectively.

    FASTA:
    If your FASTA isn't embedded at the end of your GFF3 file after a ##FASTA directive
    you'll need to specify the --fasta option in this script and pass it as a separate file.

    Definitions:
    Intergenic space was a little ambiguous to me as I started writing this.  Does one count
    the space from the beginning of the contig until the first gene, or only between them?
    What about short contigs which have no annotated genes at all?  From the Sequence
    Ontology:

        SO:0000605: A region containing or overlapping no genes that is bounded on either
        side by a gene, or bounded by a gene and the end of the chromosome.

    To my reading, this includes contig ends but not gene-less contigs.  To that end, I
    include the former in intergenic space reporting but include the latter as a separate
    statistic.

    Implementation notes:
    - A molecule counts as "empty" when it has no genes of its own.
    - Genes are processed in sorted order; a gene that lies entirely within the span already
      covered does not move the point from which the next intergenic distance is measured.
    - If no introns are found the intron statistics are reported as absent rather than
      failing with a division by zero.

    Author: Joshua Orvis (jorvis AT gmail)
    '''
    parser = argparse.ArgumentParser(
        description='Reports statistics on intron and intergenic space sizes for a GFF3 annotation.')

    ## output file to be written
    parser.add_argument('-i', '--input_gff3', type=str, required=True,
                        help='GFF3 file of a reference annotation')
    parser.add_argument('-g', '--histogram', type=str, required=False,
                        help='Optional path to a histogram of intron/intergenic space size distribution to be created (PNG)')
    parser.add_argument('-x', '--xlimit', type=int, required=False,
                        help='Use this if you want to limit the X-axis of the histogram (feature length)')
    parser.add_argument('-y', '--ylimit', type=int, required=False,
                        help='Use this if you want to limit the Y-axis of the histogram (feature count)')
    parser.add_argument('-f', '--fasta', type=str, required=False,
                        help='Required if you don\'t have GFF3 with embedded FASTA')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_gff3)

    if args.fasta is not None:
        seqs = biocodeutils.fasta_dict_from_file(args.fasta)

        for seq_id in seqs:
            if seq_id in assemblies:
                assemblies[seq_id].residues = seqs[seq_id]['s']
                assemblies[seq_id].length = len(assemblies[seq_id].residues)

    ## things to keep stats on and report
    total_molecule_count = len(assemblies)
    total_gene_count = 0

    ## this number is NOT just the total genes N - 1, since there can be multiple molecules
    #   genes can overlap, etc.
    total_intergenic_space_count = 0

    total_intergenic_space_residues = 0
    intergenic_distances = list()

    total_contig_residues = 0
    empty_contig_residues = 0

    total_intron_count = 0
    total_intron_residues = 0
    intron_sizes = list()

    ############################
    ## Calculation section
    ############################

    for asm_id in assemblies:
        assembly = assemblies[asm_id]
        genes = sorted(assembly.genes())
        total_gene_count += len(genes)

        # we should have a length here
        if assembly.length is None or assembly.length == 0:
            raise Exception("ERROR: Detected assembly with undefined or 0 length: {0}".format(assembly.id))

        # Test THIS molecule's genes.  The cumulative total_gene_count is
        # non-zero as soon as any earlier molecule had a gene, which made
        # later gene-less molecules go undetected.
        if len(genes) == 0:
            empty_contig_residues += assembly.length
            continue

        total_contig_residues += assembly.length

        # location of the gene reaching furthest along the molecule so far
        previous_gene_loc = None

        for gene in genes:
            gene_loc = gene.location_on(assembly)

            if previous_gene_loc is None:
                # first gene: track the number of bases from the start of the molecule
                intergenic_distance = gene_loc.fmin
                total_intergenic_space_count += 1
                total_intergenic_space_residues += intergenic_distance
                intergenic_distances.append(intergenic_distance)
                previous_gene_loc = gene_loc

            elif gene_loc.fmin >= previous_gene_loc.fmax:
                # no overlap with what came before, so there is a gap to record
                intergenic_distance = gene_loc.fmin - previous_gene_loc.fmax
                total_intergenic_space_count += 1
                total_intergenic_space_residues += intergenic_distance
                intergenic_distances.append(intergenic_distance)
                previous_gene_loc = gene_loc

            elif gene_loc.fmax > previous_gene_loc.fmax:
                # overlaps the previous gene but extends beyond it
                previous_gene_loc = gene_loc

            # otherwise this gene is contained in the covered span: keep
            # previous_gene_loc so the next gap isn't overestimated

            for mRNA in gene.mRNAs():
                introns = mRNA.introns(on=assembly)

                for intron in sorted(introns):
                    total_intron_count += 1
                    intron_loc = intron.location_on(assembly)
                    intron_size = intron_loc.fmax - intron_loc.fmin

                    if intron_size < 0:
                        print("\tWARN: Intron size ({1}) < 0 reported in gene {0}".format(gene.id, intron_size))

                    intron_sizes.append(intron_size)
                    total_intron_residues += intron_size

        # space between the end of the last gene and the end of the molecule
        intergenic_distance = assembly.length - previous_gene_loc.fmax
        total_intergenic_space_count += 1
        total_intergenic_space_residues += intergenic_distance
        intergenic_distances.append(intergenic_distance)

    intergenic_distances = sorted(intergenic_distances)
    intron_sizes = sorted(intron_sizes)

    if total_intergenic_space_count == 0:
        avg_intergenic_space_dist = None
        median_int_space_dist = None
    else:
        avg_intergenic_space_dist = total_intergenic_space_residues / total_intergenic_space_count
        median_int_space_dist = intergenic_distances[len(intergenic_distances) // 2]

    if total_intron_count == 0:
        avg_intron_size = None
        median_intron_size = None
    else:
        avg_intron_size = total_intron_residues / total_intron_count
        median_intron_size = intron_sizes[len(intron_sizes) // 2]

    ############################
    ## Reporting section
    ############################

    print("\nMolecule count: {0}".format(total_molecule_count))
    print("Gene count: {0}".format(total_gene_count))

    print("\nTotal molecule bases: {0} bp".format(total_contig_residues))
    print("Empty molecule bases: {0} bp".format(empty_contig_residues))

    if total_intergenic_space_count > 0:
        print("Intergenic space count: {0}".format(total_intergenic_space_count))
        print("Average intergenic space distance: {0:.1f} bp".format(avg_intergenic_space_dist))
        print("Median intergenic space distance: {0} bp".format(median_int_space_dist))
        print("Minimum intergenic space distance: {0} bp".format(intergenic_distances[0]))
        print("Maximum intergenic space distance: {0} bp\n".format(intergenic_distances[-1]))
    else:
        print("There were no intergenic spaces found. This might mean there were no molecules with at least 2 genes.")

    print("Intron count: {0}".format(total_intron_count))
    print("Intron space count: {0}".format(total_intron_residues))

    if total_intron_count > 0:
        print("Average intron size: {0:.1f} bp".format(avg_intron_size))
        print("Median intron size: {0} bp".format(median_intron_size))
        print("Minimum intron size: {0} bp".format(intron_sizes[0]))
        print("Maximum intron size: {0} bp\n".format(intron_sizes[-1]))
    else:
        print("There were no introns found.\n")

    ############################
    ## Graphics section (optional)
    ############################
    if args.histogram is not None:
        # imported lazily so matplotlib is only needed when a plot is requested
        import matplotlib.pyplot as plt

        plt.xlabel('length (bp)')
        plt.ylabel('count')
        plt.title('Distribution of intron size and intergenic distances')

        # only plot series which actually have data
        if intergenic_distances:
            plt.hist(intergenic_distances, bins=50, histtype='stepfilled', color='b',
                     label='Intergenic distances')
        if intron_sizes:
            plt.hist(intron_sizes, bins=50, histtype='stepfilled', color='r', alpha=0.5,
                     label='Intron sizes')

        if args.xlimit is not None:
            plt.xlim([0, args.xlimit])

        if args.ylimit is not None:
            plt.ylim([0, args.ylimit])

        plt.legend(loc='best')
        plt.savefig(args.histogram)
def main():
    """Select organism1 mRNAs supported by AAT alignments and a BLAST hit.

    Steps:
      1. Parse the organism1 annotation (GFF3) and the FASTA database used
         for AAT.
      2. Parse AAT ``nucleotide_to_protein_match`` / ``match_part`` rows into
         match objects grouped by assembly.
      3. Keep mRNAs for which some AAT match covers both the mRNA and the
         match target by at least --aat_percent_coverage_cutoff percent.
      4. Keep those which also have a BLAST btab row passing the e-value and
         percent-identity cutoffs.
      5. Write the surviving mRNA IDs, one per line, to --output_id_list.

    Raises:
        Exception: if a match_part row appears before any match row, if the
            computed overlap exceeds the mRNA length, or if a match target
            has no entry in the AAT FASTA database.
    """
    parser = argparse.ArgumentParser(
        description='Selects transcripts supported by both AAT protein alignments and BLAST hits')

    parser.add_argument('-a', '--organism1_annotation', type=str, required=True,
                        help='Annotation GFF for organism 1')
    parser.add_argument('-p', '--organism1_aat_alignments', type=str, required=True,
                        help='Path to AAT GFF3 (match/match_part)')
    parser.add_argument('-aatdb', '--aat_fasta_db', type=str, required=True,
                        help='Path to FASTA database that was used in AAT')
    parser.add_argument('-b', '--organism1_blast_alignments', type=str, required=True,
                        help='Path to BLASTp btab file vs.organism 2 proteins')
    parser.add_argument('-be', '--blast_eval_cutoff', type=float, required=False, default=1e-5,
                        help='BLAST e-value cutoff')
    # argparse %-formats help strings, so a literal percent sign must be doubled
    parser.add_argument('-bpi', '--blast_percent_identity_cutoff', type=float, required=False, default=0,
                        help='BLAST %%identity cutoff')
    parser.add_argument('-ppc', '--aat_percent_coverage_cutoff', type=float, required=False, default=0,
                        help='%% coverage of the query protein by the AAT match')
    parser.add_argument('-o', '--output_id_list', type=str, required=False,
                        help='List of IDs from organism1 that passed')
    args = parser.parse_args()

    # set this to an mRNA ID to restrict processing to that transcript
    debugging_transcript = None

    ## if the output file wasn't passed build one from the other parameters
    if args.output_id_list is None:
        args.output_id_list = "training_ids.be_{0}.bpi_{1}.ppc_{2}.list".format(
            args.blast_eval_cutoff, args.blast_percent_identity_cutoff, args.aat_percent_coverage_cutoff)

    print("INFO: Parsing organism1 annotation")
    (assemblies, features) = biocodegff.get_gff3_features(args.organism1_annotation)

    print("INFO: Parsing AAT FASTA database")
    aat_seqs = biocodeutils.fasta_dict_from_file(args.aat_fasta_db)

    # keys are assembly IDs, value for each is a list of matches on them
    aat_matches = dict()
    aat_match_count = 0
    current_match = None

    ## IDs of features in organism 1 which overlap AAT
    o1_with_aat = list()

    print("INFO: Parsing organism1 AAT protein alignments")
    with open(args.organism1_aat_alignments) as aat_fh:
        for line in aat_fh:
            cols = line.split("\t")

            if line.startswith('#') or len(cols) != 9:
                continue

            assembly_id = cols[0]

            # skip this match if there were not predicted genes on the same assembly
            if assembly_id not in assemblies:
                continue

            # GFF3 is 1-based inclusive; convert to 0-based interbase
            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]
            feature_id = biocodegff.column_9_value(cols[8], 'ID').replace('"', '')
            target = biocodegff.column_9_value(cols[8], 'Target')

            # the target ID is the first whitespace-delimited token
            m = re.search(r"^(\S+)", target)
            if m:
                target = m.group(1)

            if cols[2] == 'nucleotide_to_protein_match':
                current_match = biothings.Match(id=feature_id, target_id=target,
                                                subclass='nucleotide_to_protein_match',
                                                length=fmax - fmin)
                current_match.locate_on(target=assemblies[assembly_id], fmin=fmin, fmax=fmax, strand=strand)

                # Store the match as soon as it is created, under its own
                # assembly.  Parts are added to the same object afterwards.
                # (Deferring the append until the next match row lost the
                # final match and filed matches under the next row's assembly.)
                aat_matches.setdefault(assembly_id, list()).append(current_match)
                aat_match_count += 1

            elif cols[2] == 'match_part':
                if current_match is None:
                    raise Exception("ERROR: found match_part ({0}) before any nucleotide_to_protein_match".format(feature_id))

                parent_id = biocodegff.column_9_value(cols[8], 'Parent').replace('"', '')
                match_part = biothings.MatchPart(id=feature_id, parent=parent_id, length=fmax - fmin)
                match_part.locate_on(target=assemblies[assembly_id], fmin=fmin, fmax=fmax, strand=strand)
                current_match.add_part(match_part)

    print("INFO: Parsed {0} protein alignment chains".format(aat_match_count))

    print("INFO: Comparing organism1's mRNAs with AAT match coordinates")
    for assembly_id in assemblies:
        if assembly_id not in aat_matches:
            continue

        assembly = assemblies[assembly_id]

        for gene in assembly.genes():
            for mRNA in gene.mRNAs():
                if debugging_transcript is not None:
                    if mRNA.id == debugging_transcript:
                        print("DEBUG: processing debugging transcript: {0}".format(mRNA.id))
                    else:
                        continue

                for aat_match in aat_matches[assembly_id]:
                    overlap_size = mRNA.overlap_size_with(aat_match)

                    if overlap_size is None:
                        continue

                    # this shouldn't be possible, but check just in case
                    if overlap_size > mRNA.length:
                        raise Exception("ERROR: overlap size ({0}) > mRNA length ({1})".format(overlap_size, mRNA.length))

                    if aat_match.target_id not in aat_seqs:
                        raise Exception("ERROR: Found match with target ID ({0}) but didn't find a FASTA entry for it via -aatdb".format(aat_match.target_id))

                    # this is a protein length, so x3
                    match_target_length = len(aat_seqs[aat_match.target_id]['s']) * 3

                    (mRNA_percent_coverage, target_percent_coverage) = calculate_fragmented_coverage(
                        mRNA, aat_match, match_target_length)

                    if (mRNA_percent_coverage >= args.aat_percent_coverage_cutoff
                            and target_percent_coverage >= args.aat_percent_coverage_cutoff):
                        o1_with_aat.append(mRNA.id)
                        # only need to see if one matched
                        break

    print("INFO: Found {0} mRNAs in org1 with overlapping fungi AAT coordinates".format(len(o1_with_aat)))

    # key=org1_transcript_id, value=org2_transcript_id
    top_blast_hits = dict()

    print("INFO: parsing BLAST results vs. org2")
    with open(args.organism1_blast_alignments) as blast_fh:
        for line in blast_fh:
            cols = line.split("\t")

            # blank or truncated rows can't be evaluated (e-value is column 20)
            if len(cols) < 20:
                continue

            if float(cols[19]) > args.blast_eval_cutoff:
                continue

            if float(cols[10]) < args.blast_percent_identity_cutoff:
                continue

            # if we survived until here, this one's good.
            top_blast_hits[cols[0]] = cols[5]

    print("INFO: Comparing overlap between AAT-matched proteins and BLAST ones")
    o1_with_o2 = [o1_mRNA_id for o1_mRNA_id in o1_with_aat if o1_mRNA_id in top_blast_hits]

    print("INFO: Found {0} mRNAs in org1 with overlapping AAT coordinates and BLAST hit to org2".format(len(o1_with_o2)))

    with open(args.output_id_list, 'wt') as id_list_fh:
        for mRNA_id in o1_with_o2:
            id_list_fh.write("{0}\n".format(mRNA_id))
def main():
    """Print reference/query gene pairs that overlap each other one-to-one.

    Both GFF3 files are parsed onto a shared set of assemblies.  Two genes
    are treated as overlapping only when the genes themselves overlap and at
    least one CDS of each overlaps a CDS of the other on the same strand.
    A pair is reported (query ID, tab, reference ID) when the reference gene
    overlaps exactly one query gene and that query gene overlaps exactly one
    reference gene, namely each other.
    """
    parser = argparse.ArgumentParser(
        description='Reports query/reference gene pairs whose CDSs overlap one-to-one on the same strand')

    parser.add_argument('-r', '--reference', type=str, required=True, help='Reference GFF3')
    parser.add_argument('-q', '--query', type=str, required=True, help='Query GFF3')
    args = parser.parse_args()

    print("INFO: parsing reference features\n")
    (assemblies, ref_features) = biocodegff.get_gff3_features(args.reference)

    print("INFO: parsing query features\n")
    (assemblies, qry_features) = biocodegff.get_gff3_features(args.query, assemblies=assemblies)

    # Sort once up front; the original re-sorted both lists on every iteration.
    ref_genes = sorted(get_genes_from_dict(ref_features))
    qry_genes = sorted(get_genes_from_dict(qry_features))

    def shares_stranded_cds(gene, other):
        """Return True if any CDS of *gene* overlaps a same-strand CDS of *other*."""
        if not gene.overlaps_with(other):
            return False

        for rna in gene.RNAs():
            for other_rna in other.RNAs():
                for cds in rna.CDSs():
                    strand = cds.location().strand
                    for other_cds in other_rna.CDSs():
                        # Strands are compared by value (==); the original used
                        # identity (is), which only works by interpreter accident.
                        if cds.overlaps_with(other_cds) and other_cds.location().strand == strand:
                            return True
        return False

    def single_partners(genes, others):
        """Map gene ID -> partner ID for every gene with exactly one overlapping partner."""
        partners = {}
        for gene in genes:
            hit_ids = {other.id for other in others if shares_stranded_cds(gene, other)}
            if len(hit_ids) == 1:
                partners[gene.id] = next(iter(hit_ids))
        return partners

    # key = reference gene ID, value = the only query gene ID overlapping it
    ref_gene_one_qry_overlap = single_partners(ref_genes, qry_genes)
    # key = query gene ID, value = the only reference gene ID overlapping it
    qry_gene_one_ref_overlap = single_partners(qry_genes, ref_genes)

    # A pair is reciprocal when each side names the other as its sole partner.
    # A direct dictionary lookup replaces the original all-against-all scan.
    for qry_gene_id, ref_gene_id in qry_gene_one_ref_overlap.items():
        if ref_gene_one_qry_overlap.get(ref_gene_id) == qry_gene_id:
            print(qry_gene_id + "\t" + ref_gene_id)
def main():
    """Convert the GO annotations of polypeptides in a GFF3 file to GAF 2.0.

    One GAF row is written per GO annotation found on each polypeptide of
    each mRNA of each gene.  The aspect column comes from the lookup built by
    parse_go_file() from the supplied GO file.

    Raises:
        Exception: if an annotated GO ID is absent from the GO file.
    """
    parser = argparse.ArgumentParser(
        description='Converts GFF3 files to GO Gene Association Format (GAF)')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read')
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created')
    parser.add_argument('-go', '--go_file', type=str, required=True, help='Gene Ontology (GO) file')
    parser.add_argument('-db', '--database', type=str, required=True, help='Database issuing that IDs. Example: UniProtKB')
    parser.add_argument('-dbref', '--db_reference', type=str, required=True, help='DB reference, like PMID:2676709 (column 6)')
    parser.add_argument('-ec', '--evidence_code', type=str, required=False, default='IEA', help='Like IEA (column 7)')
    parser.add_argument('-t', '--taxon_id', type=int, required=True, help='NCBI taxon ID (column 13)')
    parser.add_argument('-ad', '--annotation_date', type=str, required=False, help='Annotation date in YYYYMMDD format. Default = GFF3 file datestamp')
    parser.add_argument('-ab', '--assign_by', type=str, required=False, help='Assign by (column 15) Defaults to --database argument value')
    args = parser.parse_args()

    print("INFO: Parsing GFF3 objects", file=sys.stderr)
    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    print("INFO: Parsing GO file", file=sys.stderr)
    go_lookup = parse_go_file(args.go_file)

    # Default the annotation date to the input file's modification time (UTC).
    annot_date = args.annotation_date
    if annot_date is None:
        annot_date = time.strftime('%Y%m%d', time.gmtime(os.path.getmtime(args.input_file)))

    assign_by = args.assign_by
    if assign_by is None:
        assign_by = args.database

    # The context manager guarantees the GAF file is flushed and closed,
    # including when a missing GO ID aborts the conversion.
    with open(args.output_file, 'wt') as ofh:
        ofh.write("!gaf-version: 2.0\n")

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    for polypeptide in mRNA.polypeptides():
                        annotation = polypeptide.annotation

                        # Optional columns must be left empty when unknown; the
                        # original wrote the literal text "None" into them.
                        product = '' if annotation.product_name is None else annotation.product_name
                        gene_sym = '' if annotation.gene_symbol is None else annotation.gene_symbol

                        for go_annot in annotation.go_annotations:
                            go_id = "GO:{0}".format(go_annot.go_id)

                            if go_id not in go_lookup:
                                raise Exception("ERROR: GO ID {0} not found in provided go.obo file".format(go_id))

                            # Aspect is F, P or C, depending on which component/ontology the term comes from
                            ofh.write("{0}\t{1}\t{1}\t\t{2}\t{3}\t{4}\t\t{5}\t{6}"
                                      "\t{7}\tprotein\ttaxon:{8}\t{9}\t{10}\t"
                                      "\t\n".format(args.database, polypeptide.id, go_id,
                                                    args.db_reference, args.evidence_code,
                                                    go_lookup[go_id], product, gene_sym,
                                                    args.taxon_id, annot_date, assign_by))

    print("INFO: Conversion complete.", file=sys.stderr)
def main():
    """Export gene sequences from a GFF3 file as FASTA with introns lower-cased.

    For every gene with at most one mRNA, the gene's residues are upper-cased
    and the bases covered by the mRNA's introns are lower-cased.  With
    --type CDS the sequence is additionally trimmed to the span from the
    first to the last CDS.  Genes with several mRNAs are skipped.

    FASTA records go to --output_file, or to STDOUT when it is omitted.  All
    diagnostics go to STDERR so that they can never end up inside the FASTA
    stream.
    """
    parser = argparse.ArgumentParser(
        description='Exports gene sequences from a GFF3 file as FASTA, with introns lower-cased')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3')
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output file to be created')
    parser.add_argument('-f', '--fasta', type=str, required=False, help='Required if you don\'t have GFF3 with embedded FASTA')
    parser.add_argument('-t', '--type', type=str, required=False, default='mRNA', choices=['mRNA', 'CDS'], help='Feature type to export (mRNA or CDS)')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # set this to None if you don't want the debug print statements
    #debugging_gene = 'D9AE6116893A0D5711D56C0F1E6CF58C'
    debugging_gene = None
    debug_mode = debugging_gene is not None

    def debug(message):
        """Emit *message* on STDERR, but only while debugging a single gene."""
        if debug_mode:
            print(message, file=sys.stderr)

    if args.fasta is not None:
        seqs = biocodeutils.fasta_dict_from_file(args.fasta)
        for seq_id in seqs:
            if seq_id in assemblies:
                assemblies[seq_id].residues = seqs[seq_id]['s']
                assemblies[seq_id].length = len(assemblies[seq_id].residues)

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            for gene in assembly.genes():
                if debug_mode and gene.id != debugging_gene:
                    continue

                gene_label = gene.id if gene.locus_tag is None else gene.locus_tag

                gene_seq = gene.get_residues().upper()
                gene_loc = gene.location_on(assembly)

                ## Flip reverse-strand genes so that string indexes run in the
                ## same direction as assembly coordinates while slicing.
                if gene_loc.strand == -1:
                    gene_seq = gene_seq[::-1]

                debug("INFO: Processing gene with length {0} at {1}-{2}".format(
                    len(gene_seq), gene_loc.fmin, gene_loc.fmax))

                if len(gene.mRNAs()) > 1:
                    print("ERROR: skipping gene {0} because it appears to have multiple isoforms (not currently supported)".format(gene.id),
                          file=sys.stderr)
                    continue

                # this helps us get where features are on the gene
                offset = gene_loc.fmin

                for mRNA in gene.mRNAs():
                    for intron in mRNA.introns(on=assembly):
                        intron_loc = intron.location_on(assembly)
                        start = intron_loc.fmin - offset
                        end = intron_loc.fmax - offset

                        lower_mid = gene_seq[start:end].lower()
                        gene_seq = gene_seq[0:start] + lower_mid + gene_seq[end:]

                        debug("INFO:\tfound intron at {0}-{1}".format(intron_loc.fmin, intron_loc.fmax))
                        debug("INFO:\tlower-casing offset adjusted coordinates: {0}-{1}".format(start, end))
                        debug("INFO:\tgenerating lower case seq of length: {0}\n".format(len(lower_mid)))

                    debug("INFO: seq length before CDS processing is: {0}".format(len(gene_seq)))

                    ## do we need to trim down to the CDS range?
                    if args.type == 'CDS':
                        CDSs = sorted(mRNA.CDSs())
                        CDS_min = CDSs[0].location_on(assembly).fmin
                        CDS_max = CDSs[-1].location_on(assembly).fmax

                        debug("INFO: Calculated CDS range, with introns, should be: {0}-{1}={2}".format(
                            CDS_max, CDS_min, CDS_max - CDS_min))

                        if gene_loc.fmin != CDS_min or gene_loc.fmax != CDS_max:
                            # number of bases to drop from each end of the gene
                            fmin_chomp = CDS_min - offset
                            fmax_chomp = gene_loc.fmax - CDS_max

                            debug("gene:{0} coords:{1}-{2} ({3}), CDS coords: {4}-{5}".format(
                                gene.id, gene_loc.fmin, gene_loc.fmax, gene_loc.strand, CDS_min, CDS_max))
                            debug("\tfmin_chomp:{0}, fmax_chomp:{1}".format(fmin_chomp, fmax_chomp))
                            debug("\tpulling range: gene_seq[{0} : {1}]".format(
                                fmin_chomp, len(gene_seq) - fmax_chomp))

                            gene_seq = gene_seq[fmin_chomp:len(gene_seq) - fmax_chomp]

                            debug("\tGene {0} CDS seq: {1}".format(gene.id, gene_seq))

                ## make sure to switch it back
                if gene_loc.strand == -1:
                    gene_seq = gene_seq[::-1]

                ofh.write(">{0}\n{1}\n".format(gene_label, biocodeutils.wrapped_fasta(gene_seq)))
    finally:
        # never close STDOUT, only a file we opened ourselves
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Clip the CDS features of a flawed GFF3 to externally supplied polypeptide ranges.

    Polypeptide rows are read from a second GFF file and keyed by their
    Parent (mRNA) ID.  For every mRNA that has such a polypeptide, each CDS is
    either deleted or has its fmin/fmax clamped to the polypeptide's range,
    depending on how the CDS compares to the polypeptide.  Every gene is then
    written to the corrected GFF3 file.

    All file names are hard-coded below; the script takes no arguments.
    """
    flawed_gff_file = 'canonical.flawed.gff3'
    ilri_gff = 'Theileria-all-Theileria1_ourids.gff'
    source = 'GenBank'
    out_gff = 'canonical.corrected.gff3'

    (assemblies, features) = biocodegff.get_gff3_features(flawed_gff_file)

    print("INFO: loaded {0} assemblies and {1} features".format(len(assemblies), len(features)))

    # key = parent mRNA ID, value = Polypeptide located on its assembly
    polypeptides = dict()

    with open(ilri_gff) as ilri_fh:
        for line in ilri_fh:
            # drop the line terminator so it cannot leak into the last attribute value
            cols = line.rstrip("\n").split("\t")

            if len(cols) != 9 or cols[2] != 'polypeptide':
                continue

            polypeptide_id = biocodegff.column_9_value(cols[8], 'ID')
            parent = biocodegff.column_9_value(cols[8], 'Parent')

            polypeptides[parent] = biothings.Polypeptide(id=polypeptide_id, parent=parent)
            # GFF columns 4/5 are 1-based inclusive; fmin is shifted to 0-based here
            polypeptides[parent].locate_on(target=assemblies[cols[0]], fmin=int(cols[3]) - 1,
                                           fmax=int(cols[4]), strand=cols[6])

    print("DEBUG: loaded {0} polypeptides from ILRI file".format(len(polypeptides)))

    with open(out_gff, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    if mRNA.id not in polypeptides:
                        print("DEBUG: {0} not found as a parent to any polypeptide".format(mRNA.id))
                        continue

                    polypeptide = polypeptides[mRNA.id]

                    # Iterate over a snapshot: delete_CDS() below may modify the
                    # mRNA's own CDS collection while we walk it.
                    for CDS in list(mRNA.CDSs()):
                        # NOTE(review): the ordering operators are defined in biothings;
                        # presumably < / > mean "entirely outside" the polypeptide and
                        # <= / >= mean "extends past its edge" -- confirm there.
                        if CDS < polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS <= polypeptide:
                            CDS.location().fmin = polypeptide.location().fmin

                        if CDS > polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS >= polypeptide:
                            CDS.location().fmax = polypeptide.location().fmax

                gene.print_as(fh=fout, source=source, format='gff3')
def main():
    """Classify gene predictions by how many evidence sources agree on them.

    Genemark-ES and CEGMA predictions are compared with each other, with
    expression-derived transcripts and with AAT alignments against C. muris
    and C. parvum.  Genes are sorted into four classes, the class sizes are
    printed, and each class is rendered into an HTML report through
    print_gene_list().

    All file names are hard-coded below; the script takes no arguments.
    """
    gm_es_file = 'genemark_hmm.gff3'
    cegma_file = 'output.cegma.gff3'
    transcript_file = 'expression_data.gff3'
    aat_muris_file = 'cmuris.aat.gff3'
    aat_parvum_file = 'cparvum.aat.gff3'
    html_out_file = 'gene_classifications.html'

    # for each gene, how many flanking bases should be shown on either side?
    flanking_bases = 1000

    type1_best = list()
    type2_best = list()
    type2_better = list()
    type3_still_better = list()

    def exon_match_in(gene, candidates):
        """Return True if any of *candidates* shares its exon structure with *gene*."""
        return any(gene.shares_exon_structure_with(thing=candidate) for candidate in candidates)

    def exact_match_in(gene, candidates):
        """Return True if any candidate has the same coordinates and exon structure as *gene*."""
        return any(gene.has_same_coordinates_as(thing=candidate)
                   and gene.shares_exon_structure_with(thing=candidate)
                   for candidate in candidates)

    def cds_match_in(gene, candidates):
        """Return True if any of *candidates* shares its CDS structure with *gene*."""
        return any(gene.shares_CDS_structure_with(candidate) for candidate in candidates)

    print("INFO: parsing Genemark-ES data")
    (assemblies, gm_es_features) = biocodegff.get_gff3_features(gm_es_file)
    gm_es_genes = get_genes_from_dict(gm_es_features)
    print("\tINFO: Got {0} Genemark-ES genes".format(len(gm_es_genes)))

    print("INFO: parsing CEGMA data")
    (assemblies, cegma_features) = biocodegff.get_gff3_features(cegma_file, assemblies=assemblies)
    cegma_genes = get_genes_from_dict(cegma_features)
    print("\tINFO: Got {0} CEGMA genes".format(len(cegma_genes)))

    print("INFO: parsing expression data (Trinity, Cufflinks, GMAP cDNAs)")
    (assemblies, transcript_features) = biocodegff.get_gff3_features(transcript_file, assemblies=assemblies)
    transcript_genes = get_genes_from_dict(transcript_features)
    print("\tINFO: Got {0} expression 'genes'".format(len(transcript_genes)))

    print("INFO: parsing AAT results (C. muris)")
    (assemblies, aat_muris_features) = biocodegff.get_gff3_features(aat_muris_file, assemblies=assemblies)
    aat_muris_genes = get_genes_from_dict(aat_muris_features)
    print("\tINFO: Got {0} AAT (C. muris) 'genes'".format(len(aat_muris_genes)))

    print("INFO: parsing AAT results (C. parvum)")
    (assemblies, aat_parvum_features) = biocodegff.get_gff3_features(aat_parvum_file, assemblies=assemblies)
    aat_parvum_genes = get_genes_from_dict(aat_parvum_features)
    print("\tINFO: Got {0} AAT (C. parvum) 'genes'".format(len(aat_parvum_genes)))

    #############################################################################
    # Genemark-ES genes that CEGMA predicted identically
    genemark_cegma_shared_genes = list()

    for gm_es_gene in gm_es_genes:
        if exact_match_in(gm_es_gene, cegma_genes):
            genemark_cegma_shared_genes.append(gm_es_gene)

    print("\n{0} genes were shared perfectly between Genemark-ES and CEGMA".format(len(genemark_cegma_shared_genes)))

    #############################################################################
    # Genemark-ES genes with an AAT match; C. parvum is only consulted when
    # C. muris did not already provide one
    genemark_aat_shared_genes = list()

    for gm_es_gene in gm_es_genes:
        if exon_match_in(gm_es_gene, aat_muris_genes):
            genemark_aat_shared_genes.append(gm_es_gene)

        if gm_es_gene not in genemark_aat_shared_genes:
            if exon_match_in(gm_es_gene, aat_parvum_genes):
                genemark_aat_shared_genes.append(gm_es_gene)

    print("{0} Genemark-ES genes had an exact AAT match".format(len(genemark_aat_shared_genes)))

    ##############################################################################
    # CEGMA genes that no Genemark-ES gene reproduces
    cegma_not_matching_gm_es = list()

    for cegma_gene in cegma_genes:
        if not exact_match_in(cegma_gene, gm_es_genes):
            cegma_not_matching_gm_es.append(cegma_gene)

    print("{0} CEGMA genes don't have a structural match to a Genemark-ES one".format(len(cegma_not_matching_gm_es)))

    #############################################################################
    gm_expression_shared_genes = list()

    for gm_es_gene in gm_es_genes:
        if cds_match_in(gm_es_gene, transcript_genes):
            gm_expression_shared_genes.append(gm_es_gene)

    print("{0} Genemark-ES genes had an exact expression match".format(len(gm_expression_shared_genes)))

    #############################################################################
    gm_cegma_expression_shared_genes = list()

    for shared_gene in genemark_cegma_shared_genes:
        if shared_gene in gm_expression_shared_genes:
            gm_cegma_expression_shared_genes.append(shared_gene)

    print("{0} genes were shared perfectly between Genemark-ES and CEGMA with expression support".format(len(gm_cegma_expression_shared_genes)))

    ##############################################################################
    # Type 1 has all four lines of evidence; Type 2 (best) lacks only AAT
    gm_cegma_expression_aat_shared_genes = list()

    for shared_gene in gm_cegma_expression_shared_genes:
        if shared_gene in genemark_aat_shared_genes:
            gm_cegma_expression_aat_shared_genes.append(shared_gene)
        else:
            type2_best.append(shared_gene)

    for gene in gm_cegma_expression_aat_shared_genes:
        type1_best.append(gene)

    print("{0} genes were shared with Genemark-ES, CEGMA, expression, AAT support".format(len(gm_cegma_expression_aat_shared_genes)))

    ##############################################################################
    # Type 2 (better): Genemark-ES genes with expression support but no CEGMA match
    for gm_es_gene in gm_es_genes:
        if gm_es_gene not in genemark_cegma_shared_genes:
            if gm_es_gene in gm_expression_shared_genes:
                type2_better.append(gm_es_gene)

    ##############################################################################
    cegma_expression_shared_genes = list()

    for cegma_gene in cegma_genes:
        if cds_match_in(cegma_gene, transcript_genes):
            cegma_expression_shared_genes.append(cegma_gene)

    print("{0} CEGMA genes had an exact expression match".format(len(cegma_expression_shared_genes)))

    ##############################################################################
    # Type 3: CEGMA-only genes that have both AAT and expression support
    cegma_not_gmes_with_aat = list()

    for cegma_gene in cegma_genes:
        if cegma_gene in cegma_not_matching_gm_es:
            if exon_match_in(cegma_gene, aat_muris_genes):
                cegma_not_gmes_with_aat.append(cegma_gene)

            if cegma_gene not in cegma_not_gmes_with_aat:
                if exon_match_in(cegma_gene, aat_parvum_genes):
                    # The original appended the stale loop variable gm_es_gene
                    # here, so C. parvum-supported CEGMA genes were never
                    # recognised below.
                    cegma_not_gmes_with_aat.append(cegma_gene)

            if cegma_gene in cegma_not_gmes_with_aat:
                if cegma_gene in cegma_expression_shared_genes:
                    type3_still_better.append(cegma_gene)

    print("TYPE 1 - BEST: {0}".format(len(type1_best)))
    print("TYPE 2 - BEST: {0}".format(len(type2_best)))
    print("TYPE 2 - BETTER: {0}".format(len(type2_better)))
    print("TYPE 3 - STILL BETTER: {0}".format(len(type3_still_better)))

    # The context manager guarantees the report is flushed and closed.
    with open(html_out_file, 'wt') as html_out:
        html_out.write("<!doctype html>\n")
        html_out.write("<html lang=\"en\">\n")
        html_out.write("<head><meta charset=\"utf-8\"><title>Gene classification</title></head>\n")
        html_out.write("<body>\n")

        html_out.write("<h3>Type 1 - Best ({0})</h3>\n".format(len(type1_best)))
        print_gene_list(html_out, type1_best, flanking_bases)

        html_out.write("<h3>Type 2 - Best ({0})</h3>\n".format(len(type2_best)))
        print_gene_list(html_out, type2_best, flanking_bases)

        html_out.write("<h3>Type 2 - Better ({0})</h3>\n".format(len(type2_better)))
        print_gene_list(html_out, type2_better, flanking_bases)

        html_out.write("<h3>Type 3 - Still better ({0})</h3>\n".format(len(type3_still_better)))
        print_gene_list(html_out, type3_still_better, flanking_bases)

        html_out.write("</body>\n")
        html_out.write("</html>\n")
def main():
    """
    This script reports statistics on the areas of a genome where features
    aren't - introns and intergenic space.  Pass a valid GFF3 file and get a
    report like this:

        Molecule count: 9

        Gene count: 4171
        Intergenic space count: 4061
        Average intergenic space distance: 361.7 bp
        Median intergenic space distance: 245 bp
        Minimum intergenic space distance: 0 bp
        Maximum intergenic space distance: 6272 bp

        Intron count: 10533
        Intron space count: 989024
        Average intron size: 93.9 bp
        Median intron size: 63 bp
        Minimum intron size: 2 bp
        Maximum intron size: 1676 bp

    Statistics that cannot be computed because the annotation has no
    intergenic spaces or no introns are reported as "N/A".

    Optionally, you can pass the path to a PNG file to be created using the
    --histogram parameter, which will generate a size distribution histogram
    with two overlaying plots - one representing the distribution of
    intergenic region sizes and the other the intron lengths.  Because these
    can often have long tails, you can limit both the Y- and X-axes values
    with the --ylimit and --xlimit options, respectively.
    """
    parser = argparse.ArgumentParser(
        description='Reports intron and intergenic space statistics for a GFF3 annotation')

    parser.add_argument('-i', '--input_gff3', type=str, required=True, help='GFF3 file of a reference annotation')
    parser.add_argument('-g', '--histogram', type=str, required=False, help='Optional path to a histogram of intron/intergenic space size distribution to be created (PNG)')
    parser.add_argument('-x', '--xlimit', type=int, required=False, help='Use this if you want to limit the X-axis of the histogram (feature length)')
    parser.add_argument('-y', '--ylimit', type=int, required=False, help='Use this if you want to limit the Y-axis of the histogram (feature count)')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_gff3)

    def size_report(sizes):
        """Return printable (average, median, minimum, maximum) strings for *sizes*."""
        if not sizes:
            # nothing to average or index into
            return ('N/A', 'N/A', 'N/A', 'N/A')

        ordered = sorted(sizes)
        return ("{0:.1f} bp".format(sum(ordered) / len(ordered)),
                "{0} bp".format(ordered[len(ordered) // 2]),
                "{0} bp".format(ordered[0]),
                "{0} bp".format(ordered[-1]))

    ## things to keep stats on and report
    total_molecule_count = len(assemblies)
    total_gene_count = 0

    ## the number of intergenic spaces is NOT just the total genes N - 1, since
    #   there can be multiple molecules, genes can overlap, etc.
    intergenic_distances = list()
    intron_sizes = list()

    ############################
    ## Calculation section
    ############################

    for asm_id in assemblies:
        assembly = assemblies[asm_id]
        genes = assembly.genes()
        total_gene_count += len(genes)

        # location of the gene reaching furthest right among those seen so far
        last_gene_loc = None

        for gene in sorted(genes):
            gene_loc = gene.location_on(assembly)

            if last_gene_loc is None:
                last_gene_loc = gene_loc
            elif gene_loc.fmin < last_gene_loc.fmax:
                ## Overlap: there is no intergenic space here.  Only move the
                #   reference point when this gene reaches further right, so a
                #   gene nested inside another cannot create a false gap later.
                if gene_loc.fmax > last_gene_loc.fmax:
                    last_gene_loc = gene_loc
            else:
                intergenic_distances.append(gene_loc.fmin - last_gene_loc.fmax)
                last_gene_loc = gene_loc

            for mRNA in gene.mRNAs():
                for intron in sorted(mRNA.introns(on=assembly)):
                    intron_loc = intron.location_on(assembly)
                    intron_size = intron_loc.fmax - intron_loc.fmin

                    if intron_size < 0:
                        print("WARN: Intron size ({1}) < 0 reported in gene {0}".format(gene.id, intron_size))

                    intron_sizes.append(intron_size)

    (avg_intergenic, median_intergenic, min_intergenic, max_intergenic) = size_report(intergenic_distances)
    (avg_intron, median_intron, min_intron, max_intron) = size_report(intron_sizes)

    ############################
    ## Reporting section
    ############################

    print("\nMolecule count: {0}\n".format(total_molecule_count))
    print("Gene count: {0}".format(total_gene_count))
    print("Intergenic space count: {0}".format(len(intergenic_distances)))
    print("Average intergenic space distance: {0}".format(avg_intergenic))
    print("Median intergenic space distance: {0}".format(median_intergenic))
    print("Minimum intergenic space distance: {0}".format(min_intergenic))
    print("Maximum intergenic space distance: {0}\n".format(max_intergenic))

    print("Intron count: {0}".format(len(intron_sizes)))
    print("Intron space count: {0}".format(sum(intron_sizes)))
    print("Average intron size: {0}".format(avg_intron))
    print("Median intron size: {0}".format(median_intron))
    print("Minimum intron size: {0}".format(min_intron))
    print("Maximum intron size: {0}\n".format(max_intron))

    ############################
    ## Graphics section (optional)
    ############################
    if args.histogram is not None:
        # imported lazily so matplotlib is only needed when a plot is requested
        import matplotlib.pyplot as plt

        plt.xlabel('length (bp)')
        plt.ylabel('count')
        plt.title('Distribution of intron size and intergenic distances')
        plt.hist(intergenic_distances, bins=50, histtype='stepfilled', color='b', label='Intergenic distances')
        plt.hist(intron_sizes, bins=50, histtype='stepfilled', color='r', alpha=0.5, label='Intron sizes')

        if args.xlimit is not None:
            plt.xlim([0, args.xlimit])

        if args.ylimit is not None:
            plt.ylim([0, args.ylimit])

        plt.legend(loc='best')
        plt.savefig(args.histogram)
def main():
    """Create NCBI submission files (.tbl and .fna) from a GFF3 annotation.

    Assembly IDs are rewritten to the gnl|WGS:<prefix>|SeqID|gb|<prefix>01NNNNNN
    form unless at least one assembly already carries a 'gnl|WGS:' ID, in
    which case all IDs are left exactly as they are.  The feature table is
    written to <output_base>.tbl and the sequences to <output_base>.fna.
    """
    parser = argparse.ArgumentParser(
        description='Create a TBL file for submission to NCBI from GFF3')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read')
    parser.add_argument('-o', '--output_base', type=str, required=True, help='Base name of output files to be created')
    parser.add_argument('-ln', '--lab_name', type=str, required=True, help='Required by NCBI to identify the submitting group')
    parser.add_argument('-nap', '--ncbi_acc_prefix', type=str, required=True, help='Required and assigned by NCBI')
    parser.add_argument('-gf', '--genomic_fasta', type=str, required=False, help='FASTA file of genomic sequence, if not embedded in GFF')
    parser.add_argument('-go', '--go_obo', type=str, required=False, help='GO terms will not be exported unless you pass the path to a GO OBO file')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    if args.genomic_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genomic_fasta)

    ## We need to first check the ID format
    # pre-formatted IDs are like this: gnl|WGS:XXXX|SeqID|gb|XXXX01xxxxxx
    # Decide BEFORE touching any assembly: the original renamed objects while
    # still scanning, so a pre-formatted ID found late left earlier assemblies
    # with a new .id but stored under their old key.
    already_formatted = any(asm_id.startswith('gnl|WGS:') for asm_id in assemblies)

    if not already_formatted:
        renamed_assemblies = dict()

        for asm_num, (asm_id, assembly) in enumerate(assemblies.items(), start=1):
            new_id = "gnl|WGS:{0}|SeqID|gb|{0}01{1:06d}".format(args.ncbi_acc_prefix, asm_num)
            assembly.id = new_id
            renamed_assemblies[new_id] = assembly

        assemblies = renamed_assemblies

    # The context manager guarantees the table is flushed and closed.
    with open("{0}.tbl".format(args.output_base), 'wt') as ofh:
        biocodetbl.print_tbl_from_assemblies(assemblies=assemblies, ofh=ofh,
                                             go_obo=args.go_obo, lab_name=args.lab_name)

    mset = biothings.AssemblySet()
    mset.load_from_dict(assemblies)
    mset.write_fasta(path="{0}.fna".format(args.output_base))
class _UniqueKeys:
    """Insertion-ordered collection of unique keys with O(1) membership tests."""

    def __init__(self):
        self.ordered = []
        self._seen = set()

    def add(self, key):
        """Record *key*; return True only the first time it is seen."""
        if key in self._seen:
            return False
        self._seen.add(key)
        self.ordered.append(key)
        return True

    def __len__(self):
        return len(self.ordered)


def _location_key(asm_id, loc):
    """Return 'asm_id:fmin:fmax:strand', the string used to decide feature uniqueness."""
    return asm_id + ":" + str(loc.fmin) + ":" + str(loc.fmax) + ":" + str(loc.strand)


def _split_location_key(key):
    """Split a location key into (chrom, start, stop, strand).

    Splitting from the right keeps assembly IDs that themselves contain ':'
    intact; start/stop become ints, strand stays a string.
    """
    chrom, start, stop, strand = key.rsplit(':', 3)
    return (chrom, int(start), int(stop), strand)


def _tally_features(asm_id, assembly, genes, gene_keys, exon_keys):
    """Register gene and exon keys of *genes*; return the exon keys of this assembly."""
    assembly_exons = set()
    for gene in genes:
        gene_keys.add(_location_key(asm_id, gene.location_on(assembly)))
        for mrna in sorted(gene.mRNAs()):
            for exon in sorted(mrna.exons()):
                key = _location_key(asm_id, exon.location_on(assembly))
                assembly_exons.add(key)
                exon_keys.add(key)
    return assembly_exons


def _gene_exon_keys(asm_id, assembly, gene):
    """Return the set of exon keys over all mRNAs of one gene."""
    return {_location_key(asm_id, exon.location_on(assembly))
            for mrna in gene.mRNAs()
            for exon in mrna.exons()}


def _write_exon_bed(path, exon_keys):
    """Write exon keys as a 6-column BED file (name 'exon', score 0)."""
    with open(path, 'w') as e_bed:
        for exon in exon_keys:
            chrom, start, stop, strand = _split_location_key(exon)
            # any strand other than "1" is written as minus
            strand = "+" if strand == str(1) else "-"
            e_bed.write(chrom + "\t" + str(start) + "\t" + str(stop) +
                        "\texon\t" + str(0) + "\t" + strand + "\n")


def _sum_bed_lengths(path):
    """Sum (end - start) over every line of a BED file."""
    total = 0
    with open(path, 'r') as bed:
        for line in bed:
            cols = line.split("\t")
            total += int(cols[2]) - int(cols[1])
    return total


def _sum_column(path, index):
    """Sum the integer found in column *index* of a tab-delimited file."""
    total = 0
    with open(path, 'r') as table:
        for line in table:
            total += int(line.split("\t")[index])
    return total


def _percent(part, whole):
    """Return part/whole as a percentage, or 0.0 when *whole* is zero."""
    return (part / whole) * 100 if whole else 0.0


def _overlap_percentages(query, subjects):
    """List, for each same-chromosome same-strand subject overlapping *query*,
    the overlap length as a percentage of the subject's own length.

    The scan stops at the first matching-chromosome/strand subject that starts
    beyond the end of *query*, so it relies on *subjects* keeping the
    coordinate-sorted order in which they were collected.
    """
    chrom_q, start_q, stop_q, strand_q = query
    percentages = []
    for chrom_s, start_s, stop_s, strand_s in subjects:
        if chrom_s != chrom_q or strand_s != strand_q:
            continue
        if start_s > stop_q:
            break
        if start_q <= stop_s:
            bounds = sorted([start_s, stop_s, start_q, stop_q])
            len_overlap = bounds[2] - bounds[1]
            percentages.append((len_overlap / (stop_s - start_s)) * 100)
    return percentages


def _overlap_category(percentages):
    """Classify an overlap list as 'none', 'single', 'multi' (>= 2 overlaps of
    at least 50%) or 'altered' (several overlaps, fewer than two reaching 50%)."""
    if not percentages:
        return 'none'
    if len(percentages) == 1:
        return 'single'
    strong = sum(1 for value in percentages if value >= 50)
    return 'multi' if strong >= 2 else 'altered'


def _bucket_by_overlap(query_keys, subject_keys):
    """Group *query_keys* by overlap category against *subject_keys*."""
    subjects = [_split_location_key(key) for key in subject_keys]
    buckets = {'none': [], 'single': [], 'multi': [], 'altered': []}
    for key in query_keys:
        percentages = _overlap_percentages(_split_location_key(key), subjects)
        buckets[_overlap_category(percentages)].append(key)
    return buckets


def _write_lines(path, lines):
    """Write each item of *lines* to *path*, one per line (file is always created)."""
    with open(path, 'w') as handle:
        for line in lines:
            handle.write(line + "\n")


def process_files(args):
    """Compare a predicted annotation with a known one and report SN/PPV.

    Reads ``args.annotation_1`` (known) and ``args.annotation_2`` (predicted)
    as GFF3, counts unique genes and exons by 'assembly:fmin:fmax:strand' key,
    uses the external ``bedtools`` program for base-level overlap, and writes
    ``summary.txt`` plus per-category gene lists into ``args.output_dir``.
    The summary is also printed to STDOUT.

    Exits via ``sys.exit`` when ``args.output_dir`` does not exist.
    """
    import shlex  # only needed to quote paths inside the bedtools shell commands

    # Check up front: every output below is created inside this directory.
    if not os.path.exists(args.output_dir):
        sys.exit("Directory does not exist.")

    (assemblies_1, features_1) = biocodegff.get_gff3_features(args.annotation_1)
    (assemblies_2, features_2) = biocodegff.get_gff3_features(args.annotation_2)

    a_gene, p_gene = _UniqueKeys(), _UniqueKeys()      # known / predicted genes
    a_exons, p_exons = _UniqueKeys(), _UniqueKeys()    # known / predicted exons
    exon_pred_all = set()    # exon keys present in both annotations
    gene_true = set()        # predicted genes whose exon structure matches a known gene
    shared_asm_ids = set()   # assemblies present in both files

    for asm_id in assemblies_1:
        assembly_1 = assemblies_1[asm_id]
        assembly_2 = assemblies_2.get(asm_id)
        genes_1 = sorted(assembly_1.genes())
        anno_exons = _tally_features(asm_id, assembly_1, genes_1, a_gene, a_exons)

        ## If the chromosome is not found in the predicted file, move on.
        if assembly_2 is None:
            continue

        genes_2 = sorted(assembly_2.genes())
        shared_asm_ids.add(asm_id)
        pred_exons = _tally_features(asm_id, assembly_2, genes_2, p_gene, p_exons)
        exon_pred_all.update(pred_exons & anno_exons)

        ## From the predicted genes determine the true ones.
        for gene_2 in genes_2:
            gene_2_loc = gene_2.location_on(assembly_2)
            cord_g = _location_key(asm_id, gene_2_loc)
            if cord_g in gene_true:
                continue

            # NOTE(review): these sets are deliberately NOT reset per known
            # gene, matching the original behaviour: exons accumulate over
            # every overlapping known gene examined so far -- confirm intent.
            ex_mrna1 = set()
            ex_mrna2 = set()
            for gene_1 in genes_1:
                gene_1_loc = gene_1.location_on(assembly_1)
                if gene_1_loc.strand != gene_2_loc.strand:
                    continue
                if gene_2.overlaps_with(gene_1):
                    ex_mrna2.update(_gene_exon_keys(asm_id, assembly_2, gene_2))
                    ex_mrna1.update(_gene_exon_keys(asm_id, assembly_1, gene_1))
                    # identical exon sets <=> the union is as large as each set
                    if ex_mrna1 == ex_mrna2:
                        gene_true.add(cord_g)
                        break

    ## Predicted-only chromosomes still count towards the predicted totals.
    for asm_id in assemblies_2:
        if asm_id not in shared_asm_ids:
            assembly_2 = assemblies_2[asm_id]
            _tally_features(asm_id, assembly_2, sorted(assembly_2.genes()),
                            p_gene, p_exons)

    ## Base-level comparison through bedtools.
    # NOTE(review): '-nms' and '-scores' belong to older bedtools releases --
    # confirm they are accepted by the installed version.
    exon2_bed = os.path.join(args.output_dir, 'exon_2.bed')
    out2 = os.path.join(args.output_dir, 'exon_2_merged.bed')
    exon1_bed = os.path.join(args.output_dir, 'exon_1.bed')
    out1 = os.path.join(args.output_dir, 'exon_1_merged.bed')
    out_intersect = os.path.join(args.output_dir, 'exon_1_2_intersect.bed')

    _write_exon_bed(exon2_bed, p_exons.ordered)
    os.system("bedtools merge -nms -scores sum -i " + shlex.quote(exon2_bed) +
              " -s >" + shlex.quote(out2))

    _write_exon_bed(exon1_bed, a_exons.ordered)
    os.system("bedtools merge -nms -scores sum -i " + shlex.quote(exon1_bed) +
              " -s >" + shlex.quote(out1))

    os.system("bedtools intersect -s -wo -a " + shlex.quote(out1) +
              " -b " + shlex.quote(out2) + " >" + shlex.quote(out_intersect))

    a_base = _sum_bed_lengths(out1)
    p_base = _sum_bed_lengths(out2)
    # column 13 of the '-wo' output is the number of overlapping bases
    true_base = _sum_column(out_intersect, 12)

    #Calculate SN/SP for bases, exons and genes
    base_sn = _percent(true_base, a_base)
    base_sp = _percent(true_base, p_base)

    annotated_exon = len(a_exons)
    predicted_exon = len(p_exons)
    true_pred_exon = len(exon_pred_all)
    exon_sn = _percent(true_pred_exon, annotated_exon)
    exon_sp = _percent(true_pred_exon, predicted_exon)

    annotated_gene = len(a_gene)
    predicted_gene = len(p_gene)
    true_pred_gene = len(gene_true)
    gene_sn = _percent(true_pred_gene, annotated_gene)
    gene_sp = _percent(true_pred_gene, predicted_gene)

    _write_lines(os.path.join(args.output_dir, 'true_gene.temp7.txt'), gene_true)

    header = "Feature\tKnown\tPredicted\tTrue_Predicted\tSN\tPPV\n"
    rows = [
        ("Gene", annotated_gene, predicted_gene, true_pred_gene, gene_sn, gene_sp),
        ("Exon", annotated_exon, predicted_exon, true_pred_exon, exon_sn, exon_sp),
        ("Base", a_base, p_base, true_base, base_sn, base_sp),
    ]
    row_text = ["\t".join(str(value) for value in row) for row in rows]

    print(header)
    for text in row_text:
        print(text)

    ## Gene-level overlap classification, in both directions.
    pred = _bucket_by_overlap(p_gene.ordered, a_gene.ordered)
    known = _bucket_by_overlap(a_gene.ordered, p_gene.ordered)

    for file_name, keys in (('pred_new.txt', pred['none']),
                            ('pred_merged.txt', pred['multi']),
                            ('pred_1.txt', pred['single']),
                            ('known_split.txt', known['multi']),
                            ('known_1.txt', known['single']),
                            ('known_missed.txt', known['none']),
                            ('pred_altered.txt', pred['altered']),
                            ('known_altered.txt', known['altered'])):
        _write_lines(os.path.join(args.output_dir, file_name), keys)

    stats = [
        ("1. No. of predicted gene overlapping 0 known gene (new gene): ", len(pred['none'])),
        ("2. No. of predicted gene overlapping > 1 known gene by at least 50%: ", len(pred['multi'])),
        ("3. No. of altered predicted gene: ", len(pred['altered'])),
        ("4. No. of predicted gene overlaping 1 known gene : ", len(pred['single'])),
        ("5. No. of known gene overlapping > 1 predicted gene by at least 50% : ", len(known['multi'])),
        ("6. No. of altered known gene: ", len(known['altered'])),
        ("7. No. of known gene overlapping 1 predicted gene : ", len(known['single'])),
        ("8. No. of known gene overlapping 0 predicted gene (gene missing) : ", len(known['none'])),
    ]
    for label, value in stats:
        print(label, value)

    with open(os.path.join(args.output_dir, 'summary.txt'), 'w') as fout:
        fout.write(header)
        fout.write("\n".join(row_text) + "\n\n")
        for label, value in stats:
            fout.write(label + str(value) + "\n")

    #Clean up: remove only the intermediate BED files created above, rather
    #than every *.bed file that happens to live in the output directory.
    for bed_path in (exon1_bed, exon2_bed, out1, out2, out_intersect):
        if os.path.exists(bed_path):
            os.remove(bed_path)
def main():
    """Report agreement between GeneMark-ES, CEGMA and AAT gene models.

    Parses three hard-coded GFF3 files from the current directory, prints how
    many gene models agree between the sources, and writes the IDs of the
    agreeing genes to ``*.ids`` files in the current directory
    (``training_gene.ids`` holds the genes supported by all three sources).
    """
    gm_es_file = 'genemark_hmm.gff3'
    cegma_file = 'output.cegma.gff3'
    #aat_file = 'bail_training_genes.aat.1500maxintron.80percid.gff3'
    aat_file = 'aat.bail_hominis_filtered_training.gff3'
    #aat_file = 'aat.merged.gff3'

    print("INFO: parsing Genemark-ES data")
    (assemblies, gm_es_features) = biocodegff.get_gff3_features(gm_es_file)
    gm_es_genes = get_genes_from_dict(gm_es_features)
    print("\tINFO: Got {0} Genemark-ES genes".format(len(gm_es_genes)))

    print("INFO: parsing CEGMA data")
    (assemblies, cegma_features) = biocodegff.get_gff3_features(cegma_file, assemblies=assemblies)
    cegma_genes = get_genes_from_dict(cegma_features)
    print("\tINFO: Got {0} CEGMA genes".format(len(cegma_genes)))

    print("INFO: parsing AAT results")
    (assemblies, aat_muris_features) = biocodegff.get_gff3_features(aat_file, assemblies=assemblies)
    aat_genes = get_genes_from_dict(aat_muris_features)
    print("\tINFO: Got {0} AAT 'genes'".format(len(aat_genes)))

    def first_exact_match(gene, candidates):
        """Return the first candidate with the same coordinates and exon structure, else None."""
        for candidate in candidates:
            if gene.has_same_coordinates_as(thing=candidate):
                if gene.shares_exon_structure_with(thing=candidate) == True:
                    return candidate
        return None

    def has_aat_support(gene):
        """True when any AAT gene shares the exon structure (stop codon tolerant)."""
        for aat_gene in aat_genes:
            if gene.shares_exon_structure_with(thing=aat_gene, stop_tolerant=True) == True:
                return True
        return False

    # Every ID file is written inside a 'with' block so that it is flushed
    # and closed as soon as its section is finished.

    ## GeneMark-ES genes with an identical CEGMA model
    genemark_cegma_shared_genes = list()
    with open('gmes_cegma.shared.ids', 'wt') as gmes_cegma_fh:
        for gm_es_gene in gm_es_genes:
            if first_exact_match(gm_es_gene, cegma_genes) is not None:
                genemark_cegma_shared_genes.append(gm_es_gene)
                gmes_cegma_fh.write("{0}\n".format(gm_es_gene.id))

    print("\n{0} genes were shared perfectly between Genemark-ES and CEGMA".format(
        len(genemark_cegma_shared_genes)))

    #############################################################################
    ## GeneMark-ES genes with a matching AAT alignment
    genemark_aat_shared_genes = list()
    with open('gmes_aat.shared.ids', 'wt') as gmes_aat_fh:
        for gm_es_gene in gm_es_genes:
            if has_aat_support(gm_es_gene):
                genemark_aat_shared_genes.append(gm_es_gene)
                gmes_aat_fh.write("{0}\n".format(gm_es_gene.id))

    print("{0} Genemark-ES genes had an exact AAT match".format(
        len(genemark_aat_shared_genes)))

    ##############################################################################
    ## CEGMA genes matching a GeneMark-ES gene; of those GeneMark-ES genes,
    ## the ones that also have AAT support form the three-way agreement set
    cegma_matching_gm_es = list()
    genemark_aat_cegma_shared_genes = list()
    with open('gmes_aat_cegma.shared.ids', 'wt') as gmes_aat_cegma_fh:
        for cegma_gene in cegma_genes:
            gm_es_gene = first_exact_match(cegma_gene, gm_es_genes)
            if gm_es_gene is None:
                continue

            cegma_matching_gm_es.append(cegma_gene)

            if gm_es_gene in genemark_aat_shared_genes and gm_es_gene not in genemark_aat_cegma_shared_genes:
                genemark_aat_cegma_shared_genes.append(gm_es_gene)
                gmes_aat_cegma_fh.write("{0}\n".format(gm_es_gene.id))

    print("{0} genes with GeneMark-ES, CEGMA and AAT agreement".format(
        len(genemark_aat_cegma_shared_genes)))

    with open('training_gene.ids', 'wt') as training_fh:
        for gene in genemark_aat_cegma_shared_genes:
            training_fh.write("{0}\n".format(gene.id))

    ##############################################################################
    ## CEGMA genes without a GeneMark-ES match but with AAT support
    cegma_with_aat_not_gm_es = list()
    with open('cegma_aat_nogmes.shared.ids', 'wt') as cegma_aat_nogmes_fh:
        for cegma_gene in cegma_genes:
            if cegma_gene in cegma_matching_gm_es:
                continue

            if has_aat_support(cegma_gene):
                cegma_with_aat_not_gm_es.append(cegma_gene)
                cegma_aat_nogmes_fh.write("{0}\n".format(cegma_gene.id))

    print("{0} CEGMA genes had no GeneMark-ES match but did have an AAT one".format(
        len(cegma_with_aat_not_gm_es)))
def main():
    """Export the protein or CDS sequence of every mRNA in a GFF3 file as FASTA.

    Each record is named after the mRNA's locus tag when it has one (otherwise
    its ID) and carries the product name of the first annotated polypeptide as
    the header description.  With ``--check_ends``, non-canonical start/stop
    codons are reported on STDERR.  Output goes to ``--output_file`` or STDOUT.
    """
    parser = argparse.ArgumentParser(
        description='Extracts the protein or CDS seqeunces from a GFF3 file')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input GFF3 file to be read')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output FASTA file to be created')
    parser.add_argument('-t', '--type', type=str, required=False, default='protein',
                        choices=['protein', 'cds'], help='Type of features to export')
    parser.add_argument('-f', '--fasta', type=str, required=False,
                        help='If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option')
    parser.add_argument('--check_ends', dest='check_ends', action='store_true')
    parser.set_defaults(check_ends=False)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

        # only doing the standard codon table for now
        start_codons = ['ATG', 'GTG', 'TTG']
        stop_codons = ['TAG', 'TAA', 'TGA']

        ## add sequence residues from external FASTA file if the user passed one
        if args.fasta is not None:
            biocodeutils.add_assembly_fasta(assemblies, args.fasta)

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    ## the locus tag, when present, overrides the mRNA ID
                    export_id = mRNA.id
                    if mRNA.locus_tag is not None:
                        export_id = mRNA.locus_tag

                    ## use the first gene product name found, if there is one
                    export_header = None
                    for polypeptide in mRNA.polypeptides():
                        annotation = polypeptide.annotation
                        if annotation is not None and annotation.product_name is not None:
                            export_header = annotation.product_name
                            break

                    fout.write(">{0}".format(export_id))
                    if export_header is not None:
                        fout.write(" {0}\n".format(export_header))
                    else:
                        fout.write("\n")

                    coding_seq = mRNA.get_CDS_residues(for_translation=True)

                    if args.check_ends:
                        # check the starting codon
                        start_codon = coding_seq[0:3].upper()
                        if start_codon not in start_codons:
                            sys.stderr.write("WARN: Non-canonical start codon ({0}) in mRNA {1}\n".format(start_codon, mRNA.id))

                        # check the final codon
                        stop_codon = coding_seq[-3:].upper()
                        if stop_codon not in stop_codons:
                            sys.stderr.write("WARN: Non-canonical stop codon ({0}) in mRNA {1}\n".format(stop_codon, mRNA.id))

                    if args.type == 'cds':
                        fout.write("{0}\n".format(biocodeutils.wrapped_fasta(coding_seq)))
                    else:
                        translated_seq = biocodeutils.translate(coding_seq)
                        fout.write("{0}\n".format(biocodeutils.wrapped_fasta(translated_seq)))
    finally:
        # close (and flush) the FASTA file we opened; never close STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    """Select a random set of transcripts following a CDS-count profile.

    The hard-coded ``profile`` maps a CDS count to the percentage of the
    requested ``--count`` that should have that many CDS features.  Each bin
    is filled by random sampling; whatever the truncated bin targets leave
    unfilled is drawn at random from a reservoir of all remaining mRNAs.
    IDs are written to ``--output_file`` and, optionally, the leftovers to
    ``--not_included``.
    """
    parser = argparse.ArgumentParser(
        description='Generates a set of transcripts based on a user-defined exon-complexity profile')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-o', '--output_file', type=str, required=True,
                        help='Output ID list file to create')
    parser.add_argument('-ni', '--not_included', type=str, required=False,
                        help='Writes the ID list of genes not included')
    parser.add_argument('-c', '--count', type=int, required=True,
                        help='Count of transcripts to pull')
    parser.add_argument('-e', '--exclude', type=str, required=False,
                        help='List of IDs to exclude')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # a set gives O(1) membership tests for every mRNA checked below
    exclude = set()
    if args.exclude is not None:
        with open(args.exclude) as exclude_fh:
            for line in exclude_fh:
                exclude.add(line.rstrip())

    # profile for 99-892
    #profile = { 1:12.2, 2:13.2, 3:15.4, 4:14.1, 5:11.5, 6:8.77, 7:6.99, 8:4.74, 9:3.81, 10:2.45 }
    # profile for 99-880
    #profile = { 1:19.7, 2:17.9, 3:20.6, 4:15.6, 5:10.1, 6:6.62, 7:4.14, 8:1.58, 9:2.07 }
    # c. hominis TU502
    #profile = { 1:89.7, 2:6.8, 3:2.6, 4:0.78 }
    # c. baileyi TAMU 10GZ1
    #profile = { 1:85.6 , 2:9.85 , 3:4.06 , 4:0.25 , 5:0.25 }
    profile = {1: 85.6, 2: 9.85, 3: 4.06}

    mRNAs = dict()
    unselected_mRNAs = dict()
    target = dict()
    selected = dict()

    for CDS_count in profile:
        target[CDS_count] = math.trunc(args.count * (profile[CDS_count] / 100))
        selected[CDS_count] = list()
        mRNAs[CDS_count] = list()
        unselected_mRNAs[CDS_count] = list()

    # fill the bins of each target size, then fill a reservoir to select any additional ones from
    reservoir = list()
    total_mRNAs_selected = 0

    for assembly_id in assemblies:
        for gene in assemblies[assembly_id].genes():
            for mRNA in gene.mRNAs():
                if mRNA.id in exclude:
                    continue

                CDS_count = mRNA.CDS_count()

                if CDS_count not in profile:
                    reservoir.append(mRNA)
                    continue

                mRNAs[CDS_count].append(mRNA)

    for CDS_count in profile:
        # make sure this many were found
        if target[CDS_count] <= len(mRNAs[CDS_count]):
            selected[CDS_count] = random.sample(mRNAs[CDS_count], target[CDS_count])
        else:
            print("WARN: Not enough mRNAs of length {0} to meet profile request".format(CDS_count))
            selected[CDS_count] = mRNAs[CDS_count]

        total_mRNAs_selected += len(selected[CDS_count])

        # whatever was not picked for this bin goes back into the reservoir
        # (input order is kept so the reservoir order is reproducible)
        picked = set(selected[CDS_count])
        unselected_mRNAs[CDS_count] = [m for m in mRNAs[CDS_count] if m not in picked]
        reservoir.extend(unselected_mRNAs[CDS_count])

    # now, from the rounding portions fill the rest randomly
    remaining = max(args.count - total_mRNAs_selected, 0)
    if remaining > len(reservoir):
        # random.sample() raises ValueError when asked for more than exists
        print("WARN: Only {0} mRNAs are left to fill the remaining {1} requested".format(
            len(reservoir), remaining))
        remaining = len(reservoir)

    with open(args.output_file, 'wt') as ofh:
        print("INFO: selected CDS profile:")
        for ccount in sorted(selected):
            gathered = len(selected[ccount])
            # a bin target truncates to 0 for small --count values
            if target[ccount]:
                target_perc = gathered / target[ccount]
            else:
                target_perc = 'n/a'

            print("CDS_count:{0}, target:{3}, gathered:{1}, unselected:{4}, target_perc:{2}".format(
                ccount, gathered, target_perc, target[ccount],
                len(unselected_mRNAs[ccount])))

            # NOTE(review): bin selections are written by parent (gene) ID
            # while the reservoir picks below are written by mRNA ID; this
            # mirrors the original output -- confirm which one is intended.
            for mRNA in selected[ccount]:
                ofh.write("{0}\n".format(mRNA.parent.id))

        print("Total selected according to profile: {0}".format(total_mRNAs_selected))

        sample_from_reservoir = random.sample(reservoir, remaining)
        sample_ids_from_reservoir = [mRNA.id for mRNA in sample_from_reservoir]

        ofh.write("\n".join(sample_ids_from_reservoir))
        ofh.write("\n")

    sampled = set(sample_from_reservoir)
    reservoir = [mRNA for mRNA in reservoir if mRNA not in sampled]

    total_mRNAs_selected += len(sample_from_reservoir)
    print("Total selected randomly afterwards: {0}".format(len(sample_from_reservoir)))

    if args.not_included is not None:
        with open(args.not_included, 'wt') as ni_ofh:
            for mRNA in reservoir:
                ni_ofh.write("{0}\n".format(mRNA.parent.id))
class _KeyRegistry:
    """Insertion-ordered collection of unique keys with O(1) membership tests."""

    def __init__(self):
        self.ordered = []
        self._seen = set()

    def add(self, key):
        """Record *key*; return True only the first time it is seen."""
        if key in self._seen:
            return False
        self._seen.add(key)
        self.ordered.append(key)
        return True

    def __len__(self):
        return len(self.ordered)


def _select_features(mrna, feature_type):
    """Return the exons ("Exon") or CDS features ("CDS") of an mRNA."""
    if feature_type == "Exon":
        return mrna.exons()
    return mrna.CDSs()


def _tally_annotation(asm_id, assembly, genes, feature_type,
                      gene_keys, feature_keys, gene_spans):
    """Register the genes of one assembly and their exon/CDS features.

    For every gene seen for the first time, a span string
    'asm_id:gene_id:start:stop:strand' is appended to *gene_spans*; start and
    stop are the outermost feature coordinates, or the gene's own coordinates
    when it has no features.  Returns the set of feature keys of this assembly.
    """
    assembly_features = set()
    for gene in genes:
        gene_loc = gene.location_on(assembly)
        first_seen = gene_keys.add(cordinate(asm_id, gene_loc))

        starts = []
        stops = []
        for mrna in sorted(gene.mRNAs()):
            for feat in sorted(_select_features(mrna, feature_type)):
                feat_loc = feat.location_on(assembly)
                key = cordinate(asm_id, feat_loc)
                assembly_features.add(key)
                feature_keys.add(key)
                starts.append(feat_loc.fmin)
                stops.append(feat_loc.fmax)

        if starts:
            span_start, span_stop = min(starts), max(stops)
        else:
            span_start, span_stop = gene_loc.fmin, gene_loc.fmax

        if first_seen:
            gene_spans.append(asm_id + ":" + gene.id + ":" + str(span_start) +
                              ":" + str(span_stop) + ":" + str(gene_loc.strand))
    return assembly_features


def _gene_feature_keys(asm_id, assembly, gene, feature_type):
    """Return the set of exon/CDS keys over all mRNAs of one gene."""
    return {cordinate(asm_id, feat.location_on(assembly))
            for mrna in gene.mRNAs()
            for feat in _select_features(mrna, feature_type)}


def _ratio_percent(part, whole):
    """Return part/whole as a percentage, or 0.0 when *whole* is zero."""
    return (part / whole) * 100 if whole else 0.0


def process_files(args):
    """Compare a predicted annotation with a known one at the Exon or CDS level.

    Reads ``args.annotation_1`` (known) and ``args.annotation_2`` (predicted)
    as GFF3 and compares genes, ``args.feature`` features ("Exon" or "CDS")
    and bases.  Results are printed and written to ``summary.txt``,
    ``final_stats.txt``, ``true_predicted_genes.txt`` and ``true_pred.txt``
    inside ``args.output_dir``.

    Exits via ``sys.exit`` when the output directory is missing or the
    feature type is not supported.
    """
    # Validate before any output is attempted.
    if not os.path.exists(args.output_dir):
        sys.exit("Directory does not exist.")
    if args.feature not in ("Exon", "CDS"):
        # previously this surfaced as a NameError deep inside the loops
        sys.exit("Feature type must be 'Exon' or 'CDS'.")

    (assemblies_1, features_1) = biocodegff.get_gff3_features(args.annotation_1)
    (assemblies_2, features_2) = biocodegff.get_gff3_features(args.annotation_2)

    a_gene, p_gene = _KeyRegistry(), _KeyRegistry()      # known / predicted genes
    a_exons, p_exons = _KeyRegistry(), _KeyRegistry()    # known / predicted features
    a_cds, p_cds = [], []    # one span string per unique known / predicted gene
    exon_pred_all = set()    # feature keys present in both annotations
    gene_true = set()        # predicted genes whose feature set matches a known gene
    true_pairs = []          # (known gene id, predicted gene id) for each true gene
    shared_asm_ids = set()   # assemblies present in both files

    for asm_id in assemblies_1:
        assembly_1 = assemblies_1[asm_id]
        assembly_2 = assemblies_2.get(asm_id)
        genes_1 = sorted(assembly_1.genes())
        anno_exons = _tally_annotation(asm_id, assembly_1, genes_1, args.feature,
                                       a_gene, a_exons, a_cds)

        ## If the chromosome is not found in the predicted file, move on.
        if assembly_2 is None:
            continue

        genes_2 = sorted(assembly_2.genes())
        shared_asm_ids.add(asm_id)
        pred_exons = _tally_annotation(asm_id, assembly_2, genes_2, args.feature,
                                       p_gene, p_exons, p_cds)
        exon_pred_all.update(pred_exons & anno_exons)

        ## From the predicted genes determine the true ones.
        for gene_2 in genes_2:
            gene_2_loc = gene_2.location_on(assembly_2)
            cord_g = cordinate(asm_id, gene_2_loc)
            if cord_g in gene_true:
                continue

            # NOTE(review): these sets are deliberately NOT reset per known
            # gene, matching the original behaviour: features accumulate over
            # every overlapping known gene examined so far -- confirm intent.
            ex_mrna1 = set()
            ex_mrna2 = set()
            for gene_1 in genes_1:
                gene_1_loc = gene_1.location_on(assembly_1)
                if gene_1_loc.strand != gene_2_loc.strand:
                    continue
                if gene_2.overlaps_with(gene_1):
                    ex_mrna2.update(_gene_feature_keys(asm_id, assembly_2, gene_2, args.feature))
                    ex_mrna1.update(_gene_feature_keys(asm_id, assembly_1, gene_1, args.feature))
                    # identical feature sets <=> the union is as large as each set
                    if ex_mrna1 == ex_mrna2:
                        gene_true.add(cord_g)
                        true_pairs.append((gene_1.id, gene_2.id))
                        break

    ## Predicted-only chromosomes still count towards the predicted totals.
    for asm_id in assemblies_2:
        if asm_id not in shared_asm_ids:
            assembly_2 = assemblies_2[asm_id]
            _tally_annotation(asm_id, assembly_2, sorted(assembly_2.genes()),
                              args.feature, p_gene, p_exons, p_cds)

    with open(args.output_dir + '/true_predicted_genes.txt', 'w') as true_file:
        true_file.write("Known\tPredicted\n")
        for known_id, predicted_id in true_pairs:
            true_file.write(known_id + "\t" + predicted_id + "\n")

    #Calculate SN/SP for bases
    (a_base_val, p_base_val, true_base) = base_comparison(p_exons.ordered, a_exons.ordered)
    base_sn = _ratio_percent(true_base, a_base_val)
    base_sp = _ratio_percent(true_base, p_base_val)

    #Calculate SN/SP for exons
    annotated_exon = len(a_exons)
    predicted_exon = len(p_exons)
    true_pred_exon = len(exon_pred_all)
    exon_sn = _ratio_percent(true_pred_exon, annotated_exon)
    exon_sp = _ratio_percent(true_pred_exon, predicted_exon)

    #Calculate SN/SP for genes
    annotated_gene = len(a_gene)
    predicted_gene = len(p_gene)
    true_pred_gene = len(gene_true)
    gene_sn = _ratio_percent(true_pred_gene, annotated_gene)
    gene_sp = _ratio_percent(true_pred_gene, predicted_gene)

    header = "Feature\tKnown\tPredicted\tTrue_Predicted\tSN\tPPV\n"
    rows = [
        ("Gene", annotated_gene, predicted_gene, true_pred_gene, gene_sn, gene_sp),
        (args.feature, annotated_exon, predicted_exon, true_pred_exon, exon_sn, exon_sp),
        ("Base", a_base_val, p_base_val, true_base, base_sn, base_sp),
    ]
    row_text = ["\t".join(str(value) for value in row) for row in rows]

    print(header)
    for text in row_text:
        print(text)

    with open(args.output_dir + '/summary.txt', 'w') as fout:
        fout.write(header)
        fout.write("\n".join(row_text) + "\n\n")

    arr_pred = compare_cds(p_cds, a_cds, "pred")
    arr_known = compare_cds(a_cds, p_cds, "known")
    arr_pred_same = compare_cds(p_cds, p_cds, "pred_same")

    new_gene = arr_pred[2]
    gene_merge = arr_pred[3]
    gene_found = arr_pred[0]
    gene_opp = arr_pred[1]
    gene_missing = arr_known[2]
    gene = arr_known[0]
    gene_opp_known = arr_known[1]
    gene_split = arr_known[3]
    gene_pred_overlap_opp = arr_pred_same[1]

    # (label printed to STDOUT, label written to final_stats.txt, value);
    # the two label sets differ slightly and are both kept verbatim
    stats = [
        ("1. No. of known gene : ",
         "1. No. of known gene : ", len(a_cds)),
        ("2. No. of predicted gene : ",
         "2. No. of predicted gene : ", len(p_cds)),
        ("3. No. of predicted gene overlapping 0 known gene (new gene): ",
         "3. No. of predicted gene overlapping 0 known gene (new gene): ", new_gene),
        ("4. No. of predicted gene overlapping > 1 known gene (gene merge) : ",
         "4. No. of predicted gene overlapping > 1 known gene (gene merge) : ", gene_merge),
        ("5. No. of predicted gene overlaping 1 known gene : ",
         "5. No. of predicted gene overlaping 1 known gene : ", gene_found),
        ("6. No. of predicted gene overlapping >= 1 known gene in opp strand : ",
         "6. No. of predicted gene overlapping >= 1 known gene in opp strand : ", gene_opp),
        ("7. No. of predicted gene overlapping 1 known gene (exact intron/exon boundaries) : ",
         "7. No. of predicted gene overlapping 1 known gene (exact intron/exon boundary) : ", true_pred_gene),
        ("8. No. of predicted gene overlapping >= 1 predicted gene in opp strand : ",
         "8. No. of predicted gene overlapping >= 1 predicted gene in opp strand : ", gene_pred_overlap_opp),
        ("9. No. of known gene overlapping 0 predicted gene (gene missing): ",
         "9. No. of known gene overlapping 0 predicted gene (gene missing): ", gene_missing),
        ("10. No. of known gene overlapping > 1 predicted gene(gene split) : ",
         "10. No. of known gene overlapping > 1 predicted gene (gene_split): ", gene_split),
        ("11. No. of known gene overlaping 1 predicted gene : ",
         "11. No. of known gene overlaping 1 predicted gene : ", gene),
        ("12. No. of known gene overlapping >= 1 predicted gene in opp strand : ",
         "12. No. of known gene overlapping >= 1 predicted gene in opp strand : ", gene_opp_known),
    ]

    for print_label, _file_label, value in stats:
        print(print_label, value)

    with open(args.output_dir + '/final_stats.txt', 'w') as fout:
        for _print_label, file_label, value in stats:
            fout.write(file_label + str(value) + "\n")

    with open(args.output_dir + '/true_pred.txt', 'w') as fout_true:
        for true_gene in gene_true:
            fout_true.write(true_gene + "\n")

    #Clean up the intermediate BED files, if they were left in the output directory
    delete_file = ['exon_1.bed', 'exon_2.bed', 'exon_1_merged.bed',
                   'exon_2_merged.bed', 'exon_1_2_intersect.bed']
    for f in delete_file:
        bed_path = args.output_dir + "/" + f
        if os.path.exists(bed_path):
            os.remove(bed_path)
def main():
    """Parse evidence files and write a consensus functional annotation.

    Command-line options select which evidence lists (HMM, BLAST/RAPSEARCH2
    against SWISS-PROT, TrEMBL, KEGG, UniRef100, TMHMM) are parsed into the
    polypeptide collection built from --input_fasta.  The result is written
    in the requested --format to --output_file, or to STDOUT when no output
    file is given.

    Raises:
        Exception: if an evidence list is passed without the SQLite3
            database (or FASTA file) it depends on, or if
            --export_organism_names is used without --output_file.
    """
    parser = argparse.ArgumentParser( description='Parses multiple sources of evidence to generate a consensus functional annotation')

    ## input evidence, supporting databases and output options
    parser.add_argument('-f', '--input_fasta', type=str, required=True, help='Protein FASTA file of source molecules' )
    parser.add_argument('-m', '--hmm_htab_list', type=str, required=False, help='List of htab files from hmmpfam3' )
    parser.add_argument('-bs', '--blast_sprot_btab_list', type=str, required=False, help='List of btab files from BLAST against UniProtKB/SWISS-PROT' )
    parser.add_argument('-rs', '--rapsearch_sprot_btab_list', type=str, required=False, help='List of m8 files from RAPSEARCH2 against UniProtKB/SWISS-PROT' )
    parser.add_argument('-bt', '--blast_trembl_btab_list', type=str, required=False, help='List of btab files from BLAST against UniProtKB/Trembl' )
    parser.add_argument('-bk', '--blast_kegg_btab_list', type=str, required=False, help='List of btab files from BLAST against KEGG' )
    parser.add_argument('-bu100', '--blast_uniref100_btab_list', type=str, required=False, help='List of btab files from BLAST against UniRef100' )
    parser.add_argument('-ru100', '--rapsearch_uniref100_btab_list', type=str, required=False, help='List of m8 files from RAPSEARCH2 against UniRef100' )
    parser.add_argument('-u100f', '--uniref100_fasta', type=str, required=False, help='Only required if also passing RAPSEARCH2 against UniRef100 evidence' )
    parser.add_argument('-tm', '--tmhmm_raw_list', type=str, required=False, help='List of raw files from a tmhmm search' )
    parser.add_argument('-d', '--hmm_db', type=str, required=False, help='SQLite3 db with HMM information' )
    parser.add_argument('-u', '--uniprot_sprot_db', type=str, required=False, help='SQLite3 db with UNIPROT/SWISSPROT information' )
    parser.add_argument('-ur', '--uniref_db', type=str, required=False, help='SQLite3 db with UNIREF information' )
    parser.add_argument('-a', '--format', type=str, required=False, default='tab', help='Output format. Current options are: "tab", "fasta", "gff3"' )
    parser.add_argument('-s', '--source_gff', type=str, required=False, help='Source GFF file from which proteins were derived. Required if you want to export any format other than tab-delimited.' )
    parser.add_argument('-e', '--blast_eval_cutoff', type=float, required=False, default=1e-5, help='Skip BLAST hits unless they have an E-value at least as low as this' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional output file path (else STDOUT)' )
    parser.add_argument('-r', '--organism_table', type=str, required=False, help='Optional table with counts of organism frequency based on top BLAST match for each protein' )
    parser.add_argument('-g', '--genomic_fasta', type=str, required=False, help='If passed, the genomic FASTA sequence will be included in the exported GFF3')
    parser.add_argument('-eon', '--export_organism_names', help='If passed, includes organism names from top BLAST hit into 9th column when available. Mostly useful for metagenomic samples.', action='store_true')
    args = parser.parse_args()

    check_arguments(args)

    # If --rapsearch_uniref100_btab_list passed, --uniref100_fasta is required
    if args.rapsearch_uniref100_btab_list is not None:
        if args.uniref100_fasta is None:
            raise Exception("ERROR: --uniref100_fasta required if --rapsearch_uniref100_btab_list is passed")

    # Keyed on polypeptide ID (from the FASTA, which is actually the mRNA gff feature ID), the
    #  values here are the organism name for the top BLAST match of each
    polypeptide_blast_org = dict()

    # The sources log is only needed while evidence is parsed, so it is closed
    #  as soon as that phase finishes (or fails).
    with open("{0}.sources.log".format(args.output_file), 'wt') as sources_log_fh:
        # this is a dict of biothings.Polypeptide objects
        polypeptides = initialize_polypeptides( sources_log_fh, args.input_fasta )

        # get source structural annotation, if necessary:
        if args.source_gff is not None:
            print("INFO: parsing source GFF")
            (assemblies, features) = biocodegff.get_gff3_features( args.source_gff )

        if args.hmm_htab_list is not None:
            # Validate BEFORE connecting: sqlite3.connect(None) raises a
            #  TypeError that would hide this more helpful message.
            if args.hmm_db is None:
                raise Exception("ERROR: You specified HMM results but not the db with the -d option")

            # connection to the HMM-associated SQLite3 database
            hmm_db_conn = sqlite3.connect(args.hmm_db)
            hmm_db_curs = hmm_db_conn.cursor()

            print("INFO: parsing HMM evidence")
            parse_hmm_evidence( sources_log_fh, polypeptides, args.hmm_htab_list, hmm_db_curs )
            hmm_db_curs.close()
            hmm_db_conn.close()

        if args.blast_sprot_btab_list is not None:
            if args.uniprot_sprot_db is None:
                raise Exception("ERROR: You specified BLAST evidence vs UnitProt/SwissProt results but not the db with the -u option")

            # connection to the UniProt_Sprot SQLite3 database
            usp_db_conn = sqlite3.connect(args.uniprot_sprot_db)
            usp_db_curs = usp_db_conn.cursor()
            print("INFO: parsing BLAST (SWISS-PROT) evidence")
            parse_sprot_blast_evidence( sources_log_fh, polypeptides, polypeptide_blast_org, args.blast_sprot_btab_list, usp_db_curs, args.blast_eval_cutoff, 'blast' )
            usp_db_curs.close()
            usp_db_conn.close()

        if args.rapsearch_sprot_btab_list is not None:
            if args.uniprot_sprot_db is None:
                raise Exception("ERROR: You specified RAPSEARCH2 evidence vs UnitProt/SwissProt results but not the db with the -u option")

            # connection to the UniProt_Sprot SQLite3 database
            usp_db_conn = sqlite3.connect(args.uniprot_sprot_db)
            usp_db_curs = usp_db_conn.cursor()
            print("INFO: parsing RAPSEARCH2 (SWISS-PROT) evidence")
            parse_sprot_blast_evidence( sources_log_fh, polypeptides, polypeptide_blast_org, args.rapsearch_sprot_btab_list, usp_db_curs, args.blast_eval_cutoff, 'rapsearch2' )
            usp_db_curs.close()
            usp_db_conn.close()

        if args.blast_trembl_btab_list is not None:
            print("INFO: parsing BLAST (TrEMBL) evidence")
            parse_trembl_blast_evidence(polypeptides, args.blast_trembl_btab_list, args.blast_eval_cutoff)

        if args.blast_kegg_btab_list is not None:
            print("INFO: parsing BLAST (KEGG) evidence")
            parse_kegg_blast_evidence(sources_log_fh, polypeptides, args.blast_kegg_btab_list, args.blast_eval_cutoff)

        if args.blast_uniref100_btab_list is not None:
            # Same guard as the other database-backed evidence types.
            if args.uniref_db is None:
                raise Exception("ERROR: You specified BLAST evidence vs UniRef100 results but not the db with the -ur option")

            print("INFO: parsing BLAST (UniRef100) evidence")
            # connection to the UniRef SQLite3 database
            uniref_db_conn = sqlite3.connect(args.uniref_db)
            uniref_db_curs = uniref_db_conn.cursor()
            parse_uniref100_blast_evidence(sources_log_fh, polypeptides, args.blast_uniref100_btab_list, uniref_db_curs, args.blast_eval_cutoff, 'blast', args.uniref100_fasta)
            uniref_db_curs.close()
            uniref_db_conn.close()

        if args.rapsearch_uniref100_btab_list is not None:
            if args.uniref_db is None:
                raise Exception("ERROR: You specified RAPSEARCH2 evidence vs UniRef100 results but not the db with the -ur option")

            print("INFO: parsing RAPSEARCH2 (UniRef100) evidence")
            # connection to the UniRef SQLite3 database
            uniref_db_conn = sqlite3.connect(args.uniref_db)
            uniref_db_curs = uniref_db_conn.cursor()
            parse_uniref100_blast_evidence(sources_log_fh, polypeptides, args.rapsearch_uniref100_btab_list, uniref_db_curs, args.blast_eval_cutoff, 'rapsearch2', args.uniref100_fasta)
            uniref_db_curs.close()
            uniref_db_conn.close()

        if args.tmhmm_raw_list is not None:
            print("INFO: parsing TMHMM evidence")
            parse_tmhmm_evidence(sources_log_fh, polypeptides, args.tmhmm_raw_list)

    ## output will either be a file or STDOUT
    print("INFO: writing output")
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        if args.format == 'tab':
            write_tab_results(fout, polypeptides)
        elif args.format == 'fasta':
            write_fasta_results(fout, polypeptides)
        elif args.format == 'gff3':
            write_gff3_results(fout, polypeptides, assemblies, features, args.genomic_fasta)
    finally:
        # Never close STDOUT: anything printed after this point would fail.
        if fout is not sys.stdout:
            fout.close()

    ## There isn't a method in biocodegff3 to add arbitrary key=value pairs.  So we have to cheat here.
    if args.export_organism_names:
        if args.output_file:
            append_organism_names_to_gff(args.output_file, polypeptide_blast_org)
        else:
            raise Exception("ERROR: an --output_file must be specified when using the --export_organism_names option.")

    if args.organism_table is not None:
        create_organism_table(args.organism_table, polypeptide_blast_org)
def main():
    """Select training gene IDs from agreement between evidence sources.

    Gene models are read from GeneMark-ES, CEGMA and AAT GFF3 files (plus
    optional expression alignments).  Genes whose structure agrees between
    sources are written to per-comparison ID files in the current working
    directory, then recruited into the final training ID list in order of
    evidence strength via recruit_training_genes(), honouring --max_genes.
    The final list is written to --output_file, one ID per line.
    """
    parser = argparse.ArgumentParser( description='Selects training gene IDs from agreement between GeneMark-ES, CEGMA, AAT and optional expression evidence')

    ## input evidence files and output options
    parser.add_argument('-g', '--genemark', type=str, required=True, help='Path to the results from GeneMark-ES' )
    parser.add_argument('-c', '--cegma', type=str, required=True, help='Path to the results from CEGMA, converted to GFF3' )
    parser.add_argument('-a', '--aat', type=str, required=True, help='Path to the results from AAT, converted to GFF3' )
    parser.add_argument('-e', '--expression', type=str, required=False, help='Any expression data aligned using GMAP (in gff3_gene mode)' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-m', '--max_genes', type=int, required=False, help='Limits gene IDs exported to the top N by strongest evidence class' )
    args = parser.parse_args()

    print("INFO: parsing Genemark-ES data")
    (assemblies, gm_es_features) = biocodegff.get_gff3_features( args.genemark )
    gm_es_genes = get_genes_from_dict(gm_es_features)
    print("\tINFO: Got {0} Genemark-ES genes".format(len(gm_es_genes)))

    # Later files are parsed onto the same assemblies collection.
    print("INFO: parsing CEGMA data")
    (assemblies, cegma_features) = biocodegff.get_gff3_features( args.cegma, assemblies=assemblies )
    cegma_genes = get_genes_from_dict(cegma_features)
    print("\tINFO: Got {0} CEGMA genes".format(len(cegma_genes)))

    print("INFO: parsing AAT results")
    (assemblies, aat_features) = biocodegff.get_gff3_features( args.aat, assemblies=assemblies)
    aat_genes = get_genes_from_dict(aat_features)
    print("\tINFO: Got {0} AAT 'genes'".format(len(aat_genes)))

    expression_genes = list()
    if args.expression is not None:
        print("INFO: parsing expression results")
        (assemblies, expression_features) = biocodegff.get_gff3_features( args.expression, assemblies=assemblies)
        expression_genes = get_genes_from_dict(expression_features)
        print("\tINFO: Got {0} expression 'genes'".format(len(expression_genes)))

    #############################################################################
    ## GeneMark-ES genes with identical coordinates and exon structure in CEGMA
    genemark_cegma_shared_genes = list()

    with open('gmes_cegma.shared.ids', 'wt') as gmes_cegma_fh:
        for gm_es_gene in gm_es_genes:
            for cegma_gene in cegma_genes:
                if gm_es_gene.has_same_coordinates_as( thing=cegma_gene ) and \
                   gm_es_gene.shares_exon_structure_with( thing=cegma_gene ):
                    genemark_cegma_shared_genes.append(gm_es_gene)
                    gmes_cegma_fh.write("{0}\n".format(gm_es_gene.id))
                    break

    print("\n{0} genes were shared perfectly between Genemark-ES and CEGMA".format(len(genemark_cegma_shared_genes)) )

    #############################################################################
    ## ... of those, the ones whose CDS structure also matches expression data
    genemark_cegma_expression_shared_genes = list()

    with open('gmes_cegma_exp.shared.ids', 'wt') as gmes_cegma_exp_fh:
        for gm_es_gene in genemark_cegma_shared_genes:
            for exp_gene in expression_genes:
                if gm_es_gene.shares_CDS_structure_with( exp_gene ):
                    genemark_cegma_expression_shared_genes.append(gm_es_gene)
                    # This file used to be created but left empty; record the
                    #  IDs like every other comparison does.
                    gmes_cegma_exp_fh.write("{0}\n".format(gm_es_gene.id))
                    break

    print("{0} genes were shared perfectly between Genemark-ES and CEGMA and expression data".format(len(genemark_cegma_expression_shared_genes)) )

    #############################################################################
    ## GeneMark-ES genes with an AAT match (stop-codon tolerant comparison)
    genemark_aat_shared_genes = list()

    with open('gmes_aat.shared.ids', 'wt') as gmes_aat_fh:
        for gm_es_gene in gm_es_genes:
            for aat_gene in aat_genes:
                if gm_es_gene.shares_exon_structure_with( thing=aat_gene, stop_tolerant=True ):
                    genemark_aat_shared_genes.append(gm_es_gene)
                    gmes_aat_fh.write("{0}\n".format(gm_es_gene.id))
                    break

    print("{0} Genemark-ES genes had an exact AAT match".format(len(genemark_aat_shared_genes)) )

    ##############################################################################
    ## CEGMA genes matching GeneMark-ES; those also matched by AAT are kept apart
    cegma_matching_gm_es = list()
    genemark_aat_cegma_shared_genes = list()

    with open('gmes_aat_cegma.shared.ids', 'wt') as gmes_aat_cegma_fh:
        for cegma_gene in cegma_genes:
            match_found = False

            for gm_es_gene in gm_es_genes:
                if cegma_gene.has_same_coordinates_as( thing=gm_es_gene ) and \
                   cegma_gene.shares_exon_structure_with( thing=gm_es_gene ):
                    match_found = True

                    if gm_es_gene in genemark_aat_shared_genes and gm_es_gene not in genemark_aat_cegma_shared_genes:
                        genemark_aat_cegma_shared_genes.append(gm_es_gene)
                        gmes_aat_cegma_fh.write("{0}\n".format(gm_es_gene.id))

                    # the first structural match settles this CEGMA gene
                    break

            if match_found:
                cegma_matching_gm_es.append(cegma_gene)

    print("{0} genes with GeneMark-ES, CEGMA and AAT agreement".format(len(genemark_aat_cegma_shared_genes)) )

    with open('training_gene.ids', 'wt') as training_fh:
        for gene in genemark_aat_cegma_shared_genes:
            training_fh.write("{0}\n".format(gene.id) )

    ##############################################################################
    ## CEGMA genes without a GeneMark-ES match but with an exact AAT match
    cegma_with_aat_not_gm_es = list()

    with open('cegma_aat_nogmes.shared.ids', 'wt') as cegma_aat_nogmes_fh:
        for cegma_gene in cegma_genes:
            if cegma_gene in cegma_matching_gm_es:
                continue

            for aat_gene in aat_genes:
                if cegma_gene.shares_exon_structure_with( thing=aat_gene ):
                    cegma_with_aat_not_gm_es.append(cegma_gene)
                    cegma_aat_nogmes_fh.write("{0}\n".format(cegma_gene.id))
                    break

    print("{0} CEGMA genes had no GeneMark-ES match but did have an AAT one".format(len(cegma_with_aat_not_gm_es)) )

    ##############################################################################
    ## now to assemble the results
    training_ids = list()

    # 0. Start with genes shared between GeneMark-ES, CEGMA and expression evidence
    recruit_training_genes( training_ids, genemark_cegma_expression_shared_genes, args.max_genes )
    print("DEBUG: {0} genes after recruitment of GeneMark-ES, CEGMA and expression data".format(len(training_ids)))

    # 1. Pull in the genes with shared evidence across GeneMark-ES, CEGMA and AAT
    recruit_training_genes( training_ids, genemark_aat_cegma_shared_genes, args.max_genes )
    print("DEBUG: {0} genes after recruitment of GeneMark-ES, CEGMA and AAT".format(len(training_ids)))

    # 2. Next include the CEGMA + AAT genes lacking a GeneMark-ES match
    recruit_training_genes( training_ids, cegma_with_aat_not_gm_es, args.max_genes )
    print("DEBUG: {0} genes after recruitment of CEGMA + AAT without GM-ES".format(len(training_ids)))

    # 3. Then the GeneMark-ES + CEGMA agreements
    recruit_training_genes( training_ids, genemark_cegma_shared_genes, args.max_genes )
    print("DEBUG: {0} genes after recruitment of GeneMark-ES + CEGMA".format(len(training_ids)))

    # 4. And finally the GeneMark-ES + AAT agreements
    recruit_training_genes( training_ids, genemark_aat_shared_genes, args.max_genes )
    print("DEBUG: {0} genes after recruitment of GeneMark-ES + AAT".format(len(training_ids)))

    with open(args.output_file, 'wt') as output_list_fh:
        for training_id in training_ids:
            output_list_fh.write("{0}\n".format(training_id))
def main():
    """Report count and length statistics for the features of a GFF3 file.

    Walks every assembly, gene, mRNA, exon and CDS fragment parsed from
    --input_file and writes tab-delimited counts, summed lengths and means,
    followed by a profile of how many mRNAs have each CDS-fragment count.
    Output goes to --output_file, or to STDOUT when none is given.
    """
    parser = argparse.ArgumentParser(
        description=
        'Reports count and length statistics for the assemblies, genes, mRNAs, exons and CDS fragments of a GFF3 file.'
    )

    ## input GFF3 and optional output file
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    def mean(total, count):
        # A GFF3 lacking a feature type has a zero count; report 0.0
        # instead of dying with ZeroDivisionError.
        return total / count if count else 0.0

    type_counts = defaultdict(int)
    type_lengths = defaultdict(int)
    assembly_lengths_found = False

    # key is number of CDS fragments, value is number of mRNAs with that many
    CDS_profile = defaultdict(int)

    for assembly_id in assemblies:
        assembly = assemblies[assembly_id]
        type_counts['assembly'] += 1

        # length is None when the assembly has no length recorded
        if assembly.length is not None:
            type_lengths['assembly'] += assembly.length
            assembly_lengths_found = True

        for gene in assembly.genes():
            type_counts['gene'] += 1
            type_lengths['gene'] += gene.length

            for mRNA in gene.mRNAs():
                type_counts['mRNA'] += 1
                type_lengths['mRNA'] += mRNA.length
                CDS_profile[mRNA.CDS_count()] += 1

                for exon in mRNA.exons():
                    type_counts['exon'] += 1
                    type_lengths['exon'] += exon.length

                for CDS in mRNA.CDSs():
                    type_counts['CDS fragments'] += 1
                    type_lengths['CDS fragments'] += CDS.length

    gene_length_mean = mean(type_lengths['gene'], type_counts['gene'])
    mRNA_length_mean = mean(type_lengths['mRNA'], type_counts['mRNA'])
    exon_length_mean = mean(type_lengths['exon'], type_counts['exon'])
    CDS_length_mean = mean(type_lengths['CDS fragments'],
                           type_counts['CDS fragments'])

    mRNAs_per_gene_mean = mean(type_counts['mRNA'], type_counts['gene'])
    exons_per_mRNA_mean = mean(type_counts['exon'], type_counts['mRNA'])
    CDS_per_mRNA_mean = mean(type_counts['CDS fragments'],
                             type_counts['mRNA'])

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("Assembly count\t{0}\n".format(type_counts['assembly']))

        if assembly_lengths_found:
            ofh.write("Assembly length\t{0}\n".format(
                type_lengths['assembly']))
        else:
            ofh.write("Assembly length\tN/A (no FASTA data in GFF?)\n")

        ofh.write("\nGene count\t{0}\n".format(type_counts['gene']))
        ofh.write("Gene length (mean)\t{0:.1f}\n".format(gene_length_mean))
        ofh.write("Gene length (sum)\t{0}\n".format(type_lengths['gene']))

        ofh.write("\nmRNA count\t{0}\n".format(type_counts['mRNA']))
        ofh.write("mRNA length (mean)\t{0:.1f}\n".format(mRNA_length_mean))
        ofh.write("mRNA length (sum)\t{0}\n".format(type_lengths['mRNA']))
        ofh.write("mRNAs per gene (mean)\t{:.1f}\n".format(
            mRNAs_per_gene_mean))

        ofh.write("\nexon count\t{0}\n".format(type_counts['exon']))
        ofh.write("exon length (mean)\t{0:.1f}\n".format(exon_length_mean))
        ofh.write("exon length (sum)\t{0}\n".format(type_lengths['exon']))
        ofh.write("exons per mRNA (mean)\t{:.1f}\n".format(
            exons_per_mRNA_mean))

        ofh.write("\nCDS count\t{0}\n".format(type_counts['CDS fragments']))
        ofh.write("CDS length (mean)\t{0:.1f}\n".format(CDS_length_mean))
        ofh.write("CDS fragment length (sum)\t{0}\n".format(
            type_lengths['CDS fragments']))
        ofh.write("CDS per mRNA (mean)\t{:.1f}\n".format(CDS_per_mRNA_mean))

        ofh.write(
            "\n# CDS fragment composition profile: count<tab>percentage\n")
        for cds_count in sorted(CDS_profile):
            perc = mean(CDS_profile[cds_count], type_counts['mRNA']) * 100
            # An explicit 'g' presentation keeps three significant digits but
            # prints 100.0 as "100"; a bare ".3" renders it as "1e+02".
            ofh.write("mRNAs with {0} CDS\t{1}\t{2:.3g}\n".format(
                cds_count, CDS_profile[cds_count], perc))
    finally:
        # Only close handles this function opened; STDOUT stays usable.
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Compare the gene models of a reference and a query GFF3 file.

    A reference gene and a query gene count as a match when they have the
    same coordinates and share both exon and CDS structure.  Matching ID
    pairs are written to "<output_base>.matches" and summary counts to
    "<output_base>.summary"; match totals are also printed.
    """
    parser = argparse.ArgumentParser(
        description='Basic comparison of two GFF3 files')

    ## input GFF3 files and output base name
    parser.add_argument('-r',
                        '--ref',
                        type=str,
                        required=True,
                        help='Path to the reference GFF3 file')
    parser.add_argument('-q',
                        '--qry',
                        type=str,
                        required=True,
                        help='Path to the query GFF3 file')
    parser.add_argument(
        '-o',
        '--output_base',
        type=str,
        required=True,
        help='Base name/path of the output files to be created')
    args = parser.parse_args()

    (assemblies, ref_features) = biocodegff.get_gff3_features(args.ref)
    ref_genes = get_genes_from_dict(ref_features)

    # The query file is parsed onto the assemblies collection of the reference.
    (assemblies, qry_features) = biocodegff.get_gff3_features(
        args.qry, assemblies=assemblies)
    qry_genes = get_genes_from_dict(qry_features)

    # gene ID -> ID of the matching gene in the other annotation
    ref_matches_found = dict()
    qry_matches_found = dict()

    for ref_gene in ref_genes:
        for qry_gene in qry_genes:
            if ref_gene.has_same_coordinates_as( thing=qry_gene ) and \
               ref_gene.shares_exon_structure_with( thing=qry_gene ) and \
               ref_gene.shares_CDS_structure_with( thing=qry_gene ):
                ref_matches_found[ref_gene.id] = qry_gene.id
                qry_matches_found[qry_gene.id] = ref_gene.id

    print("INFO: {0}/{1} reference genes had a match to a qry gene".format(
        len(ref_matches_found), len(ref_genes)))
    print("INFO: {0}/{1} qry genes had a match to a reference gene".format(
        len(qry_matches_found), len(qry_genes)))

    # write our output files; 'with' guarantees they are flushed and closed
    with open("{0}.matches".format(args.output_base), 'wt') as out_matches:
        for ref_gene_id in ref_matches_found:
            out_matches.write("{0}\t{1}\n".format(
                ref_gene_id, ref_matches_found[ref_gene_id]))

    with open("{0}.summary".format(args.output_base), 'wt') as out_summary:
        out_summary.write("Reference\t{0}\n".format(args.ref))
        # The query line must name the query file (it used to repeat args.ref).
        out_summary.write("Query\t{0}\n".format(args.qry))
        out_summary.write(
            "Total identical models (with respect to reference)\t{0}\n".format(
                len(ref_matches_found)))
        out_summary.write("Models in REF not in QRY\t{0}\n".format(
            len(ref_genes) - len(ref_matches_found)))
        out_summary.write("Models in QRY not in REF\t{0}\n".format(
            len(qry_genes) - len(qry_matches_found)))
def main():
    """Report count and length statistics for the features of a GFF3 file.

    Walks every assembly, gene, mRNA, exon and CDS fragment parsed from
    --input_file and writes tab-delimited counts, summed lengths and means,
    followed by a profile of how many mRNAs have each CDS-fragment count.
    Output goes to --output_file, or to STDOUT when none is given.
    """
    parser = argparse.ArgumentParser( description='Reports count and length statistics for the assemblies, genes, mRNAs, exons and CDS fragments of a GFF3 file.')

    ## input GFF3 and optional output file
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output file to be created' )
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features( args.input_file )

    def mean(total, count):
        # A GFF3 lacking a feature type has a zero count; report 0.0
        # instead of dying with ZeroDivisionError.
        return total / count if count else 0.0

    type_counts = defaultdict(int)
    type_lengths = defaultdict(int)
    assembly_lengths_found = False

    # key is number of CDS fragments, value is number of mRNAs with that many
    CDS_profile = defaultdict(int)

    for assembly_id in assemblies:
        assembly = assemblies[assembly_id]
        type_counts['assembly'] += 1

        # length is None when the assembly has no length recorded
        if assembly.length is not None:
            type_lengths['assembly'] += assembly.length
            assembly_lengths_found = True

        for gene in assembly.genes():
            type_counts['gene'] += 1
            type_lengths['gene'] += gene.length

            for mRNA in gene.mRNAs():
                type_counts['mRNA'] += 1
                type_lengths['mRNA'] += mRNA.length
                CDS_profile[mRNA.CDS_count()] += 1

                for exon in mRNA.exons():
                    type_counts['exon'] += 1
                    type_lengths['exon'] += exon.length

                for CDS in mRNA.CDSs():
                    type_counts['CDS fragments'] += 1
                    type_lengths['CDS fragments'] += CDS.length

    gene_length_mean = mean(type_lengths['gene'], type_counts['gene'])
    mRNA_length_mean = mean(type_lengths['mRNA'], type_counts['mRNA'])
    exon_length_mean = mean(type_lengths['exon'], type_counts['exon'])
    CDS_length_mean = mean(type_lengths['CDS fragments'], type_counts['CDS fragments'])

    mRNAs_per_gene_mean = mean(type_counts['mRNA'], type_counts['gene'])
    exons_per_mRNA_mean = mean(type_counts['exon'], type_counts['mRNA'])
    CDS_per_mRNA_mean = mean(type_counts['CDS fragments'], type_counts['mRNA'])

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("Assembly count\t{0}\n".format(type_counts['assembly']))

        if assembly_lengths_found:
            ofh.write("Assembly length\t{0}\n".format(type_lengths['assembly']))
        else:
            ofh.write("Assembly length\tN/A (no FASTA data in GFF?)\n")

        ofh.write("\nGene count\t{0}\n".format(type_counts['gene']))
        ofh.write("Gene length (mean)\t{0:.1f}\n".format(gene_length_mean))
        ofh.write("Gene length (sum)\t{0}\n".format(type_lengths['gene']))

        ofh.write("\nmRNA count\t{0}\n".format(type_counts['mRNA']))
        ofh.write("mRNA length (mean)\t{0:.1f}\n".format(mRNA_length_mean))
        ofh.write("mRNA length (sum)\t{0}\n".format(type_lengths['mRNA']))
        ofh.write("mRNAs per gene (mean)\t{:.1f}\n".format(mRNAs_per_gene_mean) )

        ofh.write("\nexon count\t{0}\n".format(type_counts['exon']))
        ofh.write("exon length (mean)\t{0:.1f}\n".format(exon_length_mean))
        ofh.write("exon length (sum)\t{0}\n".format(type_lengths['exon']))
        ofh.write("exons per mRNA (mean)\t{:.1f}\n".format(exons_per_mRNA_mean) )

        ofh.write("\nCDS count\t{0}\n".format(type_counts['CDS fragments']))
        ofh.write("CDS length (mean)\t{0:.1f}\n".format(CDS_length_mean))
        ofh.write("CDS fragment length (sum)\t{0}\n".format(type_lengths['CDS fragments']))
        ofh.write("CDS per mRNA (mean)\t{:.1f}\n".format(CDS_per_mRNA_mean) )

        ofh.write("\n# CDS fragment composition profile: count<tab>percentage\n")
        for cds_count in sorted(CDS_profile):
            perc = mean(CDS_profile[cds_count], type_counts['mRNA']) * 100
            # An explicit 'g' presentation keeps three significant digits but
            # prints 100.0 as "100"; a bare ".3" renders it as "1e+02".
            ofh.write("mRNAs with {0} CDS\t{1}\t{2:.3g}\n".format(cds_count, CDS_profile[cds_count], perc) )
    finally:
        # Only close handles this function opened; STDOUT stays usable.
        if ofh is not sys.stdout:
            ofh.close()
def _selected_features(mrna, feature_type):
    """Return the exon or CDS children of ``mrna`` named by ``feature_type``."""
    if feature_type == "Exon":
        return mrna.exons()
    if feature_type == "CDS":
        return mrna.CDSs()
    raise ValueError(
        "Unsupported feature type '{0}' (expected 'Exon' or 'CDS')".format(
            feature_type))


def _feature_cords(asm_id, assembly, gene, feature_type):
    """Return the set of coordinate keys of all selected features of ``gene``."""
    cords = set()
    for mrna in sorted(gene.mRNAs()):
        for feat in sorted(_selected_features(mrna, feature_type)):
            cords.add(cordinate(asm_id, feat.location_on(assembly)))
    return cords


def _tally_genes(asm_id, assembly, genes, feature_type, tally, asm_exons=None):
    """Add the unique genes and exon/CDS features of ``genes`` to ``tally``.

    ``tally`` is a dict with keys 'gene_seen' and 'exon_seen' (sets used for
    de-duplication), 'exons' (unique feature keys in first-seen order) and
    'cds' (one "asm:gene_id:fmin:fmax:strand" string per unique gene).  When
    ``asm_exons`` is given, every feature key of this assembly is added to it.
    """
    for gene in genes:
        gene_loc = gene.location_on(assembly)
        gene_cord = cordinate(asm_id, gene_loc)
        starts = []
        stops = []

        for mrna in sorted(gene.mRNAs()):
            for feat in sorted(_selected_features(mrna, feature_type)):
                feat_loc = feat.location_on(assembly)
                cord = cordinate(asm_id, feat_loc)
                if cord not in tally['exon_seen']:
                    tally['exon_seen'].add(cord)
                    tally['exons'].append(cord)
                if asm_exons is not None:
                    asm_exons.add(cord)
                starts.append(feat_loc.fmin)
                stops.append(feat_loc.fmax)

        # Genes with identical coordinates are only counted once.
        if gene_cord in tally['gene_seen']:
            continue
        tally['gene_seen'].add(gene_cord)

        # Span of the selected features, or of the gene when it has none.
        if starts:
            span_min, span_max = min(starts), max(stops)
        else:
            span_min, span_max = gene_loc.fmin, gene_loc.fmax
        tally['cds'].append(asm_id + ":" + gene.id + ":" + str(span_min) +
                            ":" + str(span_max) + ":" + str(gene_loc.strand))


def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole`` (0.0 when ``whole`` is 0)."""
    return (part / whole) * 100 if whole else 0.0


def process_files(args):
    """Compare a predicted annotation against a known one and report accuracy.

    Reads args.annotation_1 (known) and args.annotation_2 (predicted),
    counts unique genes and args.feature features ("Exon" or "CDS"), finds
    predicted genes whose feature set equals that of an overlapping
    same-strand known gene, and reports SN/PPV at gene, feature and base
    level.  Results are printed and written to summary.txt, final_stats.txt,
    true_predicted_genes.txt and true_pred.txt under args.output_dir.

    Exits via sys.exit() when args.output_dir does not exist.
    """
    # Check first: every output file below is created inside this directory.
    if not os.path.exists(args.output_dir):
        sys.exit("Directory does not exist.")

    (assemblies_1, _) = biocodegff.get_gff3_features(args.annotation_1)
    (assemblies_2, _) = biocodegff.get_gff3_features(args.annotation_2)

    known = {'gene_seen': set(), 'cds': [], 'exon_seen': set(), 'exons': []}
    predicted = {'gene_seen': set(), 'cds': [], 'exon_seen': set(), 'exons': []}

    exon_pred_all = set()    # feature keys present in both annotations
    gene_true = set()        # coordinate keys of exactly predicted genes
    shared_asm_ids = set()   # assemblies present in both annotations

    true_pred_file = os.path.join(args.output_dir, 'true_predicted_genes.txt')
    with open(true_pred_file, 'w') as true_file:
        true_file.write("Known\tPredicted\n")

        for asm_id in assemblies_1:  ## each chromosome of the known annotation
            assembly_1 = assemblies_1[asm_id]
            genes_1 = sorted(assembly_1.genes())
            anno_exons = set()
            _tally_genes(asm_id, assembly_1, genes_1, args.feature, known,
                         anno_exons)

            ## If the chromosome is not in the predicted file, move on.
            if asm_id not in assemblies_2:
                continue

            assembly_2 = assemblies_2[asm_id]
            genes_2 = sorted(assembly_2.genes())
            shared_asm_ids.add(asm_id)
            pred_exons = set()
            _tally_genes(asm_id, assembly_2, genes_2, args.feature, predicted,
                         pred_exons)

            exon_pred_all.update(pred_exons & anno_exons)  # true exons

            ## From the predicted genes determine the true ones.
            for gene_2 in genes_2:
                gene_2_loc = gene_2.location_on(assembly_2)
                cord_g = cordinate(asm_id, gene_2_loc)

                ## Prevent duplicates among the truly predicted genes.
                if cord_g in gene_true:
                    continue

                pred_cords = None  # built on the first overlapping known gene
                for gene_1 in genes_1:
                    if gene_1.location_on(assembly_1).strand != gene_2_loc.strand:
                        continue
                    if not gene_2.overlaps_with(gene_1):
                        continue

                    if pred_cords is None:
                        pred_cords = _feature_cords(asm_id, assembly_2, gene_2,
                                                    args.feature)
                    known_cords = _feature_cords(asm_id, assembly_1, gene_1,
                                                 args.feature)

                    # Identical feature sets mean an exact structural match.
                    if known_cords == pred_cords:
                        gene_true.add(cord_g)
                        true_file.write(gene_1.id + "\t" + gene_2.id + "\n")
                        break

    ## Chromosomes that only exist in the predicted annotation still count
    ## towards the predicted totals.
    for asm_id in assemblies_2:
        if asm_id in shared_asm_ids:
            continue
        assembly_2 = assemblies_2[asm_id]
        _tally_genes(asm_id, assembly_2, sorted(assembly_2.genes()),
                     args.feature, predicted)

    a_exons = known['exons']
    p_exons = predicted['exons']
    a_cds = known['cds']
    p_cds = predicted['cds']

    #Calculate SN/SP for bases
    (a_base_val, p_base_val, true_base) = base_comparison(p_exons, a_exons)
    base_sn = _percent(true_base, a_base_val)
    base_sp = _percent(true_base, p_base_val)

    #Calculate SN/SP for exons
    annotated_exon = len(a_exons)
    predicted_exon = len(p_exons)
    true_pred_exon = len(exon_pred_all)
    exon_sn = _percent(true_pred_exon, annotated_exon)
    exon_sp = _percent(true_pred_exon, predicted_exon)

    #Calculate SN/SP for genes (one 'cds' entry exists per unique gene)
    annotated_gene = len(a_cds)
    predicted_gene = len(p_cds)
    true_pred_gene = len(gene_true)
    gene_sn = _percent(true_pred_gene, annotated_gene)
    gene_sp = _percent(true_pred_gene, predicted_gene)

    summary_header = "Feature\tKnown\tPredicted\tTrue_Predicted\tSN\tPPV\n"
    summary_rows = [
        ("Gene", annotated_gene, predicted_gene, true_pred_gene, gene_sn, gene_sp),
        (args.feature, annotated_exon, predicted_exon, true_pred_exon, exon_sn, exon_sp),
        ("Base", a_base_val, p_base_val, true_base, base_sn, base_sp),
    ]
    summary_lines = ["\t".join(str(value) for value in row)
                     for row in summary_rows]

    print(summary_header)
    for line in summary_lines:
        print(line)

    with open(os.path.join(args.output_dir, 'summary.txt'), 'w') as fout:
        fout.write(summary_header)
        fout.write("\n".join(summary_lines) + "\n\n")

    arr_pred = compare_cds(p_cds, a_cds, "pred")
    arr_known = compare_cds(a_cds, p_cds, "known")
    arr_pred_same = compare_cds(p_cds, p_cds, "pred_same")

    new_gene = arr_pred[2]
    gene_merge = arr_pred[3]
    gene_found = arr_pred[0]
    gene_opp = arr_pred[1]
    gene_missing = arr_known[2]
    gene = arr_known[0]
    gene_opp_known = arr_known[1]
    gene_split = arr_known[3]
    gene_pred_overlap_opp = arr_pred_same[1]

    # (console label, file label or None when identical, value)
    stats = [
        ("1. No. of known gene : ", None, len(a_cds)),
        ("2. No. of predicted gene : ", None, len(p_cds)),
        ("3. No. of predicted gene overlapping  0 known gene (new gene): "
         .replace("  ", " "), None, new_gene),
        ("4. No. of predicted gene overlapping > 1 known gene (gene merge) : ",
         None, gene_merge),
        ("5. No. of predicted gene overlaping 1 known gene : ", None,
         gene_found),
        ("6. No. of predicted gene overlapping >= 1 known gene in opp strand : ",
         None, gene_opp),
        ("7. No. of predicted gene overlapping 1 known gene (exact intron/exon boundaries) : ",
         "7. No. of predicted gene overlapping 1 known gene (exact intron/exon boundary) : ",
         true_pred_gene),
        ("8. No. of predicted gene overlapping >= 1 predicted gene in opp strand : ",
         None, gene_pred_overlap_opp),
        ("9. No. of known gene overlapping 0 predicted gene (gene missing): ",
         None, gene_missing),
        ("10. No. of known gene overlapping > 1 predicted gene(gene split) : ",
         "10. No. of known gene overlapping > 1 predicted gene (gene_split): ",
         gene_split),
        ("11. No. of known gene overlaping 1 predicted gene : ", None, gene),
        ("12. No. of known gene overlapping >= 1 predicted gene in opp strand : ",
         None, gene_opp_known),
    ]

    for console_label, _, value in stats:
        print(console_label, value)

    with open(os.path.join(args.output_dir, 'final_stats.txt'), 'w') as fout:
        for console_label, file_label, value in stats:
            label = console_label if file_label is None else file_label
            fout.write(label + str(value) + "\n")

    with open(os.path.join(args.output_dir, 'true_pred.txt'), 'w') as fout_true:
        for true_gene in gene_true:
            fout_true.write(true_gene + "\n")

    #Clean up the intermediate BED files left in the output directory
    delete_file = [
        'exon_1.bed', 'exon_2.bed', 'exon_1_merged.bed', 'exon_2_merged.bed',
        'exon_1_2_intersect.bed'
    ]
    for f in delete_file:
        try:
            os.remove(os.path.join(args.output_dir, f))
        except OSError:
            # Best effort, like the shell "rm" it replaces: a missing
            # intermediate file must not abort the run.
            pass
def main():
    """Convert a GFF3 annotation into GenBank flat file output.

    Command-line options are parsed with argparse.  Every assembly returned by
    ``biocodegff.get_gff3_features`` is rendered as one GenBank record: the
    header template, a ``source`` feature, each gene (via
    ``biocodegenbank.print_biogene``) and, with ``--include_sequence``, the
    ORIGIN block.

    Output destination:
      * STDOUT by default,
      * ``--output_file`` when given without ``--output_dir``,
      * one ``<locus_id_prefix><assembly_id>.gbk`` file per assembly inside
        ``--output_dir`` (this takes precedence over ``--output_file``).

    Exits with status 1 if ``--output_dir`` is given but is not a directory.
    """
    parser = argparse.ArgumentParser(description='Converts GFF3 into a GenBank flat file')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input GFF3 file to be read')
    # The help text used to claim -o supersedes -od; the code (and its warning) do the opposite.
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to a Genbank flat file to be created. Ignored if --output_dir is also specified.')
    parser.add_argument('-od', '--output_dir', type=str, required=False,
                        help='Path to an output directory. If this option is specified then each input assembly will be written to a separate GenBank output file, named with the assembly_id.')
    parser.add_argument('-g', '--genome_fasta', type=str, required=False,
                        help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-mt', '--molecule_type', type=str, required=False, default='DNA',
                        help='Molecule type')
    parser.add_argument('-gbd', '--genbank_division', type=str, required=False, default='.',
                        help='GenBank Division (3-letter abbreviation)')
    parser.add_argument('-md', '--modification_date', type=str, required=False, default='DD-MMM-YYYY',
                        help='The modification date for header in format like 21-JUN-1999')
    parser.add_argument('-org', '--organism', type=str, required=False, default='.',
                        help='Full organism name (including strain)')
    parser.add_argument('-str', '--strain', type=str, required=False,
                        help="Only the strain designation, which is written to the FEATURES.source element")
    parser.add_argument('-d', '--definition', type=str, required=False, default='.',
                        help='Brief description of sequence; includes information such as source organism, gene name/protein name, or some description of the sequence\'s function.')
    parser.add_argument('-s', '--source', type=str, required=False, default='.',
                        help='Free-format information including an abbreviated form of the organism name, sometimes followed by a molecule type.')
    parser.add_argument('-t', '--taxon_id', type=int, required=False,
                        help='NCBI taxon ID, if known')
    parser.add_argument('-l', '--lineage', type=str, required=False, default='Unknown',
                        help='Semicolon-delimited lineage of the organism e.g., "Eukaryota; Alveolata; Apicomplexa; Aconoidasida; Piroplasmida; Theileriidae; Theileria"')
    parser.add_argument('-seq', '--include_sequence', action='store_true',
                        help='Include sequence (if present) in the output GenBank flat file(s).')
    parser.add_argument('-p', '--locus_id_prefix', required=False, default='',
                        help='Prefix to add to the GenBank LOCUS id in the output GenBank flat file(s).')
    args = parser.parse_args()

    # check that output directory exists
    if args.output_dir is not None and not os.path.isdir(args.output_dir):
        sys.stderr.write("FATAL: the specified output directory (" + args.output_dir + ") does not exist\n")
        # sys.exit rather than the interactive-only ``exit`` helper
        sys.exit(1)

    # line-wrap lineage to stay below 79 character GenBank flat file width
    lineage = biocodegenbank.line_wrap_lineage_string(args.lineage)

    (assemblies, _features) = biocodegff.get_gff3_features(args.input_file)

    def write_record(fh, assembly, locus_id):
        """Write the complete GenBank record for one assembly to *fh*."""
        context = {
            'locus': locus_id,
            'molecule_size': assembly.length,
            'molecule_type': args.molecule_type,
            'division': args.genbank_division,
            'modification_date': args.modification_date,
            'accession': '.',
            'version': '.',
            'source': args.source,
            'definition': args.definition,
            'organism': args.organism,
            'lineage': lineage,
        }
        header = TEMPLATE_ENVIRONMENT.get_template('genbank_flat_file_header.template').render(context)
        fh.write(header)

        # GenBank feature table layout: feature keys start in column 6,
        # locations and qualifiers in column 22.
        fh.write("\nFEATURES             Location/Qualifiers\n")
        fh.write("     source          1..{0}\n".format(assembly.length))
        fh.write("                     /organism=\"{0}\"\n".format(args.organism))
        fh.write("                     /mol_type=\"genomic DNA\"\n")

        if args.strain is not None:
            fh.write("                     /strain=\"{0}\"\n".format(args.strain))

        if args.taxon_id is not None:
            fh.write("                     /db_xref=\"taxon:{0}\"\n".format(args.taxon_id))

        for gene in assembly.genes():
            biocodegenbank.print_biogene(gene=gene, fh=fh, on=assembly)

        if args.include_sequence:
            fh.write("ORIGIN\n")
            biocodegenbank.print_sequence(seq=assembly.residues, fh=fh)

        fh.write("//\n")

    ofh = sys.stdout
    if args.output_file is not None:
        if args.output_dir is None:
            ofh = open(args.output_file, 'wt')
        else:
            sys.stderr.write("WARN: both -o/--output_file and -od/--output_dir were passed so the former will be ignored\n")

    try:
        # deal with the FASTA file if the user passed one
        if args.genome_fasta is not None:
            process_assembly_fasta(assemblies, args.genome_fasta)

        for assembly_id in assemblies:
            locus_id = args.locus_id_prefix + assembly_id
            assembly = assemblies[assembly_id]

            if args.output_dir is not None:
                # one file per assembly; the context manager closes it even on error
                ofn = args.output_dir + "/" + locus_id + ".gbk"
                with open(ofn, 'wt') as assembly_fh:
                    write_record(assembly_fh, assembly, locus_id)
            else:
                write_record(ofh, assembly, locus_id)
    finally:
        # Only close a handle this function opened; never close STDOUT.
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Report, per reference sequence, how many bases are covered by evidence.

    Reads the reference FASTA, validates the evidence file extensions and the
    ``FILE:TYPE`` form of ``--reference``, builds a per-base depth list for
    every FASTA entry, lets ``parse_pileup`` / ``parse_sam`` fill those lists
    from the evidence files, and finally writes one tab-delimited line per
    sequence: ``id <TAB> sequence length <TAB> bases with depth > 0``.

    Output goes to ``--output_file`` or, when omitted, to STDOUT (UTF-8).

    Raises:
        Exception: for an unsupported evidence extension, a ``--reference``
            value without ``:``, or a reference path not ending in ``.gff3``.
    """
    parser = argparse.ArgumentParser(description='Provides coverage information for features in a GFF3 file')

    parser.add_argument('evidence_files', metavar='N', type=str, nargs='+',
                        help='Path to one or more evidence files, separated by spaces')
    parser.add_argument('-r', '--reference', type=str, required=True,
                        help='Input path to the reference GFF3 file. So we know what feature type to report on, format should be like FILE:TYPE')
    parser.add_argument('-f', '--fasta', type=str, required=True,
                        help='Input path to the reference FASTA file.')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Optional path to an output file to be created, else prints on STDOUT')
    args = parser.parse_args()

    ## parse the fasta
    fasta = biocodeutils.fasta_dict_from_file(args.fasta)

    ## open the output file
    writing_to_file = args.output_file is not None
    if writing_to_file:
        fout = open(args.output_file, "w")
    else:
        fout = codecs.getwriter('utf8')(sys.stdout.buffer)

    try:
        ####################################################
        ## Sanity checks

        allowed_extensions = ['bed', 'gff3', 'pileup', 'sam']
        for ev_file in args.evidence_files:
            if not ev_file.endswith(tuple(allowed_extensions)):
                raise Exception("ERROR: Evidence file passed with unsupported file extension: {0}. Supported extensions are {1}".format(ev_file, allowed_extensions))

        ## The input file should be defined as $path:$feattype
        if ':' not in args.reference:
            raise Exception("ERROR: input_file must be like /path/to/some.gff3:mRNA")

        ref_file_parts = args.reference.split(':')
        print("DEBUG: part count: {0}".format(len(ref_file_parts)))

        if not ref_file_parts[0].endswith('.gff3'):
            raise Exception("ERROR: Expected input file (-i) to have a gff3 extension, got {0}".format(ref_file_parts[0]))

        # The parsed reference is not used below; the call still confirms the
        # file can be read as GFF3.
        (ref_assemblies, ref_features) = biocodegff.get_gff3_features(ref_file_parts[0])

        ####################################################
        ## Initialize the coverage arrays

        # one depth counter (initially 0) per base of every molecule
        fasta_cov = {seq_id: [0] * len(fasta[seq_id]['s']) for seq_id in fasta}

        ####################################################
        ## Now parse the evidence files

        for ev_file in args.evidence_files:
            if ev_file.endswith('pileup'):
                parse_pileup(fasta_cov, ev_file)
            elif ev_file.endswith('sam'):
                parse_sam(fasta_cov, ev_file)
            else:
                print("INFO: ignoring evidence file {0} because code to handle its file type isn't currently implemented".format(ev_file))

        for seq_id in fasta_cov:
            # Count positions whose depth is positive.  The previous loop used
            # each depth VALUE as an index back into the list, which miscounted
            # and could raise IndexError for depths beyond the sequence length.
            covered_bases = sum(1 for depth in fasta_cov[seq_id] if depth > 0)
            fout.write("{0}\t{1}\t{2}\n".format(seq_id, len(fasta[seq_id]['s']), covered_bases))
    finally:
        if writing_to_file:
            fout.close()
        else:
            # leave STDOUT open, but make sure the wrapped writer is drained
            fout.flush()
def main():
    """Report mRNAs whose CDS lacks a terminal stop and scan for the next one.

    For every mRNA in the input GFF3 the CDS residues are translated.  When the
    translation does not end in ``*`` the genomic sequence is walked one codon
    at a time, starting at the 3' end of the CDS and moving away from it, until
    a stop codon is seen or the walk passes the mRNA boundary.  Progress and
    the final position are printed; totals are printed at the end.

    NOTE(review): ``--output_gff`` is accepted but nothing in this function
    writes a GFF3 file.
    """
    parser = argparse.ArgumentParser(
        description='Checks the CDS features against a genome sequence to report/correct phase columns.')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-g', '--genome_fasta', type=str, required=False,
                        help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-o', '--output_gff', type=str, required=False,
                        help='Optional. Writes an output GFF3 file with CDS (and containing features) extended to nearest stop')
    args = parser.parse_args()

    (assemblies, _features) = biocodegff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ['TAG', 'TAA', 'TGA']
    # used to read reverse-strand codons in the orientation of the transcript
    complement = str.maketrans('ACGTacgt', 'TGCAtgca')

    for assembly_id in assemblies:
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id, assembly.length))

        for gene in assembly.genes():
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = biocodeutils.translate(coding_seq)

                if translation.endswith('*'):
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(mRNA.id, gene.id))
                mRNA_loc = mRNA.location_on(assembly)
                CDSs = sorted(mRNA.CDSs())

                if not CDSs:
                    # nothing to extend from; previously this raised IndexError
                    print("\tmRNA {0} has no CDS features, so it cannot be extended".format(mRNA.id))
                    continue

                if mRNA_loc.strand == 1:
                    codon_step_size = 3
                    CDS_pos = CDSs[-1].location_on(assembly).fmax
                    mRNA_limit = mRNA_loc.fmax
                else:
                    codon_step_size = -3
                    CDS_pos = CDSs[0].location_on(assembly).fmin
                    mRNA_limit = mRNA_loc.fmin

                print("\tmRNA:{0}-{1}, CDS end: {2}\n\tExtending".format(
                    mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos), end='')

                new_stop_found = False

                # We have to step backwards to start if on the reverse strand
                if codon_step_size < 0:
                    CDS_pos += codon_step_size

                while True:
                    past_limit = (CDS_pos < mRNA_limit) if codon_step_size < 0 else (CDS_pos > mRNA_limit)
                    if past_limit:
                        print(" Reached the mRNA limit")
                        break

                    next_codon = assembly.residues[CDS_pos:CDS_pos + 3]
                    if codon_step_size < 0:
                        # The residues are forward-strand; a reverse-strand stop
                        # only matches TAG/TAA/TGA after reverse-complementing.
                        next_codon = next_codon[::-1].translate(complement)

                    print(".{0}({1})".format(next_codon, CDS_pos), end='')

                    if next_codon in stop_codons:
                        new_stop_found = True
                        print(" Found a stop")
                        break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                else:
                    print("\tCDS_pos: SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with terminal stops: {0}".format(mRNAs_with_terminal_stops))
def main():
    """Self-test of the biothings coordinate comparison operators.

    Loads ``biothings_coordinate_comparisons.data`` from the directory holding
    this script and exercises ``<``, ``>``, ``<=``, ``>=``, ``overlaps_with()``
    and ``overlap_size_with()`` on known features.  Each check prints one
    ``INFO:`` line when it behaves as expected and one ``ERROR:`` line
    otherwise; nothing is returned or raised on a failed check.
    """
    bin_dir = os.path.abspath(os.path.dirname(__file__))
    test_gff_file = bin_dir + '/biothings_coordinate_comparisons.data'

    (_assemblies, features) = biocodegff.get_gff3_features(test_gff_file)

    def report(passed, ok_message, error_message):
        """Print *ok_message* if the check passed, else *error_message*."""
        print(ok_message if passed else error_message)

    gene_10 = features['TP03_0010']
    gene_12 = features['TP03_0012']
    polypeptide = features['TP03_0012.t01_polypeptide']

    ###########################################################################################
    report(gene_10 < polypeptide,
           "INFO: < positive check successful",
           "ERROR: < check unsuccessful")
    report(not (gene_12 < polypeptide),
           "INFO: < negative check successful",
           "ERROR: < check unsuccessful")

    ###########################################################################################
    report(gene_12 > gene_10,
           "INFO: > positive check successful",
           "ERROR: > check unsuccessful")
    report(not (gene_10 > polypeptide),
           "INFO: > negative check successful",
           "ERROR: > check unsuccessful")

    ###########################################################################################
    report(features['TP03_0012.t01_exon-auto15079'] <= polypeptide,
           "INFO: <= positive check successful",
           "ERROR: <= check unsuccessful")
    report(not (gene_10 <= gene_12),
           "INFO: <= negative check successful",
           "ERROR: <= check unsuccessful")

    ###########################################################################################
    report(features['TP03_0012.t01_exon-auto15085'] >= polypeptide,
           "INFO: >= positive check successful",
           "ERROR: >= check unsuccessful")
    report(not (gene_10 >= gene_12),
           "INFO: >= negative check successful",
           "ERROR: >= check unsuccessful")

    ###########################################################################################
    report(features['TP03_0012.t01_exon-auto15079'].overlaps_with(polypeptide),
           "INFO: overlaps_with() positive check successful",
           "ERROR: overlaps_with() positive check unsuccessful")
    report(not features['TP03_0002'].overlaps_with(gene_10),
           "INFO: overlaps_with() negative check successful",
           "ERROR: overlaps_with() negative check unsuccessful")

    ###########################################################################################
    overlap_size = polypeptide.overlap_size_with(features['TP03_0012.t01_CDS-auto15079'])
    report(overlap_size == 224,
           "INFO: overlap_size_with() positive check successful",
           "ERROR: overlap_size_with() positive check unsuccessful (overlap returned: {0})".format(overlap_size))

    # The INFO/ERROR prefixes of this last pair used to be swapped, so a
    # passing negative check was reported as an ERROR (and vice versa).
    other_overlap = polypeptide.overlap_size_with(features['TP03_0012.t01_CDS-auto15085'])
    report(other_overlap != 224,
           "INFO: overlap_size_with() negative check successful",
           "ERROR: overlap_size_with() negative check unsuccessful")
class _UniqueList:
    """Insertion-ordered collection that ignores items it already holds."""

    def __init__(self):
        self.items = []
        self._seen = set()

    def add(self, item):
        # set lookup keeps this O(1) instead of scanning the list each time
        if item not in self._seen:
            self._seen.add(item)
            self.items.append(item)

    def __len__(self):
        return len(self.items)

    def __iter__(self):
        return iter(self.items)


def _coord_key(asm_id, loc):
    """Return the (molecule, fmin, fmax, strand) key used to identify a feature."""
    return (asm_id, loc.fmin, loc.fmax, str(loc.strand))


def _tally_assembly(asm_id, assembly, genes, mrnas, exons):
    """Add the gene/mRNA/exon keys of one assembly to the tallies; return its exon keys."""
    assembly_exons = set()
    for gene in sorted(assembly.genes()):
        genes.add(_coord_key(asm_id, gene.location_on(assembly)))
        for mrna in sorted(gene.mRNAs()):
            mrnas.add(_coord_key(asm_id, mrna.location_on(assembly)))
            for exon in sorted(mrna.exons()):
                exon_key = _coord_key(asm_id, exon.location_on(assembly))
                exons.add(exon_key)
                assembly_exons.add(exon_key)
    return assembly_exons


def _write_exon_bed(path, exon_keys):
    """Write the exon keys to *path* as 6-column BED lines (name 'exon', score 0)."""
    with open(path, 'w') as bed_fh:
        for chrom, start, stop, strand in exon_keys:
            # strand "1" is forward; everything else is written as reverse
            sign = "+" if strand == str(1) else "-"
            bed_fh.write(chrom + "\t" + str(start) + "\t" + str(stop) + "\texon\t" + str(0) + "\t" + sign + "\n")


def _run_bedtools(arguments, out_path):
    """Run ``bedtools`` with *arguments*, writing its stdout to *out_path*; exit on failure."""
    import subprocess

    command = ['bedtools'] + arguments
    print(" ".join(command) + " >" + out_path)
    with open(out_path, 'w') as out_fh:
        try:
            # an argument list avoids shell quoting problems with odd paths
            status = subprocess.call(command, stdout=out_fh)
        except OSError as err:
            sys.exit("Unable to run bedtools: {0}".format(err))
    if status != 0:
        sys.exit("bedtools exited with status {0}".format(status))


def _sum_bed_column(path, value_of):
    """Sum ``value_of(columns)`` over the tab-split lines of the file at *path*."""
    total = 0
    with open(path, 'r') as bed_fh:
        for line in bed_fh:
            total += value_of(line.split("\t"))
    return total


def _percent(numerator, denominator):
    """Return numerator/denominator as a percentage, or 0.0 when the denominator is 0."""
    if denominator == 0:
        return 0.0
    return (numerator / denominator) * 100


def _count_overlapping(query, targets):
    """Count same-molecule, same-strand entries of *targets* that overlap *query*.

    The scan stops at the first same-molecule, same-strand target starting
    beyond the query's end, which relies on *targets* being in coordinate order
    within each molecule (they are appended from ``sorted()`` gene lists).
    """
    q_chrom, q_start, q_stop, q_strand = query
    hits = 0
    for t_chrom, t_start, t_stop, t_strand in targets:
        if t_chrom != q_chrom or t_strand != q_strand:
            continue
        if t_start > q_stop:
            break
        if t_start <= q_stop and q_start <= t_stop:
            hits += 1
    return hits


def _summary_row(label, known, predicted, true_predicted, sensitivity, ppv):
    """Format one tab-delimited row of the SN/PPV summary table (no newline)."""
    return "\t".join([label, str(known), str(predicted), str(true_predicted), str(sensitivity), str(ppv)])


def process_files(args):
    """Compare a predicted annotation against a known one and report SN/PPV.

    ``args.annotation_1`` is the known GFF3, ``args.annotation_2`` the
    predicted GFF3 and ``args.output_dir`` an existing directory that receives
    the intermediate BED files and ``summary.txt``.

    Features are identified by molecule, fmin, fmax and strand.  A predicted
    exon is true when the same key exists in the known annotation of that
    molecule; a predicted mRNA is true when all of its exons are true; a
    predicted gene is true when at least one of its mRNAs is true.  Base-level
    numbers come from ``bedtools merge`` / ``bedtools intersect`` run on the
    exon BED files.  The summary table is printed and written to
    ``summary.txt``; gene overlap classes (new, merged, split, missing, ...)
    are printed only.

    Exits via ``sys.exit`` if the output directory does not exist or bedtools
    cannot be run successfully.
    """
    # Check up front: this test used to run only after files had already been
    # written into the directory, so it could never catch a missing one.
    if not os.path.exists(args.output_dir):
        sys.exit("Directory does not exist.")

    (assemblies_1, _features_1) = biocodegff.get_gff3_features(args.annotation_1)
    (assemblies_2, _features_2) = biocodegff.get_gff3_features(args.annotation_2)

    # unique known (a_*) and predicted (p_*) features, in discovery order
    a_gene, a_mrna, a_exons = _UniqueList(), _UniqueList(), _UniqueList()
    p_gene, p_mrna, p_exons = _UniqueList(), _UniqueList(), _UniqueList()

    exon_pred_all = set()   # predicted exons that match a known exon
    mrna_true = set()
    gene_true = set()
    shared_molecules = set()

    for asm_id in assemblies_1:
        assembly_1 = assemblies_1[asm_id]
        anno_exons = _tally_assembly(asm_id, assembly_1, a_gene, a_mrna, a_exons)

        ## If the chromosome is not found in the predicted file, move to the next one.
        assembly_2 = assemblies_2.get(asm_id)
        if assembly_2 is None:
            continue
        shared_molecules.add(asm_id)

        pred_exons = _tally_assembly(asm_id, assembly_2, p_gene, p_mrna, p_exons)

        ## Identify true exons.
        exon_pred_all |= pred_exons & anno_exons

        ## From the predicted features determine the true ones.
        for gene_2 in sorted(assembly_2.genes()):
            gene_key = _coord_key(asm_id, gene_2.location_on(assembly_2))
            if gene_key in gene_true:
                continue  # a gene at these coordinates was already accepted

            has_true_mrna = False
            for mrna_2 in sorted(gene_2.mRNAs()):
                mrna_key = _coord_key(asm_id, mrna_2.location_on(assembly_2))
                if mrna_key in mrna_true:
                    continue
                mrna_exons = {_coord_key(asm_id, exon_2.location_on(assembly_2))
                              for exon_2 in mrna_2.exons()}
                # true only if every distinct exon of the mRNA is a true exon
                if mrna_exons <= exon_pred_all:
                    mrna_true.add(mrna_key)
                    has_true_mrna = True

            if has_true_mrna:
                gene_true.add(gene_key)

    ## Molecules present only in the predicted annotation still count as predicted features.
    for asm_id in assemblies_2:
        if asm_id not in shared_molecules:
            _tally_assembly(asm_id, assemblies_2[asm_id], p_gene, p_mrna, p_exons)

    exon2_bed = args.output_dir + '/exon_2.bed'
    _write_exon_bed(exon2_bed, p_exons)
    out2 = args.output_dir + '/exon_2_merged.bed'
    _run_bedtools(['merge', '-nms', '-scores', 'sum', '-i', exon2_bed, '-s'], out2)

    exon1_bed = args.output_dir + '/exon_1.bed'
    _write_exon_bed(exon1_bed, a_exons)
    out1 = args.output_dir + '/exon_1_merged.bed'
    _run_bedtools(['merge', '-nms', '-scores', 'sum', '-i', exon1_bed, '-s'], out1)

    out_intersect = args.output_dir + '/exon_1_2_intersect.bed'
    _run_bedtools(['intersect', '-s', '-wo', '-a', out1, '-b', out2], out_intersect)

    a_base = _sum_bed_column(out1, lambda cols: int(cols[2]) - int(cols[1]))
    p_base = _sum_bed_column(out2, lambda cols: int(cols[2]) - int(cols[1]))
    # column 13 of the intersect output is read as the overlap length
    true_base = _sum_bed_column(out_intersect, lambda cols: int(cols[12]))

    #Calculate SN/SP for bases
    base_sn = _percent(true_base, a_base)
    base_sp = _percent(true_base, p_base)

    #Calculate SN/SP for exons
    annotated_exon = len(a_exons)
    predicted_exon = len(p_exons)
    true_pred_exon = len(exon_pred_all)
    exon_sn = _percent(true_pred_exon, annotated_exon)
    exon_sp = _percent(true_pred_exon, predicted_exon)

    #Calculate SN/SP for transcript
    annotated_mrna = len(a_mrna)
    predicted_mrna = len(p_mrna)
    true_pred_mrna = len(mrna_true)
    mrna_sn = _percent(true_pred_mrna, annotated_mrna)
    mrna_sp = _percent(true_pred_mrna, predicted_mrna)

    #Calculate SN/SP for genes
    annotated_gene = len(a_gene)
    predicted_gene = len(p_gene)
    true_pred_gene = len(gene_true)
    gene_sn = _percent(true_pred_gene, annotated_gene)
    gene_sp = _percent(true_pred_gene, predicted_gene)

    gene_row = _summary_row("Gene", annotated_gene, predicted_gene, true_pred_gene, gene_sn, gene_sp)
    mrna_row = _summary_row("mRNA", annotated_mrna, predicted_mrna, true_pred_mrna, mrna_sn, mrna_sp)
    exon_row = _summary_row("Exon", annotated_exon, predicted_exon, true_pred_exon, exon_sn, exon_sp)
    base_row = _summary_row("Base", a_base, p_base, true_base, base_sn, base_sp)

    print("Feature\tKnown\tPredicted\tTrue_Predicted\tSN\tPPV\n")
    print(gene_row)
    print(mrna_row)
    print(exon_row)
    print(base_row)

    out_file = args.output_dir + '/summary.txt'
    # the handle was previously never closed
    with open(out_file, 'w') as fout:
        fout.write("Feature\tKnown\tPredicted\tTrue_Predicted\tSN\tPPV\n")
        fout.write(gene_row + "\n")
        fout.write(mrna_row + "\n")
        fout.write(exon_row + "\n")

    new_gene = 0
    gene_merge = 0
    gene_found = 0
    for gene2 in p_gene:
        overlaps = _count_overlapping(gene2, a_gene)
        if overlaps == 0:
            new_gene += 1
        elif overlaps == 1:
            gene_found += 1
        else:
            gene_merge += 1

    gene_split = 0
    gene_missing = 0
    gene = 0
    for gene1 in a_gene:
        overlaps = _count_overlapping(gene1, p_gene)
        if overlaps == 0:
            gene_missing += 1
        elif overlaps == 1:
            gene += 1
        else:
            gene_split += 1

    print ("No. of predicted gene overlapping 0 known gene (new gene): ",new_gene)
    print ("No. of predicted gene overlapping > 1 known gene: ",gene_merge)
    print ("No. of predicted gene overlaping 1 known gene : ",gene_found)
    print ("No. of known gene overlapping > 1 predicted gene : ",gene_split)
    print ("No. of known gene overlapping 1 predicted gene : ",gene)
    print ("No. of known gene overlapping 0 predicted gene (gene missing) : ",gene_missing)
def main():
    '''
    This script reports statistics on the areas of a genome where features aren't - introns
    and intergenic space.  Pass a valid GFF3 file (along with FASTA data) and get a report
    like this:

        Molecule count: 9
        Gene count: 4171
        Intergenic space count: 4061
        Average intergenic space distance: 361.7 bp
        Median intergenic space distance: 245 bp
        Minimum intergenic space distance: 0 bp
        Maximum intergenic space distance: 6272 bp
        Intron count: 10533
        Intron space count: 989024
        Average intron size: 93.9 bp
        Median intron size: 63 bp
        Minimum intron size: 2 bp
        Maximum intron size: 1676 bp

    Optionally, you can pass the path to a PNG file to be created using the --histogram
    parameter, which will generate a size distribution histogram with two overlaying plots -
    one representing the distribution of intergenic region sizes and the other the intron
    lengths.  Because these can often have long tails, you can limit both the Y- and X-axes
    values with the --ylimit and --xlimit options, respectively.

    FASTA:
    If your FASTA isn't embedded at the end of your GFF3 file after a ##FASTA directive you'll
    need to specify the --fasta option in this script and pass it as a separate file.

    Definitions:
    Intergenic space was a little ambiguous to me as I started writing this.  Does one count
    the space from the beginning of the contig until the first gene, or only between them?
    What about short contigs which have no annotated genes at all?  From the Sequence Ontology:

        SO:0000605: A region containing or overlapping no genes that is bounded on either
        side by a gene, or bounded by a gene and the end of the chromosome.

    To my reading, this includes contig ends but not gene-less contigs.  To that end, I include
    the former in intergenic space reporting but include the latter as a separate statistic.

    Raises an Exception if any assembly has an undefined or zero length.

    Author: Joshua Orvis (jorvis AT gmail)
    '''
    # The description used to be copied from a different script.
    parser = argparse.ArgumentParser(
        description='Reports statistics on the introns and intergenic space of a GFF3 annotation.')

    parser.add_argument('-i', '--input_gff3', type=str, required=True,
                        help='GFF3 file of a reference annotation')
    parser.add_argument('-g', '--histogram', type=str, required=False,
                        help='Optional path to a histogram of intron/intergenic space size distribution to be created (PNG)')
    parser.add_argument('-x', '--xlimit', type=int, required=False,
                        help='Use this if you want to limit the X-axis of the histogram (feature length)')
    parser.add_argument('-y', '--ylimit', type=int, required=False,
                        help='Use this if you want to limit the Y-axis of the histogram (feature count)')
    parser.add_argument('-f', '--fasta', type=str, required=False,
                        help='Required if you don\'t have GFF3 with embedded FASTA')
    args = parser.parse_args()

    (assemblies, _features) = biocodegff.get_gff3_features(args.input_gff3)

    if args.fasta is not None:
        seqs = biocodeutils.fasta_dict_from_file(args.fasta)
        for seq_id in seqs:
            if seq_id in assemblies:
                assemblies[seq_id].residues = seqs[seq_id]['s']
                assemblies[seq_id].length = len(assemblies[seq_id].residues)

    ## things to keep stats on and report
    total_molecule_count = len(assemblies)
    total_gene_count = 0

    ## the number of intergenic spaces is NOT just the total genes N - 1, since there can be
    #  multiple molecules, genes can overlap, etc.
    intergenic_distances = list()

    total_contig_residues = 0
    empty_contig_residues = 0

    intron_sizes = list()

    ############################
    ## Calculation section
    ############################

    for asm_id in assemblies:
        assembly = assemblies[asm_id]
        # assumes sorted() orders genes by start coordinate -- TODO confirm against biothings
        genes = sorted(assembly.genes())
        total_gene_count += len(genes)

        # we should have a length here
        if assembly.length is None or assembly.length == 0:
            raise Exception("ERROR: Detected assembly with undefined or 0 length: {0}".format(assembly.id))

        # Test THIS assembly's genes.  The cumulative gene count was tested
        # before, so only gene-less molecules seen before the first gene-bearing
        # one were ever counted as empty.
        if not genes:
            empty_contig_residues += assembly.length
            continue

        total_contig_residues += assembly.length

        # Right-most gene end seen so far on this molecule.  Tracking the maximum
        # (instead of the previous gene's end) keeps nested or overlapping genes
        # from producing gaps that are really covered by an earlier gene.
        furthest_fmax = None

        for gene in genes:
            gene_loc = gene.location_on(assembly)

            if furthest_fmax is None:
                # first gene: count the bases from the start of the molecule
                intergenic_distances.append(gene_loc.fmin)
                furthest_fmax = gene_loc.fmax
            else:
                # a gene starting inside already-covered sequence adds no gap
                if gene_loc.fmin >= furthest_fmax:
                    intergenic_distances.append(gene_loc.fmin - furthest_fmax)
                furthest_fmax = max(furthest_fmax, gene_loc.fmax)

            for mRNA in gene.mRNAs():
                for intron in sorted(mRNA.introns(on=assembly)):
                    intron_loc = intron.location_on(assembly)
                    intron_size = intron_loc.fmax - intron_loc.fmin

                    if intron_size < 0:
                        print("\tWARN: Intron size ({1}) < 0 reported in gene {0}".format(gene.id, intron_size))

                    intron_sizes.append(intron_size)

        # space between the last covered base and the end of the molecule
        intergenic_distances.append(assembly.length - furthest_fmax)

    intergenic_distances = sorted(intergenic_distances)
    total_intergenic_space_count = len(intergenic_distances)
    total_intergenic_space_residues = sum(intergenic_distances)

    intron_sizes = sorted(intron_sizes)
    total_intron_count = len(intron_sizes)
    total_intron_residues = sum(intron_sizes)

    ############################
    ## Reporting section
    ############################

    print("\nMolecule count: {0}".format(total_molecule_count))
    print("Gene count: {0}".format(total_gene_count))

    print("\nTotal molecule bases: {0} bp".format(total_contig_residues))
    print("Empty molecule bases: {0} bp".format(empty_contig_residues))

    if total_intergenic_space_count > 0:
        avg_intergenic_space_dist = total_intergenic_space_residues / total_intergenic_space_count
        median_int_space_dist = intergenic_distances[total_intergenic_space_count // 2]

        print("Intergenic space count: {0}".format(total_intergenic_space_count))
        print("Average intergenic space distance: {0:.1f} bp".format(avg_intergenic_space_dist))
        print("Median intergenic space distance: {0} bp".format(median_int_space_dist))
        print("Minimum intergenic space distance: {0} bp".format(intergenic_distances[0]))
        print("Maximum intergenic space distance: {0} bp\n".format(intergenic_distances[-1]))
    else:
        print("There were no intergenic spaces found. This might mean there were no molecules with at least 2 genes.")

    print("Intron count: {0}".format(total_intron_count))
    print("Intron space count: {0}".format(total_intron_residues))

    if total_intron_count > 0:
        avg_intron_size = total_intron_residues / total_intron_count
        median_intron_size = intron_sizes[total_intron_count // 2]

        print("Average intron size: {0:.1f} bp".format(avg_intron_size))
        print("Median intron size: {0} bp".format(median_intron_size))
        print("Minimum intron size: {0} bp".format(intron_sizes[0]))
        print("Maximum intron size: {0} bp\n".format(intron_sizes[-1]))
    else:
        # previously a ZeroDivisionError for annotations without introns
        print("No introns were found, so intron size statistics are not reported.\n")

    ############################
    ## Graphics section (optional)
    ############################
    if args.histogram is not None:
        import matplotlib.pyplot as plt

        plt.xlabel('length (bp)')
        plt.ylabel('count')
        plt.title('Distribution of intron size and intergenic distances')

        # only plot a series that actually has data
        if intergenic_distances:
            plt.hist(intergenic_distances, bins=50, histtype='stepfilled', color='b', label='Intergenic distances')
        if intron_sizes:
            plt.hist(intron_sizes, bins=50, histtype='stepfilled', color='r', alpha=0.5, label='Intron sizes')

        if args.xlimit is not None:
            plt.xlim([0, args.xlimit])

        if args.ylimit is not None:
            plt.ylim([0, args.ylimit])

        plt.legend(loc='best')
        plt.savefig(args.histogram)