def main():
    """Re-point exon ``Parent`` attributes at the most recently seen RNA.

    Command-line options:
        -i/--input   path of the GFF3 file to read
        -o/--output  path of the GFF3 file to write

    Comment lines (starting with ``#``) and rows that do not split into
    exactly nine tab-separated columns are copied through.  Every row whose
    type ends in ``RNA`` becomes the "current" RNA; each following ``exon``
    row whose Parent differs from the current RNA ID has its Parent rewritten
    and an INFO message is printed.
    """
    parser = argparse.ArgumentParser(
        description='Updates exon Parent attributes to point at the correct RNA feature')
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file')
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write')
    args = parser.parse_args()

    last_rna_id = None

    # Context managers make sure both handles are flushed and closed even if
    # a row raises part-way through the file.
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:
            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line))
                continue

            feat_id = gff.column_9_value(cols[8], 'ID')
            parent = gff.column_9_value(cols[8], 'Parent')
            feat_type = cols[2]

            if feat_type.endswith('RNA'):
                last_rna_id = feat_id
            elif (feat_type == 'exon'
                  and last_rna_id is not None
                  and parent != last_rna_id):
                # An exon that precedes every RNA row is left untouched: there
                # is nothing to point it at, and writing a Parent of None
                # would corrupt the attribute column.
                print("INFO: correcting unexpected parentage for feature ({0}) type {2}.  Expected ({1})"
                      .format(feat_id, last_rna_id, feat_type))
                cols[8] = gff.set_column_9_value(cols[8], 'Parent', last_rna_id)
                line = "\t".join(cols)

            ofh.write("{0}\n".format(line))
def main():
    """Trim or drop CDS features so they agree with externally supplied polypeptides.

    All paths are hard-coded: gene models are loaded from
    ``canonical.flawed.gff3``, polypeptide coordinates are read from
    ``Theileria-all-Theileria1_ourids.gff`` (keyed by their Parent, which is
    looked up against mRNA IDs), and the corrected genes are written as GFF3
    to ``canonical.corrected.gff3`` with source ``GenBank``.
    """
    flawed_gff_file = 'canonical.flawed.gff3'
    ilri_gff = 'Theileria-all-Theileria1_ourids.gff'
    source = 'GenBank'
    out_gff = 'canonical.corrected.gff3'

    # The output is opened first (as before) so the header is always present;
    # the context manager guarantees it is flushed and closed on any exit.
    with open(out_gff, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        (assemblies, features) = gff.get_gff3_features(flawed_gff_file)
        print("INFO: loaded {0} assemblies and {1} features".format(len(assemblies), len(features)))

        # polypeptide objects keyed by the ID of their parent feature
        polypeptides = dict()

        with open(ilri_gff) as ilri_fh:
            for line in ilri_fh:
                cols = line.split("\t")

                if len(cols) != 9 or cols[2] != 'polypeptide':
                    continue

                feat_id = gff.column_9_value(cols[8], 'ID')
                parent = gff.column_9_value(cols[8], 'Parent')

                polypeptide = things.Polypeptide(id=feat_id, parent=parent)
                # GFF coordinates are 1-based inclusive; fmin is shifted to 0-based
                polypeptide.locate_on(target=assemblies[cols[0]], fmin=int(cols[3]) - 1,
                                      fmax=int(cols[4]), strand=cols[6])
                polypeptides[parent] = polypeptide

        print("DEBUG: loaded {0} polypeptides from ILRI file".format(len(polypeptides)))

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    if mRNA.id not in polypeptides:
                        print("DEBUG: {0} not found as a parent to any polypeptide".format(mRNA.id))
                        continue

                    polypeptide = polypeptides[mRNA.id]

                    # fetch the CDS collection once, outside of the iteration,
                    # since iterating might delete some
                    CDSs = mRNA.CDSs()

                    # NOTE(review): the comparison operators are defined by the
                    # `things` library; presumably < / > mean "entirely before /
                    # after" and <= / >= mean "overlapping that edge" -- confirm.
                    for CDS in CDSs:
                        if CDS < polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS <= polypeptide:
                            CDS.location().fmin = polypeptide.location().fmin

                        if CDS > polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS >= polypeptide:
                            CDS.location().fmax = polypeptide.location().fmax

                gene.print_as(fh=fout, source=source, format='gff3')
def main():
    """Replace every ID/Parent in a GFF3 file with a newly generated identifier.

    Command-line options:
        -i/--input_file   file to read
        -o/--output_file  optional output path (STDOUT when omitted)
        -p/--prefix       prefix handed to ``get_new_id``
        -m/--mode         ID mode handed to ``get_new_id`` (default ``sequential``)

    Rows that do not have nine tab-separated columns are copied through.  For
    the others, each distinct ID is mapped to one new ID (remembered so later
    Parent references can be rewritten).

    Raises:
        Exception: if a row references a Parent that has not yet appeared as
            an ID earlier in the file.
    """
    parser = argparse.ArgumentParser(
        description='Generates new identifiers in GFF3 files following the IGS identifier convention.')
    parser.add_argument('-i', '--input_file', type=str, required=True, help='TA file of source molecules')
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional output file path (else STDOUT)')
    parser.add_argument('-p', '--prefix', type=str, required=True, help='The prefix portion of IDs to be generated')
    parser.add_argument('-m', '--mode', type=str, required=False, default='sequential',
                        help='ID modes (see embedded documentation): sequential, uuid, hex8, hex12')
    args = parser.parse_args()
    check_arguments(args)

    # old ID -> new ID
    id_map = dict()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        with open(args.input_file) as fin:
            for line in fin:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    fout.write(line + "\n")
                    continue

                # grab the ID and Parent attributes, if any
                feat_id = gff.column_9_value(cols[8], 'ID')
                parent = gff.column_9_value(cols[8], 'Parent')
                feat_type = cols[2]

                if feat_id is not None:
                    if feat_id in id_map:
                        new_id = id_map[feat_id]
                    else:
                        new_id = get_new_id(args.prefix, feat_type, args.mode)
                        id_map[feat_id] = new_id

                    cols[8] = cols[8].replace("ID={0}".format(feat_id), "ID={0}".format(new_id))

                if parent is not None:
                    if parent not in id_map:
                        raise Exception("ERROR: parent ({0}) referenced before it was used as an ID".format(parent))

                    cols[8] = cols[8].replace("Parent={0}".format(parent),
                                              "Parent={0}".format(id_map[parent]))

                fout.write("\t".join(cols) + "\n")
    finally:
        # only close what we opened ourselves; never close STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    """Collect the exon coordinates of each mRNA and hand them to ``write_transcript``.

    Command-line options:
        -i/--input_gff    GFF file to read
        -o/--output_file  optional output path (STDOUT when omitted)

    Rows without nine tab-separated columns are ignored.  Each ``mRNA`` row
    with a new ID flushes the fragments gathered so far through
    ``write_transcript`` and starts a new transcript; ``exon`` rows append a
    ``{'start', 'end'}`` pair, swapped for rows not on the ``+`` strand.
    ``write_transcript`` is also called once after the last row.
    """
    # The text must be passed as description=; given positionally it becomes
    # `prog` and replaces the program name in the usage line.
    parser = argparse.ArgumentParser(description='Filter the genes of a GFF3 file by mRNA child IDs')
    parser.add_argument('-i', '--input_gff', type=str, required=True, help='GFF file of source annotation')
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional output file path (else STDOUT)')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    current_mRNA_id = None
    current_mol_id = None
    current_fragments = list()
    current_direction = None

    try:
        with open(args.input_gff) as fin:
            for line in fin:
                cols = line.rstrip().split("\t")

                if len(cols) != 9:
                    continue

                feat_id = gff.column_9_value(cols[8], 'ID')
                feat_type = cols[2]

                if feat_type == 'mRNA':
                    if current_mRNA_id is not None and feat_id != current_mRNA_id:
                        # purge the existing one first
                        write_transcript(fout, current_mol_id, current_fragments, current_direction)
                        current_fragments = list()

                    current_mRNA_id = feat_id
                    current_mol_id = cols[0]
                    current_direction = cols[6]

                elif feat_type == 'exon':
                    if cols[6] == '+':
                        current_fragments.append({'start': cols[3], 'end': cols[4]})
                    else:
                        # non-forward features are stored with start/end swapped
                        current_fragments.append({'start': cols[4], 'end': cols[3]})

        # flush the final transcript
        write_transcript(fout, current_mol_id, current_fragments, current_direction)
    finally:
        # only close what we opened ourselves; never close STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    """Insert a gene row above every RNA feature that has no Parent.

    Command-line options:
        -i/--input   path of the GFF3 file to read
        -o/--output  path of the GFF3 file to write

    Comment lines and rows without nine tab-separated columns are copied
    through.  For a row whose type ends in ``RNA`` and which has no Parent
    attribute, a copy of the row with type ``gene`` and ID ``<rna id>.gene``
    is written first, followed by the RNA row with its Parent set to that ID.
    """
    parser = argparse.ArgumentParser(description='Adds gene features for RNAs which lack them')
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file')
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write')
    args = parser.parse_args()

    # Context managers make sure both handles are closed on every exit path.
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:
            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line))
                continue

            feat_id = gff.column_9_value(cols[8], 'ID')
            parent = gff.column_9_value(cols[8], 'Parent')

            if cols[2].endswith('RNA') and parent is None:
                gene_id = "{0}.gene".format(feat_id)

                # the gene shares every column with the RNA except type and ID
                gene_cols = list(cols)
                gene_cols[2] = 'gene'
                gene_cols[8] = gff.set_column_9_value(gene_cols[8], 'ID', gene_id)
                ofh.write("{0}\n".format("\t".join(gene_cols)))

                cols[8] = gff.set_column_9_value(cols[8], 'Parent', gene_id)
                ofh.write("{0}\n".format("\t".join(cols)))
            else:
                ofh.write("{0}\n".format(line))
def main():
    """Rewrite column 9 of a GFF file with newly generated ID/Parent values.

    Command-line options:
        -i/--input_file  file to read
        -o/--output_gff  GFF file to create
        -p/--id_prefix   base handed to ``get_new_id`` for every generated ID
        -m/--output_map  optional file handle target handed to ``get_new_id``

    Rows without nine tab-separated columns are copied through.  For every
    other row the old ID (and Parent, if present) is mapped to a new ID via
    ``get_new_id`` -- reusing an earlier mapping when one exists -- and column
    9 is replaced entirely by ``ID=<new>`` or ``ID=<new>;Parent=<new parent>``.
    """
    parser = argparse.ArgumentParser(description='Put a description of your script here')
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read')
    parser.add_argument('-o', '--output_gff', type=str, required=True,
                        help='Path to an output GFF file to be created with new IDs')
    parser.add_argument('-p', '--id_prefix', type=str, required=True,
                        help='Will be used as the base for all IDs generated')
    parser.add_argument('-m', '--output_map', type=str, required=False,
                        help='This will create a tab-delimited mapping of old and new IDs')
    args = parser.parse_args()

    # old ID -> new ID
    idmap = dict()

    with open(args.output_gff, 'w') as ofh:
        map_ofh = None if args.output_map is None else open(args.output_map, 'w')

        try:
            with open(args.input_file) as fin:
                for line in fin:
                    line = line.rstrip()
                    cols = line.split("\t")

                    if len(cols) != 9:
                        ofh.write(line + "\n")
                        continue

                    feat_id = gff.column_9_value(cols[8], 'ID')
                    parent_id = gff.column_9_value(cols[8], 'Parent')

                    if feat_id in idmap:
                        new_feat_id = idmap[feat_id]
                    else:
                        new_feat_id = get_new_id(args.id_prefix, cols[2], feat_id, map_ofh)
                        idmap[feat_id] = new_feat_id

                    if parent_id is None:
                        cols[8] = "ID={0}".format(new_feat_id)
                    else:
                        if parent_id in idmap:
                            new_parent_id = idmap[parent_id]
                        else:
                            # parent not seen yet: reserve its new ID now so the
                            # parent row picks up the same value later
                            new_parent_id = get_new_id(args.id_prefix, cols[2], parent_id, map_ofh)
                            idmap[parent_id] = new_parent_id

                        cols[8] = "ID={0};Parent={1}".format(new_feat_id, new_parent_id)

                    ofh.write("\t".join(cols) + "\n")
        finally:
            # the map file is optional, so it is closed by hand
            if map_ofh is not None:
                map_ofh.close()
def append_organism_names_to_gff(file_path, poly_orgs):
    """Add a ``top_organism_from_blast`` attribute to polypeptide rows, in place.

    Args:
        file_path: path of the GFF file to rewrite.
        poly_orgs: mapping looked up with the ID of the most recent ``*RNA``
            row; the value is the organism name to append.

    The file is rewritten through ``<file_path>.orgtmp``, which then replaces
    the original.  A warning is printed when no polypeptide row received an
    organism name.
    """
    tmp_path = "{0}.orgtmp".format(file_path)
    orgs_found = 0
    last_RNA_id = None

    # we have to write to a temp file and copy over; both handles are closed
    # before the replace so the swap also works on platforms that lock open files
    with open(file_path) as fin, open(tmp_path, 'wt') as fout:
        for line in fin:
            line = line.rstrip()
            cols = line.split("\t")
            is_feature = len(cols) == 9

            if is_feature and cols[2].endswith('RNA'):
                last_RNA_id = gff.column_9_value(cols[8], 'ID')

            if is_feature and cols[2] == 'polypeptide':
                if last_RNA_id in poly_orgs:
                    # The name must be escaped: a raw ';', '=' or ',' would
                    # break the attribute column.  (The escaped value used to
                    # be computed but never inserted.)
                    cols[8] += ";top_organism_from_blast={0}".format(
                        gff.escape(poly_orgs[last_RNA_id]))
                    orgs_found += 1

                fout.write("{0}\n".format("\t".join(cols)))
            else:
                fout.write("{0}\n".format(line))

    if orgs_found == 0:
        print("WARNING: The --export_organism_names option was passed, but parsing failed to find any organism names at all.  This might be an error.")

    ## now move the temp file over the original copy (os.replace overwrites
    #  an existing destination on every platform)
    os.replace(tmp_path, file_path)
def append_organism_names_to_gff(file_path, poly_orgs):
    """Annotate polypeptide rows of a GFF file with their top BLAST organism.

    Args:
        file_path: GFF file that is rewritten in place.
        poly_orgs: mapping from RNA feature ID to organism name.  The ID of
            the most recently seen row whose type ends in ``RNA`` is used as
            the key for each ``polypeptide`` row.

    Output goes to ``<file_path>.orgtmp`` first and is then moved over the
    original.  Prints a warning if not a single organism name was added.
    """
    temp_path = "{0}.orgtmp".format(file_path)
    orgs_found = 0
    last_RNA_id = None

    # Closing both files before the move avoids renaming over a file that is
    # still open for reading.
    with open(file_path) as source_fh, open(temp_path, 'wt') as fout:
        for raw_line in source_fh:
            line = raw_line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                fout.write("{0}\n".format(line))
                continue

            if cols[2].endswith('RNA'):
                last_RNA_id = gff.column_9_value(cols[8], 'ID')

            if cols[2] != 'polypeptide':
                fout.write("{0}\n".format(line))
                continue

            if last_RNA_id in poly_orgs:
                # Insert the *escaped* name; previously gff.escape() was called
                # but its result was ignored by the format string, so reserved
                # characters could corrupt column 9.
                escaped_org = gff.escape(poly_orgs[last_RNA_id])
                cols[8] += ";top_organism_from_blast={0}".format(escaped_org)
                orgs_found += 1

            fout.write("{0}\n".format("\t".join(cols)))

    if orgs_found == 0:
        print("WARNING: The --export_organism_names option was passed, but parsing failed to find any organism names at all.  This might be an error.")

    ## now move the temp file over the original copy; os.replace (unlike
    #  os.rename) overwrites an existing destination on Windows as well
    os.replace(temp_path, file_path)
def main():
    """Swap start/stop on CDS rows whose stop coordinate is below the start.

    Command-line options:
        -i/--input   path of the GFF3 file to read
        -o/--output  path of the GFF3 file to write

    Comment lines and rows without nine tab-separated columns are copied
    through.  For each ``CDS`` row with column 5 < column 4 (compared as
    integers) the two columns are exchanged and the feature ID is printed.
    """
    parser = argparse.ArgumentParser(description='Reverses CDS coodinates where stop < start')
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file')
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write')
    args = parser.parse_args()

    # Context managers make sure both handles are closed on every exit path.
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:
            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line))
                continue

            if cols[2] == 'CDS' and int(cols[4]) < int(cols[3]):
                cols[3], cols[4] = cols[4], cols[3]
                feat_id = gff.column_9_value(cols[8], 'ID')
                print("CDS reversed: {0}".format(feat_id))
                line = "\t".join(cols)

            ofh.write("{0}\n".format(line))
def main():
    """Make every exon's ``Parent`` match the RNA feature that precedes it.

    Command-line options:
        -i/--input   path of the GFF3 file to read
        -o/--output  path of the GFF3 file to write

    Lines starting with ``#`` and rows that are not nine tab-separated
    columns are written unchanged.  The ID of each ``*RNA`` row is remembered;
    an ``exon`` row whose Parent differs from it is rewritten (with an INFO
    message), all other rows are written as read.
    """
    parser = argparse.ArgumentParser(
        description='Updates exon Parent attributes to point at the correct RNA feature')
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file')
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write')
    args = parser.parse_args()

    last_rna_id = None

    # `with` closes (and flushes) both files even when a row raises.
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:
            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line))
                continue

            feat_id = gff.column_9_value(cols[8], 'ID')
            parent = gff.column_9_value(cols[8], 'Parent')

            if cols[2].endswith('RNA'):
                last_rna_id = feat_id
                ofh.write("{0}\n".format(line))
                continue

            needs_fix = (
                cols[2] == 'exon'
                # an exon seen before any RNA has no correct parent to use;
                # leave it alone instead of writing a Parent of None
                and last_rna_id is not None
                and parent != last_rna_id
            )

            if needs_fix:
                print("INFO: correcting unexpected parentage for feature ({0}) type {2}.  Expected ({1})".format(
                    feat_id, last_rna_id, cols[2]))
                cols[8] = gff.set_column_9_value(cols[8], 'Parent', last_rna_id)
                ofh.write("{0}\n".format("\t".join(cols)))
            else:
                ofh.write("{0}\n".format(line))
def main():
    """Give every parent-less RNA feature a newly created gene parent.

    Command-line options:
        -i/--input   path of the GFF3 file to read
        -o/--output  path of the GFF3 file to write

    Lines starting with ``#`` and rows that are not nine tab-separated
    columns are written unchanged.  When a row's type ends in ``RNA`` and it
    has no Parent, two rows are written: a ``gene`` row cloned from it with
    ID ``<rna id>.gene``, then the RNA row with Parent set to that gene ID.
    """
    parser = argparse.ArgumentParser(description='Adds gene features for RNAs which lack them')
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file')
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write')
    args = parser.parse_args()

    # `with` closes (and flushes) both files even when a row raises.
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:
            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line))
                continue

            feat_id = gff.column_9_value(cols[8], 'ID')
            parent = gff.column_9_value(cols[8], 'Parent')
            lacks_gene = cols[2].endswith('RNA') and parent is None

            if not lacks_gene:
                ofh.write("{0}\n".format(line))
                continue

            new_gene_id = "{0}.gene".format(feat_id)

            # gene row: same coordinates/strand as the RNA, new type and ID
            gene_cols = list(cols)
            gene_cols[2] = 'gene'
            gene_cols[8] = gff.set_column_9_value(gene_cols[8], 'ID', new_gene_id)
            ofh.write("{0}\n".format("\t".join(gene_cols)))

            # RNA row, now pointing at the gene just written
            cols[8] = gff.set_column_9_value(cols[8], 'Parent', new_gene_id)
            ofh.write("{0}\n".format("\t".join(cols)))
def main():
    """Drop child features that duplicate an earlier one.

    Command-line options:
        -i/--input   path of the GFF3 file to read
        -o/--output  path of the GFF3 file to write

    Two rows are duplicates when they share molecule, Parent, type, start and
    stop.  Rows without a Parent are never treated as duplicates.  The first
    occurrence is kept; later ones are reported with an INFO message and
    omitted.  Comment lines and any other non-feature lines are copied
    through unchanged.
    """
    parser = argparse.ArgumentParser(description='Removes duplicate features in a GFF3 file')
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file')
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write')
    args = parser.parse_args()

    # just reduce the keys to a string:
    #  "molecule__parent__type__start__stop"
    # A set gives constant-time membership tests; the previous list made the
    # whole run quadratic in the number of features.
    found = set()

    with open(args.input) as infile, open(args.output, 'wt') as outfile:
        for line in infile:
            if line.startswith('#'):
                outfile.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                # Not a feature row (blank line, FASTA header, sequence ...).
                # These used to be dropped silently, which discarded any
                # embedded sequence data; pass them through instead.
                outfile.write("{0}\n".format(line))
                continue

            parent = gff.column_9_value(cols[8], 'Parent')

            if parent is None:
                outfile.write("{0}\n".format(line))
                continue

            id_string = "{0}__{1}__{2}__{3}__{4}".format(cols[0], parent, cols[2], cols[3], cols[4])

            if id_string in found:
                print("INFO: duplicate feature to be removed:\n{0}\n".format(line))
                continue

            found.add(id_string)
            outfile.write("{0}\n".format(line))
def main():
    """Put CDS coordinates into ascending order where they were written reversed.

    Command-line options:
        -i/--input   path of the GFF3 file to read
        -o/--output  path of the GFF3 file to write

    Lines starting with ``#`` and rows that are not nine tab-separated
    columns are written unchanged.  A ``CDS`` row whose column 5 is
    numerically smaller than column 4 has the two values exchanged, and its
    ID is reported on STDOUT.
    """
    parser = argparse.ArgumentParser(description='Reverses CDS coodinates where stop < start')
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file')
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write')
    args = parser.parse_args()

    # `with` closes (and flushes) both files even when a row raises.
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:
            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line))
                continue

            is_reversed_cds = cols[2] == 'CDS' and int(cols[4]) < int(cols[3])

            if not is_reversed_cds:
                ofh.write("{0}\n".format(line))
                continue

            cols[3], cols[4] = cols[4], cols[3]
            feat_id = gff.column_9_value(cols[8], 'ID')
            print("CDS reversed: {0}".format(feat_id))
            ofh.write("{0}\n".format("\t".join(cols)))
def main():
    """Convert Augustus GTF/GFF output into GFF3 gene models.

    Command-line options:
        -i/--input   file created by Augustus
        -o/--output  GFF3 file to create

    ``#`` comment lines are buffered and written, followed by the gene, when
    a ``# end gene`` line is reached.  Only ``gene``, ``transcript`` and
    ``CDS`` rows are used; an exon with the same coordinates is derived for
    every CDS because the input does not define exons explicitly.

    Raises:
        Exception: on a GTF row whose 9th column has an unexpected format, on
            a repeated transcript ID, or on a CDS whose parent transcript has
            not been seen yet.
    """
    parser = argparse.ArgumentParser(
        description='Convert native (GTF) or GFF output from Augustus into GFF3 format')
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to a GFF file created by Augustus')
    parser.add_argument('-o', '--output', type=str, required=True, help='Path to an output file to be created')
    args = parser.parse_args()

    assemblies = dict()
    current_assembly = None

    gene = None
    mRNAs = dict()
    current_gene_comment_lines = list()

    ## Used for tracking the exon count for each gene (for ID purposes)
    exon_count_by_mRNA = dict()

    # Patterns for the GTF flavour of column 9.  Raw strings keep '\d' from
    # being read as an (invalid) string escape; the patterns are unchanged.
    #  g1                                        -> ID=g1
    #  g1.t1                                     -> ID=g1.t1;Parent=g1
    #  transcript_id "g1.t1"; gene_id "g1";      -> ID=g1.t1.cds;Parent=g1.t1
    gene_re = re.compile(r'(g\d+)')
    transcript_re = re.compile(r'((g\d+).t\d+)')
    cds_re = re.compile(r'transcript_id "(g\d+.t\d+)"; gene_id "g\d+";')

    with open(args.input) as fin, open(args.output, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        for line in fin:
            if line.startswith("#"):
                current_gene_comment_lines.append(line)

                if line.startswith("# end gene "):
                    ## purge the comments, then write the gene
                    fout.write("".join(current_gene_comment_lines))

                    # guard against an end marker with no gene row before it
                    if gene is not None:
                        gene.print_as(fh=fout, source='AUGUSTUS', format='gff3')

                    gene = None
                    mRNAs = dict()
                    current_gene_comment_lines = list()

                # other comment lines (including the protein sequence block)
                # are only buffered
                continue

            cols = line.split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]

            if feat_type not in ['gene', 'transcript', 'CDS']:
                continue

            ## The output format is GTF by default and (mostly) GFF if the --gff option is used.
            #   If GTF is detected, transform the 9th column into GFF so the libraries can use it.
            if not cols[8].startswith('ID') and not cols[8].startswith('Parent'):
                if feat_type == 'gene':
                    m_gene = gene_re.match(cols[8])
                    if m_gene:
                        cols[8] = "ID={0}".format(m_gene.group(1))
                    else:
                        raise Exception("ERROR: GTF detected but gene row has bad 9th column format: {0}".format(cols[8]))
                elif feat_type == 'transcript':
                    m_transcript = transcript_re.match(cols[8])
                    if m_transcript:
                        cols[8] = "ID={0};Parent={1}".format(m_transcript.group(1), m_transcript.group(2))
                    else:
                        raise Exception("ERROR: GTF detected but transcript row has bad 9th column format: {0}".format(cols[8]))
                elif feat_type == 'CDS':
                    m_CDS = cds_re.match(cols[8])
                    if m_CDS:
                        cols[8] = "ID={0}.cds;Parent={0}".format(m_CDS.group(1))
                    else:
                        raise Exception("ERROR: GTF detected but CDS row has bad 9th column format: {0}".format(cols[8]))

            feat_id = gff.column_9_value(cols[8], 'ID')

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = things.Assembly(id=mol_id)

            current_assembly = assemblies[mol_id]

            # GFF coordinates are 1-based inclusive; fmin is shifted to 0-based
            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]

            if feat_type == "gene":
                gene = things.Gene(id=feat_id)
                gene.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)

            elif feat_type == "transcript":
                mRNA = things.mRNA(id=feat_id, parent=gene)
                mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                gene.add_mRNA(mRNA)
                mRNAs[mRNA.id] = mRNA

                if feat_id in exon_count_by_mRNA:
                    raise Exception("ERROR: two different mRNAs found with same ID: {0}".format(feat_id))
                else:
                    exon_count_by_mRNA[feat_id] = 0

            elif feat_type == "CDS":
                parent_id = gff.column_9_value(cols[8], 'Parent')

                ## sanity check that we've seen this parent
                if parent_id not in mRNAs:
                    raise Exception("ERROR: Found CDS column with parent ({0}) mRNA not yet in the file".format(parent_id))

                # Attach to the declared parent, not to whichever transcript
                # happened to be read last.
                parent_mRNA = mRNAs[parent_id]

                CDS = things.CDS(id=feat_id, parent=parent_mRNA)
                CDS.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand, phase=int(cols[7]))
                parent_mRNA.add_CDS(CDS)

                ## exons weren't explicitly defined in the input file, so we need to derive new IDs for them
                exon_count_by_mRNA[parent_id] += 1
                exon_id = "{0}.exon{1}".format(parent_id, exon_count_by_mRNA[parent_id])

                exon = things.Exon(id=exon_id, parent=parent_mRNA)
                exon.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                parent_mRNA.add_exon(exon)
def main():
    """Add a ``locus_tag`` attribute to gene and RNA rows of a GFF3 file.

    Genes receive either the tag listed for their ID in ``--id_file`` or a
    generated one built from ``--prefix``, an optional molecule token from
    ``--molecule_map`` and a zero-padded counter.  RNA rows inherit the tag
    of their parent gene.  All other rows are written unchanged.  Output goes
    to ``--output_file`` or STDOUT.

    Raises:
        Exception: when a ``--custom`` mode is used without both mapping
            files, when a bmicroti mapped ID does not match the expected
            convention, when a molecule is missing from ``--molecule_map``,
            when an ``--id_file`` locus tag has already been assigned, or
            when an RNA row's parent gene has not been seen.
    """
    parser = argparse.ArgumentParser(description="Adds locus tag identifiers to GFF3 features")
    parser.add_argument("-i", "--input_file", type=str, required=True, help="TA file of source molecules")
    parser.add_argument("-o", "--output_file", type=str, required=False, help="Optional output file path (else STDOUT)")
    parser.add_argument("-p", "--prefix", type=str, required=True, help="The prefix portion of IDs to be generated")
    parser.add_argument(
        "-a", "--padding", type=int, required=True,
        help="Specify the minimum with to reserve for the numeric portion of the IDs.  Smaller numbers will be zero-padded.",
    )
    parser.add_argument(
        "-n", "--interval", type=int, required=False, default=1, help="Interval between generated identifiers"
    )
    parser.add_argument(
        "-s", "--starting_id", type=int, required=False, default=0,
        help="Initial numeric portion of IDs to be generated (do not zero-pad)",
    )
    parser.add_argument(
        "-d", "--id_file", type=str, required=False,
        help="Pass a 2-column file of IDs to retain (in case you have mapped genes, for example)",
    )
    parser.add_argument(
        "-m", "--molecule_map", type=str, required=False,
        help="Pass a 2-column file of molecule->token identifiers (see documentation)",
    )
    parser.add_argument(
        "-c", "--custom", type=str, required=False, help="For custom parsing steps.  Most should ignore this."
    )
    args = parser.parse_args()
    check_arguments(args)

    # used to store locus_tags associated with each gene (so children can inherit)
    gene_loci = dict()
    next_id = args.starting_id
    last_molecule = None

    id_mapping = parse_mapping_file(args.id_file)
    mol_mapping = parse_mapping_file(args.molecule_map)

    # every locus tag handed out so far (set: constant-time duplicate checks)
    loci_assigned = set()

    ## if using Joana's custom options, check assumptions
    if args.custom == "joana":
        if args.molecule_map is None or args.id_file is None:
            raise Exception("ERROR: Expected --molecule_map and --id_file options when using --custom=joana")

        ## need to process the ID map to reformat IDs
        for mapped_key in id_mapping:
            # TP05_0002 -> TpMuguga_05g00002
            m = re.match(r"TP(\d\d)_(\d+)", id_mapping[mapped_key])
            if m:
                id_mapping[mapped_key] = "{0}_{1}g0{2}".format(args.prefix, m.group(1), m.group(2))

    elif args.custom == "bmicroti":
        microti_map = {"I": "01", "II": "02", "III": "03", "IV": "04"}

        if args.molecule_map is None or args.id_file is None:
            raise Exception("ERROR: Expected --molecule_map and --id_file options when using --custom=bmicroti")

        for mapped_key in id_mapping:
            m = re.match(r"BBM_(\D+)(\d+)", id_mapping[mapped_key])
            if m:
                print("Changing id from {0} to ".format(mapped_key))
                id_mapping[mapped_key] = "{0}_{1}g{2}".format(args.prefix, microti_map[m.group(1)], m.group(2))
                print(id_mapping[mapped_key])
            else:
                raise Exception("ERROR: id ({0}) didn't match expected convention.".format(id_mapping[mapped_key]))

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, "wt")

    last_number_portion_assigned = 0

    try:
        with open(args.input_file) as fin:
            for line in fin:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    fout.write(line + "\n")
                    continue

                # restart numbering on the first row and whenever the molecule token changes
                if last_molecule is None or (
                    args.molecule_map is not None and mol_mapping[cols[0]] != mol_mapping[last_molecule]
                ):
                    print("Found molecule {0}, resetting id counter from {1}".format(cols[0], next_id))
                    next_id = args.starting_id
                    last_molecule = cols[0]

                # grab the ID and Parent attributes, if any
                feat_id = gff.column_9_value(cols[8], "ID")
                parent = gff.column_9_value(cols[8], "Parent")
                feat_type = cols[2]

                # issue
                #  66F4EEF2E3C863C251F831817FF71233
                #  7F1917E4D81A959078C9A38E15488BC0
                #  E22888670919A4A888572155F40F2654
                #  B9D9CF1F7A8E5A2E1124F0A6C68840DC -> BBM_I00232
                #  gene before is: 6DE6BCCE69CCDC39994A0940B2ED524A - novel
                #  errors on: BmicrotiR1_01g00233 -> BBM_I00233
                #  5800A4110A62E4EAE57AFAD1F8D65CB3    BBM_I00233

                if feat_type == "gene":
                    # number of rejected candidates for this gene; used so the
                    # bmicroti candidate actually changes between attempts
                    retries = 0

                    while True:
                        if feat_id in id_mapping:
                            locus_id = id_mapping[feat_id]

                            # A mapped tag never changes, so retrying a
                            # duplicate would loop forever -- fail loudly.
                            if locus_id in loci_assigned:
                                raise Exception(
                                    "ERROR: locus tag ({0}) mapped to gene ({1}) in --id_file was already assigned".format(
                                        locus_id, feat_id))
                        else:
                            if args.molecule_map is None:
                                locus_id = "{0}_{1}".format(args.prefix, str(next_id).zfill(args.padding))
                            else:
                                if cols[0] in mol_mapping:
                                    if args.custom == "bmicroti":
                                        # continue from the last number used;
                                        # skip ahead past rejected candidates
                                        locus_id = "{0}_{2}g{1}".format(
                                            args.prefix,
                                            str(int(last_number_portion_assigned) + 1 + retries).zfill(args.padding),
                                            mol_mapping[cols[0]],
                                        )
                                    else:
                                        locus_id = "{0}_{2}g{1}".format(
                                            args.prefix, str(next_id).zfill(args.padding), mol_mapping[cols[0]]
                                        )
                                else:
                                    raise Exception("ERROR: --molecule_map passed but {0} wasn't found in it.".format(cols[0]))

                            next_id += args.interval

                        cols[8] = gff.set_column_9_value(cols[8], "locus_tag", locus_id)

                        ## make sure this wasn't generated already (possibly conflict between --id_file and an
                        #   auto-generated ID?)
                        if locus_id not in loci_assigned:
                            break

                        print("DEBUG: Duplicate ID assigned ({0}), trying again.".format(locus_id))
                        retries += 1

                    loci_assigned.add(locus_id)
                    gene_loci[feat_id] = locus_id

                    # remember the trailing number for the bmicroti numbering scheme
                    m = re.search(r"(\d+)$", locus_id)
                    if m:
                        last_number_portion_assigned = m.group(1)

                elif feat_type.endswith("RNA"):
                    if parent in gene_loci:
                        cols[8] = gff.set_column_9_value(cols[8], "locus_tag", gene_loci[parent])
                    else:
                        raise Exception("ERROR: found RNA {0} whose parent {1} wasn't found yet".format(feat_id, parent))

                fout.write("\t".join(cols) + "\n")
    finally:
        # only close what we opened ourselves; never close STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    """Convert Cufflinks GTF output to GFF3.

    Command-line options:
        -i/--input_file   GTF file to read
        -o/--output_file  optional GFF file to create (STDOUT when omitted)
        -e/--export_mode  ``model`` (gene/mRNA/CDS/exon, the default) or
                          ``cDNA_match`` (match/match_part)

    Lines that do not split into nine tab-separated columns are reported with
    a ``SKIPPING`` message.  In ``model`` mode every exon row yields both an
    exon and a CDS whose phase is carried forward from the previous exon of
    the same transcript.  A feature group is written when the next gene (or
    match) starts, and once more after the last row.

    Raises:
        Exception: if ``--export_mode`` is not one of the two valid values.
    """
    parser = argparse.ArgumentParser(description='A GTF -> GFF3 conversion script for Cufflinks output')
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GTF file')
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output GFF file to be created')
    parser.add_argument('-e', '--export_mode', type=str, required=False, default='model',
                        help='Export mode for results (model or cDNA_match)')
    args = parser.parse_args()

    if args.export_mode not in ['model', 'cDNA_match']:
        raise Exception("ERROR: the only valid values for --export_mode are 'model' or 'cDNA_match'")

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        assemblies = dict()
        current_assembly = None
        current_gene = None
        current_RNA = None
        current_match = None
        current_CDS_phase = 0

        exon_count_by_RNA = defaultdict(int)

        with open(args.input_file, "r") as fin:
            for line in fin:
                cols = line.split("\t")

                if len(cols) != 9:
                    print("SKIPPING: {0}".format(line))
                    continue

                mol_id = cols[0]

                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)

                current_assembly = assemblies[mol_id]
                ftype = cols[2]
                # GTF coordinates are 1-based inclusive; fmin is shifted to 0-based
                fmin = int(cols[3]) - 1
                fmax = int(cols[4])
                strand = cols[6]

                # this makes it look like GFF column 9 so column_9_value(str, key) can be used
                col9 = cols[8].replace(' "', '="')

                gene_id = gff.column_9_value(col9, 'gene_id').replace('"', '')
                transcript_id = gff.column_9_value(col9, 'transcript_id').replace('"', '')

                if ftype == 'transcript':
                    if args.export_mode == 'model':
                        # a transcript of a different gene closes the previous one
                        if current_gene is not None and current_gene.id != gene_id:
                            current_gene.print_as(fh=ofh, source='Cufflinks', format='gff3')

                        if current_gene is None or current_gene.id != gene_id:
                            current_gene = things.Gene(id=gene_id)
                            current_gene.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)

                        mRNA = things.mRNA(id=transcript_id, parent=current_gene)
                        mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                        current_gene.add_mRNA(mRNA)
                        current_RNA = mRNA
                        exon_count_by_RNA[transcript_id] = 0
                        current_CDS_phase = 0

                    elif args.export_mode == 'cDNA_match':
                        if current_match is not None and current_match.id != transcript_id:
                            current_match.print_as(fh=ofh, source='Cufflinks', format='gff3')

                        current_match = things.Match(id=transcript_id, subclass='cDNA_match', length=fmax - fmin)
                        current_match.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)

                elif ftype == 'exon':
                    exon_number = gff.column_9_value(col9, 'exon_number').replace('"', '')

                    if args.export_mode == 'model':
                        exon_count_by_RNA[transcript_id] += 1

                        cds_id = "{0}.CDS.{1}".format(current_RNA.id, exon_count_by_RNA[current_RNA.id])
                        CDS = things.CDS(id=cds_id, parent=current_RNA)
                        CDS.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand,
                                      phase=current_CDS_phase)
                        current_RNA.add_CDS(CDS)

                        # calculate the starting phase for the next CDS feature (in case there is one)
                        current_CDS_phase = 3 - (((fmax - fmin) - current_CDS_phase) % 3)
                        if current_CDS_phase == 3:
                            current_CDS_phase = 0

                        exon_id = "{0}.exon.{1}".format(current_RNA.id, exon_count_by_RNA[current_RNA.id])
                        exon = things.Exon(id=exon_id, parent=current_RNA)
                        exon.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                        current_RNA.add_exon(exon)

                    elif args.export_mode == 'cDNA_match':
                        mp_id = "{0}.match_part.{1}".format(transcript_id, exon_number)
                        mp = things.MatchPart(id=mp_id, parent=current_match, length=fmax - fmin)
                        mp.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                        current_match.add_part(mp)

        # don't forget to do the last gene, if there were any
        if args.export_mode == 'model':
            if current_gene is not None:
                # same source as every other gene (was 'GenBank' for the last one only)
                current_gene.print_as(fh=ofh, source='Cufflinks', format='gff3')
        elif args.export_mode == 'cDNA_match':
            if current_match is not None:
                current_match.print_as(fh=ofh, source='Cufflinks', format='gff3')
    finally:
        # only close what we opened ourselves; never close STDOUT
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Remove top-level features that no other feature names as its Parent.

    Command-line options:
        -i/--input   path of the GFF3 file to read
        -o/--output  path of the GFF3 file to write

    The input is read twice.  The first pass records, per line, whether it is
    kept unconditionally (comments, non-feature lines, features that have a
    Parent) and collects every ID used as a Parent.  The second pass writes
    the unconditional lines plus those parent-less features whose ID was
    referenced; every removed line is reported with a WARN message.
    """
    parser = argparse.ArgumentParser(description='Removes orphaned features in a GFF3 file')
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file')
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write')
    args = parser.parse_args()

    # going to try saving memory by tracking line numbers instead of storing all of it
    #  true means keep the line, false means decide in the second pass
    #  doing tracking this way since it's technically legal for a feature to have no identifier at all.
    keep_flags = list()

    # IDs that at least one feature names as its Parent
    referenced_ids = set()

    with open(args.input) as infile:
        for line in infile:
            if line.startswith('#'):
                keep_flags.append(True)
                continue

            cols = line.rstrip().split("\t")

            if len(cols) != 9:
                keep_flags.append(True)
                continue

            parent = gff.column_9_value(cols[8], 'Parent')

            if parent is None:
                # top-level feature: only kept if something refers to it
                keep_flags.append(False)
            else:
                keep_flags.append(True)
                # GFF3 allows several comma-separated parents; each of them is
                # referenced (matching on the joined string would wrongly
                # orphan every one of them)
                referenced_ids.update(parent.split(','))

        infile.seek(0)

        with open(args.output, 'wt') as outfh:
            for keep, line in zip(keep_flags, infile):
                if keep:
                    outfh.write(line)
                    continue

                # only nine-column rows are ever flagged False
                line = line.rstrip()
                cols = line.split("\t")
                feat_id = gff.column_9_value(cols[8], 'ID')

                if feat_id is not None and feat_id in referenced_ids:
                    outfh.write("{0}\n".format(line))
                else:
                    print("WARN: removing this line: {0}".format(line))
def main():
    """Convert a Cufflinks GTF file to GFF3.

    Two export modes are supported via ``--export_mode``:

    * ``model``: each ``transcript`` row becomes an mRNA under a gene keyed
      by ``gene_id``; each ``exon`` row adds both an exon and a CDS to the
      current mRNA.  A gene is written when a transcript with a different
      ``gene_id`` is encountered, and once more at end of input.
    * ``cDNA_match``: each ``transcript`` row becomes a cDNA_match and each
      ``exon`` row a match_part of the current match.

    Output goes to ``--output_file`` when given, otherwise to STDOUT.

    Raises:
        Exception: if ``--export_mode`` is not 'model' or 'cDNA_match'.
    """
    parser = argparse.ArgumentParser(
        description='A GTF -> GFF3 conversion script for Cufflinks output')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input GTF file')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output GFF file to be created')
    parser.add_argument('-e', '--export_mode', type=str, required=False,
                        default='model',
                        help='Export mode for results (model or cDNA_match)')
    args = parser.parse_args()

    if args.export_mode not in ['model', 'cDNA_match']:
        raise Exception(
            "ERROR: the only valid values for --export_mode are 'model' or 'cDNA_match'"
        )

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        assemblies = dict()
        current_gene = None
        current_RNA = None
        current_match = None
        current_CDS_phase = 0
        exon_count_by_RNA = defaultdict(int)

        with open(args.input_file, "r") as ifh:
            for line in ifh:
                cols = line.split("\t")

                if len(cols) != 9:
                    print("SKIPPING: {0}".format(line))
                    continue

                mol_id = cols[0]
                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)
                current_assembly = assemblies[mol_id]

                ftype = cols[2]
                fmin = int(cols[3]) - 1
                fmax = int(cols[4])
                strand = cols[6]

                # rewrite GTF `key "value"` pairs as `key="value"` so the GFF
                # column 9 helper can be used on them
                col9 = cols[8].replace(' "', '="')
                gene_id = gff.column_9_value(col9, 'gene_id').replace('"', '')
                transcript_id = gff.column_9_value(col9, 'transcript_id').replace('"', '')

                if ftype == 'transcript':
                    if args.export_mode == 'model':
                        # a different gene_id closes the previous gene
                        if current_gene is not None and current_gene.id != gene_id:
                            current_gene.print_as(fh=ofh, source='Cufflinks', format='gff3')

                        if current_gene is None or current_gene.id != gene_id:
                            current_gene = things.Gene(id=gene_id)
                            current_gene.locate_on(target=current_assembly, fmin=fmin,
                                                   fmax=fmax, strand=strand)

                        mRNA = things.mRNA(id=transcript_id, parent=current_gene)
                        mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                       strand=strand)
                        current_gene.add_mRNA(mRNA)
                        current_RNA = mRNA
                        exon_count_by_RNA[transcript_id] = 0
                        current_CDS_phase = 0

                    elif args.export_mode == 'cDNA_match':
                        if current_match is not None and current_match.id != transcript_id:
                            current_match.print_as(fh=ofh, source='Cufflinks', format='gff3')

                        current_match = things.Match(id=transcript_id, subclass='cDNA_match',
                                                     length=fmax - fmin)
                        current_match.locate_on(target=current_assembly, fmin=fmin,
                                                fmax=fmax, strand=strand)

                elif ftype == 'exon':
                    if args.export_mode == 'model':
                        exon_count_by_RNA[transcript_id] += 1

                        cds_id = "{0}.CDS.{1}".format(
                            current_RNA.id, exon_count_by_RNA[current_RNA.id])
                        CDS = things.CDS(id=cds_id, parent=current_RNA)
                        CDS.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                      strand=strand, phase=current_CDS_phase)
                        current_RNA.add_CDS(CDS)

                        # calculate the starting phase for the next CDS feature
                        # (in case there is one)
                        current_CDS_phase = 3 - (((fmax - fmin) - current_CDS_phase) % 3)
                        if current_CDS_phase == 3:
                            current_CDS_phase = 0

                        exon_id = "{0}.exon.{1}".format(
                            current_RNA.id, exon_count_by_RNA[current_RNA.id])
                        exon = things.Exon(id=exon_id, parent=current_RNA)
                        exon.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                       strand=strand)
                        current_RNA.add_exon(exon)

                    elif args.export_mode == 'cDNA_match':
                        # exon_number is only needed to build match_part IDs
                        exon_number = gff.column_9_value(col9, 'exon_number').replace('"', '')
                        mp_id = "{0}.match_part.{1}".format(transcript_id, exon_number)
                        mp = things.MatchPart(id=mp_id, parent=current_match,
                                              length=fmax - fmin)
                        mp.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                     strand=strand)
                        current_match.add_part(mp)

        # don't forget to do the last gene/match, if there were any.  The source
        # must match the one used for every earlier feature ('Cufflinks').
        if args.export_mode == 'model':
            if current_gene is not None:
                current_gene.print_as(fh=ofh, source='Cufflinks', format='gff3')
        elif args.export_mode == 'cDNA_match':
            if current_match is not None:
                current_match.print_as(fh=ofh, source='Cufflinks', format='gff3')
    finally:
        # never close STDOUT, only a file we opened ourselves
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Select organism-1 mRNAs supported by both AAT alignments and BLAST hits.

    Steps, in order:

    1. Load the organism-1 annotation (GFF3) and the AAT FASTA database.
    2. Parse the AAT GFF3 into match objects (with their match_parts),
       grouped by the assembly they are located on.  Rows on assemblies
       absent from the annotation are ignored.
    3. For every mRNA, look for one AAT match that overlaps it and whose
       mRNA coverage and target coverage both reach
       ``--aat_percent_coverage_cutoff``.
    4. Parse the BLAST btab file, keeping rows that pass the e-value and
       percent-identity cutoffs.
    5. Write the IDs of mRNAs passing both tests, one per line, to
       ``--output_id_list`` (a name is derived from the cutoffs if omitted).

    Raises:
        Exception: if an overlap is larger than the mRNA itself, or if a
            match target has no entry in the AAT FASTA database.
    """
    parser = argparse.ArgumentParser(description="Put a description of your script here")

    parser.add_argument("-a", "--organism1_annotation", type=str, required=True,
                        help="Annotation GFF for organism 1")
    parser.add_argument("-p", "--organism1_aat_alignments", type=str, required=True,
                        help="Path to AAT GFF3 (match/match_part)")
    parser.add_argument("-aatdb", "--aat_fasta_db", type=str, required=True,
                        help="Path to FASTA database that was used in AAT")
    parser.add_argument("-b", "--organism1_blast_alignments", type=str, required=True,
                        help="Path to BLASTp btab file vs.organism 2 proteins")
    parser.add_argument("-be", "--blast_eval_cutoff", type=float, required=False,
                        default=1e-5, help="BLAST e-value cutoff")
    # argparse %-formats help strings, so a literal percent sign must be
    # written as %% or rendering the help text fails.
    parser.add_argument("-bpi", "--blast_percent_identity_cutoff", type=float,
                        required=False, default=0, help="BLAST %%identity cutoff")
    parser.add_argument("-ppc", "--aat_percent_coverage_cutoff", type=float,
                        required=False, default=0,
                        help="%% coverage of the query protein by the AAT match")
    parser.add_argument("-o", "--output_id_list", type=str, required=False,
                        help="List of IDs from organism1 that passed")
    args = parser.parse_args()

    # set to an mRNA ID to restrict the comparison to that one transcript
    debugging_transcript = None

    ## if the output file wasn't passed build one from the other parameters
    if args.output_id_list is None:
        args.output_id_list = "training_ids.be_{0}.bpi_{1}.ppc_{2}.list".format(
            args.blast_eval_cutoff, args.blast_percent_identity_cutoff,
            args.aat_percent_coverage_cutoff
        )

    print("INFO: Parsing organism1 annotation")
    (assemblies, _features) = gff.get_gff3_features(args.organism1_annotation)

    print("INFO: Parsing AAT FASTA database")
    aat_seqs = utils.fasta_dict_from_file(args.aat_fasta_db)

    # keys are assembly IDs, value for each is a list of matches on them
    aat_matches = dict()
    aat_match_count = 0
    current_match = None

    ## IDs of features in organism 1 which overlap AAT
    o1_with_aat = list()

    print("INFO: Parsing organism1 AAT protein alignments")
    with open(args.organism1_aat_alignments) as aat_fh:
        for line in aat_fh:
            cols = line.split("\t")

            if line.startswith("#") or len(cols) != 9:
                continue

            assembly_id = cols[0]

            # skip this match if there were not predicted genes on the same assembly
            if assembly_id not in assemblies:
                continue

            if assembly_id not in aat_matches:
                aat_matches[assembly_id] = list()

            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]
            feature_id = gff.column_9_value(cols[8], "ID").replace('"', "")
            target = gff.column_9_value(cols[8], "Target")

            # keep only the first whitespace-delimited token of Target
            m = re.search(r"^(\S+)", target)
            if m:
                target = m.group(1)

            if cols[2] == "nucleotide_to_protein_match":
                current_match = things.Match(
                    id=feature_id, target_id=target,
                    subclass="nucleotide_to_protein_match", length=fmax - fmin
                )
                current_match.locate_on(target=assemblies[assembly_id], fmin=fmin,
                                        fmax=fmax, strand=strand)
                # Register the match as soon as it is created, under its own
                # assembly.  Registering it only when the *next* match shows up
                # would lose the final match of the file and would file a match
                # under the following row's assembly.  Parts added below attach
                # to this same object.
                aat_matches[assembly_id].append(current_match)
                aat_match_count += 1

            elif cols[2] == "match_part":
                parent_id = gff.column_9_value(cols[8], "Parent").replace('"', "")
                match_part = things.MatchPart(id=feature_id, parent=parent_id,
                                              length=fmax - fmin)
                match_part.locate_on(target=assemblies[assembly_id], fmin=fmin,
                                     fmax=fmax, strand=strand)
                current_match.add_part(match_part)

    print("INFO: Parsed {0} protein alignment chains".format(aat_match_count))

    print("INFO: Comparing organism1's mRNAs with AAT match coordinates")
    for assembly_id in assemblies:
        if assembly_id not in aat_matches:
            continue

        assembly = assemblies[assembly_id]

        for gene in assembly.genes():
            for mRNA in gene.mRNAs():
                if debugging_transcript is not None:
                    if mRNA.id == debugging_transcript:
                        print("DEBUG: processing debugging transcript: {0}".format(mRNA.id))
                    else:
                        continue

                for aat_match in aat_matches[assembly_id]:
                    overlap_size = mRNA.overlap_size_with(aat_match)
                    if overlap_size is None:
                        continue

                    # this shouldn't be possible, but check just in case
                    if overlap_size > mRNA.length:
                        raise Exception(
                            "ERROR: overlap size ({0}) > mRNA length ({1})".format(
                                overlap_size, mRNA.length)
                        )

                    if aat_match.target_id not in aat_seqs:
                        raise Exception(
                            "ERROR: Found match with target ID ({0}) but didn't find a FASTA entry for it via -aatdb".format(
                                aat_match.target_id
                            )
                        )

                    # this is a protein length, so x3
                    match_target_length = len(aat_seqs[aat_match.target_id]["s"]) * 3

                    (mRNA_percent_coverage, target_percent_coverage) = calculate_fragmented_coverage(
                        mRNA, aat_match, match_target_length
                    )

                    if (mRNA_percent_coverage >= args.aat_percent_coverage_cutoff
                            and target_percent_coverage >= args.aat_percent_coverage_cutoff):
                        o1_with_aat.append(mRNA.id)
                        break  # only need to see if one matched

    print("INFO: Found {0} mRNAs in org1 with overlapping fungi AAT coordinates".format(len(o1_with_aat)))

    # key=org1_transcript_id, value=org2_transcript_id
    top_blast_hits = dict()

    print("INFO: parsing BLAST results vs. org2")
    with open(args.organism1_blast_alignments) as blast_fh:
        for line in blast_fh:
            cols = line.split("\t")

            # column 19 is compared against the e-value cutoff and column 10
            # against the percent-identity cutoff (0-based indexes)
            if float(cols[19]) > args.blast_eval_cutoff:
                continue

            if float(cols[10]) < args.blast_percent_identity_cutoff:
                continue

            # if we survived until here, this one's good.
            top_blast_hits[cols[0]] = cols[5]

    print("INFO: Comparing overlap between AAT-matched proteins and BLAST ones")
    o1_with_o2 = [mRNA_id for mRNA_id in o1_with_aat if mRNA_id in top_blast_hits]

    print(
        "INFO: Found {0} mRNAs in org1 with overlapping AAT coordinates and BLAST hit to org2".format(len(o1_with_o2))
    )

    with open(args.output_id_list, "wt") as id_list_fh:
        for mRNA_id in o1_with_o2:
            id_list_fh.write("{0}\n".format(mRNA_id))
last_gene = None for qry_gene in things: if qry_gene.id in handled_ids: continue ## mark this one as handled handled_ids[qry_gene.id] = 1 nonoverlapping_set.append(qry_gene) <<<<<<< .mine current_assembly = assemblies[mol_id] rfmin = int(cols[3]) - 1 rfmax = int(cols[4]) rstrand = None feat_id = gff.column_9_value(cols[8], 'ID') parent_id = gff.column_9_value(cols[8], 'Parent') parent_feat = None if parent_id is not None: if parent_id in features: parent_feat = features[parent_id] else: raise Exception("Error in GFF3: Parent {0} referenced by a child feature before it was defined".format(parent_id) ) #print("Processing feature: ({0})".format(feat_id)) if cols[6] == '-': strand = -1 elif cols[6] == '+': strand = 1
def main():
    """Convert a PASA GFF file of cDNA_match rows into GFF3 gene models.

    Consecutive rows sharing the same ``ID`` are treated as one gene with a
    single mRNA (``<ID>.mRNA``).  Every row adds one CDS and one exon to that
    mRNA.  The gene and mRNA span the minimum start to the maximum end seen
    among their rows and take the strand of the first row.  A gene is written
    when a row with a different ``ID`` appears, and once more at end of input.

    Raises:
        Exception: if a 9-column row has a type other than 'cDNA_match'.
    """
    parser = argparse.ArgumentParser(
        description='Convert PASA GFF file to canonical gene models')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to a GFF file created by PASA')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='Path to an output file to be created')
    parser.add_argument('-s', '--source', type=str, required=False, default='PASA',
                        help='Value to use for the 2nd (source) column')
    args = parser.parse_args()

    def _finish_gene(gene, mRNA, assembly, fmin, fmax, strand, fh):
        """Locate the gene and its mRNA on the assembly, link them, and print."""
        mRNA.locate_on(target=assembly, fmin=fmin, fmax=fmax, strand=strand)
        gene.locate_on(target=assembly, fmin=fmin, fmax=fmax, strand=strand)
        gene.add_mRNA(mRNA)
        assembly.add_gene(gene)
        gene.print_as(fh=fh, source=args.source, format='gff3')

    assemblies = dict()
    current_assembly = None

    gene = None
    mRNA = None
    gene_fmin = None
    gene_fmax = None
    gene_strand = None

    ## Used for tracking the exon count for each gene (for ID purposes)
    exon_count_by_mRNA = dict()

    with open(args.output, mode='wt', encoding='utf-8') as fout, \
            open(args.input) as fin:
        fout.write("##gff-version 3\n")

        for line in fin:
            cols = line.split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]
            feat_id = gff.column_9_value(cols[8], 'ID')

            # we expect all columns to be cDNA_match
            if feat_type != 'cDNA_match':
                raise Exception("ERROR: expected all columns to be of type 'cDNA_match' but found a {0}".format(feat_type))

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = things.Assembly(id=mol_id)

            feat_fmin = int(cols[3]) - 1
            feat_fmax = int(cols[4])
            feat_strand = cols[6]

            if gene is None or feat_id != gene.id:
                if gene is not None:
                    # finish the previous one first; current_assembly still
                    # refers to the assembly of the previous row here
                    _finish_gene(gene, mRNA, current_assembly,
                                 gene_fmin, gene_fmax, gene_strand, fout)

                # now start a new one
                gene = things.Gene(id=feat_id)
                mRNA = things.mRNA(id="{0}.mRNA".format(feat_id), parent=gene)
                exon_count_by_mRNA[mRNA.id] = 0

                gene_fmin = feat_fmin
                gene_fmax = feat_fmax
                gene_strand = feat_strand

            current_assembly = assemblies[mol_id]

            # each row is a new CDS/exon for the current mRNA
            CDS = things.CDS(id="{0}.CDS".format(feat_id), parent=mRNA.id)
            # FIX THIS PHASE
            CDS.locate_on(target=current_assembly, fmin=feat_fmin, fmax=feat_fmax,
                          strand=feat_strand, phase='.')
            mRNA.add_CDS(CDS)

            exon_count_by_mRNA[mRNA.id] += 1
            exon_id = "{0}.exon{1}".format(mRNA.id, exon_count_by_mRNA[mRNA.id])
            exon = things.Exon(id=exon_id, parent=mRNA.id)
            exon.locate_on(target=current_assembly, fmin=feat_fmin, fmax=feat_fmax,
                           strand=feat_strand)
            mRNA.add_exon(exon)

            # widen the gene span to cover this row
            if feat_fmin < gene_fmin:
                gene_fmin = feat_fmin

            if feat_fmax > gene_fmax:
                gene_fmax = feat_fmax

        # don't orphan the last one
        if gene is not None:
            _finish_gene(gene, mRNA, current_assembly,
                         gene_fmin, gene_fmax, gene_strand, fout)
def main():
    """Select organism-1 mRNAs supported by both AAT alignments and BLAST hits.

    Steps, in order:

    1. Load the organism-1 annotation (GFF3) and the AAT FASTA database.
    2. Parse the AAT GFF3 into match objects (with their match_parts),
       grouped by the assembly they are located on.  Rows on assemblies
       absent from the annotation are ignored.
    3. For every mRNA, look for one AAT match that overlaps it and whose
       mRNA coverage and target coverage both reach
       ``--aat_percent_coverage_cutoff``.
    4. Parse the BLAST btab file, keeping rows that pass the e-value and
       percent-identity cutoffs.
    5. Write the IDs of mRNAs passing both tests, one per line, to
       ``--output_id_list`` (a name is derived from the cutoffs if omitted).

    Raises:
        Exception: if an overlap is larger than the mRNA itself, or if a
            match target has no entry in the AAT FASTA database.
    """
    parser = argparse.ArgumentParser(
        description='Put a description of your script here')

    parser.add_argument('-a', '--organism1_annotation', type=str, required=True,
                        help='Annotation GFF for organism 1')
    parser.add_argument('-p', '--organism1_aat_alignments', type=str, required=True,
                        help='Path to AAT GFF3 (match/match_part)')
    parser.add_argument('-aatdb', '--aat_fasta_db', type=str, required=True,
                        help='Path to FASTA database that was used in AAT')
    parser.add_argument('-b', '--organism1_blast_alignments', type=str, required=True,
                        help='Path to BLASTp btab file vs.organism 2 proteins')
    parser.add_argument('-be', '--blast_eval_cutoff', type=float, required=False,
                        default=1e-5, help='BLAST e-value cutoff')
    # argparse %-formats help strings, so a literal percent sign must be
    # written as %% or rendering the help text fails.
    parser.add_argument('-bpi', '--blast_percent_identity_cutoff', type=float,
                        required=False, default=0, help='BLAST %%identity cutoff')
    parser.add_argument('-ppc', '--aat_percent_coverage_cutoff', type=float,
                        required=False, default=0,
                        help='%% coverage of the query protein by the AAT match')
    parser.add_argument('-o', '--output_id_list', type=str, required=False,
                        help='List of IDs from organism1 that passed')
    args = parser.parse_args()

    # set to an mRNA ID to restrict the comparison to that one transcript
    debugging_transcript = None

    ## if the output file wasn't passed build one from the other parameters
    if args.output_id_list is None:
        args.output_id_list = "training_ids.be_{0}.bpi_{1}.ppc_{2}.list".format(
            args.blast_eval_cutoff, args.blast_percent_identity_cutoff,
            args.aat_percent_coverage_cutoff)

    print("INFO: Parsing organism1 annotation")
    (assemblies, _features) = gff.get_gff3_features(args.organism1_annotation)

    print("INFO: Parsing AAT FASTA database")
    aat_seqs = utils.fasta_dict_from_file(args.aat_fasta_db)

    # keys are assembly IDs, value for each is a list of matches on them
    aat_matches = dict()
    aat_match_count = 0
    current_match = None

    ## IDs of features in organism 1 which overlap AAT
    o1_with_aat = list()

    print("INFO: Parsing organism1 AAT protein alignments")
    with open(args.organism1_aat_alignments) as aat_fh:
        for line in aat_fh:
            cols = line.split("\t")

            if line.startswith('#') or len(cols) != 9:
                continue

            assembly_id = cols[0]

            # skip this match if there were not predicted genes on the same assembly
            if assembly_id not in assemblies:
                continue

            if assembly_id not in aat_matches:
                aat_matches[assembly_id] = list()

            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]
            feature_id = gff.column_9_value(cols[8], 'ID').replace('"', '')
            target = gff.column_9_value(cols[8], 'Target')

            # keep only the first whitespace-delimited token of Target
            m = re.search(r"^(\S+)", target)
            if m:
                target = m.group(1)

            if cols[2] == 'nucleotide_to_protein_match':
                current_match = things.Match(
                    id=feature_id, target_id=target,
                    subclass='nucleotide_to_protein_match', length=fmax - fmin)
                current_match.locate_on(target=assemblies[assembly_id], fmin=fmin,
                                        fmax=fmax, strand=strand)
                # Register the match as soon as it is created, under its own
                # assembly.  Registering it only when the *next* match shows up
                # would lose the final match of the file and would file a match
                # under the following row's assembly.  Parts added below attach
                # to this same object.
                aat_matches[assembly_id].append(current_match)
                aat_match_count += 1

            elif cols[2] == 'match_part':
                parent_id = gff.column_9_value(cols[8], 'Parent').replace('"', '')
                match_part = things.MatchPart(id=feature_id, parent=parent_id,
                                              length=fmax - fmin)
                match_part.locate_on(target=assemblies[assembly_id], fmin=fmin,
                                     fmax=fmax, strand=strand)
                current_match.add_part(match_part)

    print("INFO: Parsed {0} protein alignment chains".format(aat_match_count))

    print("INFO: Comparing organism1's mRNAs with AAT match coordinates")
    for assembly_id in assemblies:
        if assembly_id not in aat_matches:
            continue

        assembly = assemblies[assembly_id]

        for gene in assembly.genes():
            for mRNA in gene.mRNAs():
                if debugging_transcript is not None:
                    if mRNA.id == debugging_transcript:
                        print("DEBUG: processing debugging transcript: {0}".format(mRNA.id))
                    else:
                        continue

                for aat_match in aat_matches[assembly_id]:
                    overlap_size = mRNA.overlap_size_with(aat_match)
                    if overlap_size is None:
                        continue

                    # this shouldn't be possible, but check just in case
                    if overlap_size > mRNA.length:
                        raise Exception(
                            "ERROR: overlap size ({0}) > mRNA length ({1})"
                            .format(overlap_size, mRNA.length))

                    if aat_match.target_id not in aat_seqs:
                        raise Exception(
                            "ERROR: Found match with target ID ({0}) but didn't find a FASTA entry for it via -aatdb"
                            .format(aat_match.target_id))

                    # this is a protein length, so x3
                    match_target_length = len(aat_seqs[aat_match.target_id]['s']) * 3

                    (mRNA_percent_coverage,
                     target_percent_coverage) = calculate_fragmented_coverage(
                         mRNA, aat_match, match_target_length)

                    if (mRNA_percent_coverage >= args.aat_percent_coverage_cutoff
                            and target_percent_coverage >= args.aat_percent_coverage_cutoff):
                        o1_with_aat.append(mRNA.id)
                        break  # only need to see if one matched

    print("INFO: Found {0} mRNAs in org1 with overlapping fungi AAT coordinates".
          format(len(o1_with_aat)))

    # key=org1_transcript_id, value=org2_transcript_id
    top_blast_hits = dict()

    print("INFO: parsing BLAST results vs. org2")
    with open(args.organism1_blast_alignments) as blast_fh:
        for line in blast_fh:
            cols = line.split("\t")

            # column 19 is compared against the e-value cutoff and column 10
            # against the percent-identity cutoff (0-based indexes)
            if float(cols[19]) > args.blast_eval_cutoff:
                continue

            if float(cols[10]) < args.blast_percent_identity_cutoff:
                continue

            # if we survived until here, this one's good.
            top_blast_hits[cols[0]] = cols[5]

    print("INFO: Comparing overlap between AAT-matched proteins and BLAST ones")
    o1_with_o2 = [mRNA_id for mRNA_id in o1_with_aat if mRNA_id in top_blast_hits]

    print(
        "INFO: Found {0} mRNAs in org1 with overlapping AAT coordinates and BLAST hit to org2"
        .format(len(o1_with_o2)))

    with open(args.output_id_list, 'wt') as id_list_fh:
        for mRNA_id in o1_with_o2:
            id_list_fh.write("{0}\n".format(mRNA_id))
def main():
    """Rewrite glimmerHMM GFF output as GFF3 with gene, mRNA, exon and CDS rows.

    * Comment lines are copied through unchanged.
    * Rows without exactly 9 tab-separated columns are dropped.
    * Each ``mRNA`` row is emitted twice: first as a ``gene`` row with the
      original attributes, then as the mRNA with ID/Name set to
      ``<ID>.mRNA``.
    * Each ``CDS`` row is emitted twice: first as an ``exon`` row (phase
      column set to '.', ID ``<Parent>.exon.<n>`` with n counted from 0 per
      parent), then as the CDS with ID/Name ``<Parent>.cds``.  Both get
      ``Parent=<Parent>.mRNA``.
    * Rows of any other type are not written.
    """
    parser = argparse.ArgumentParser(
        description='Converts glimmerHMM GFF output to GFF3')

    # output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input file to parse')
    parser.add_argument('-o', '--output_file', type=str, required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    # next exon ordinal to hand out, keyed by the CDS row's Parent value
    next_exon_num = defaultdict(int)

    with open(args.output_file, 'w') as fout, open(args.input_file, 'r') as fin:
        for line in fin:
            if line.startswith('#'):
                fout.write(line)
                continue

            cols = line.rstrip().split("\t")

            if len(cols) != 9:
                continue

            feat_type = cols[2]

            # The coordinates are not used below, but converting them makes a
            # row with non-numeric coordinates fail with ValueError here.
            int(cols[3])
            int(cols[4])

            feat_id = gff.column_9_value(cols[8], 'ID')
            parent = gff.column_9_value(cols[8], 'Parent')

            if feat_type == 'mRNA':
                # the gene row is a copy taken before the mRNA attributes change
                gene_cols = list(cols)
                gene_cols[2] = 'gene'

                mrna_id = "{0}.mRNA".format(feat_id)
                cols[8] = gff.set_column_9_value(cols[8], 'ID', mrna_id)
                cols[8] = gff.set_column_9_value(cols[8], 'Name', mrna_id)
                cols[8] = gff.order_column_9(cols[8])

                # print the gene and mRNA
                fout.write("{0}\n".format("\t".join(gene_cols)))
                fout.write("{0}\n".format("\t".join(cols)))

            elif feat_type == 'CDS':
                # the exon row starts from the untouched CDS columns
                exon_cols = list(cols)
                mrna_parent = "{0}.mRNA".format(parent)

                cds_id = "{0}.cds".format(parent)
                cols[8] = gff.set_column_9_value(cols[8], 'ID', cds_id)
                cols[8] = gff.set_column_9_value(cols[8], 'Name', cds_id)
                cols[8] = gff.set_column_9_value(cols[8], 'Parent', mrna_parent)
                cols[8] = gff.order_column_9(cols[8])

                exon_id = "{0}.exon.{1}".format(parent, next_exon_num[parent])
                next_exon_num[parent] += 1

                exon_cols[2] = 'exon'
                exon_cols[7] = '.'
                exon_cols[8] = gff.set_column_9_value(exon_cols[8], 'ID', exon_id)
                exon_cols[8] = gff.set_column_9_value(exon_cols[8], 'Name', exon_id)
                exon_cols[8] = gff.set_column_9_value(exon_cols[8], 'Parent', mrna_parent)
                exon_cols[8] = gff.order_column_9(exon_cols[8])

                fout.write("{0}\n".format("\t".join(exon_cols)))
                fout.write("{0}\n".format("\t".join(cols)))
def main():
    """Convert Prodigal GFF output into GFF3 gene models.

    Every 9-column ``CDS`` row becomes a complete gene written immediately:
    a gene with the row's ID, one mRNA (``<ID>.t1``), one CDS
    (``<ID>.t1.cds``, phase taken from column 8) and one exon
    (``<ID>.t1.exon1``), all sharing the row's coordinates and strand.
    Comment lines, rows without 9 columns and rows of other types produce
    no output.

    Raises:
        Exception: if two CDS rows would produce the same mRNA ID.
    """
    parser = argparse.ArgumentParser(
        description='Convert GFF output from Prodigal into GFF3 format')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to a GFF file created by Prodigal')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    assemblies = dict()

    ## Used for tracking the exon count for each mRNA (for ID purposes)
    exon_count_by_mRNA = dict()

    with open(args.output, mode='wt', encoding='utf-8') as fout, \
            open(args.input) as fin:
        fout.write("##gff-version 3\n")

        for line in fin:
            if line.startswith("#"):
                continue

            cols = line.split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]
            feat_id = gff.column_9_value(cols[8], 'ID')

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = things.Assembly(id=mol_id)

            current_assembly = assemblies[mol_id]

            if feat_type != "CDS":
                continue

            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]

            # gene
            gene = things.Gene(id=feat_id)
            gene.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)

            # mRNA
            mrna_id = feat_id + '.t1'
            mRNA = things.mRNA(id=mrna_id, parent=gene)
            mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
            gene.add_mRNA(mRNA)

            # The counter is keyed by mRNA ID, so the duplicate test must use
            # the mRNA ID too; testing the bare feature ID could never match.
            if mrna_id in exon_count_by_mRNA:
                raise Exception(
                    "ERROR: two different mRNAs found with same ID: {0}".
                    format(feat_id))
            exon_count_by_mRNA[mrna_id] = 0

            # CDS / exons
            CDS = things.CDS(id=mrna_id + '.cds', parent=mRNA)
            CDS.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                          strand=strand, phase=int(cols[7]))
            mRNA.add_CDS(CDS)

            # exons weren't explicitly defined in the input file, so we need
            # to derive new IDs for them
            exon_count_by_mRNA[mrna_id] += 1
            exon_id = "{0}.exon{1}".format(mrna_id, exon_count_by_mRNA[mrna_id])
            exon = things.Exon(id=exon_id, parent=mRNA)
            exon.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
            mRNA.add_exon(exon)

            gene.print_as(fh=fout, source='Prodigal_v2.6.3', format='gff3')
def main():
    """One-off repair script: trim CDS features to their polypeptide span.

    All paths are hard-coded.  Gene models are loaded from
    'canonical.flawed.gff3'; polypeptide rows are read from the ILRI GFF and
    indexed by their Parent value.  For each mRNA that has a polypeptide,
    every CDS is compared with it: CDSs for which ``CDS < polypeptide`` or
    ``CDS > polypeptide`` holds are deleted, and those satisfying ``<=`` /
    ``>=`` have their fmin / fmax set to the polypeptide's.  Every gene is
    then written to 'canonical.corrected.gff3'.

    NOTE(review): the meaning of the comparison operators is defined by the
    feature classes, which are not visible here; presumably ``<`` / ``>``
    mean "entirely before / after" and ``<=`` / ``>=`` "overhanging" --
    confirm against the library.
    """
    flawed_gff_file = 'canonical.flawed.gff3'
    ilri_gff = 'Theileria-all-Theileria1_ourids.gff'
    source = 'GenBank'
    out_gff = 'canonical.corrected.gff3'

    with open(out_gff, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        (assemblies, features) = gff.get_gff3_features(flawed_gff_file)
        print("INFO: loaded {0} assemblies and {1} features".format(
            len(assemblies), len(features)))

        # keyed by the polypeptide's Parent attribute (matched to mRNA IDs below)
        polypeptides = dict()

        with open(ilri_gff) as ilri_fh:
            for line in ilri_fh:
                cols = line.split("\t")

                if len(cols) != 9 or cols[2] != 'polypeptide':
                    continue

                pep_id = gff.column_9_value(cols[8], 'ID')
                parent = gff.column_9_value(cols[8], 'Parent')

                polypeptide = things.Polypeptide(id=pep_id, parent=parent)
                polypeptide.locate_on(target=assemblies[cols[0]],
                                      fmin=int(cols[3]) - 1,
                                      fmax=int(cols[4]),
                                      strand=cols[6])
                polypeptides[parent] = polypeptide

        print("DEBUG: loaded {0} polypeptides from ILRI file".format(
            len(polypeptides)))

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    if mRNA.id not in polypeptides:
                        print(
                            "DEBUG: {0} not found as a parent to any polypeptide".
                            format(mRNA.id))
                        continue

                    polypeptide = polypeptides[mRNA.id]

                    # pull this outside of the iteration since iterating might
                    # delete some
                    CDSs = mRNA.CDSs()

                    for CDS in CDSs:
                        # lower edge
                        if CDS < polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS <= polypeptide:
                            CDS.location().fmin = polypeptide.location().fmin

                        # upper edge (evaluated independently of the lower edge)
                        if CDS > polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS >= polypeptide:
                            CDS.location().fmax = polypeptide.location().fmax

                gene.print_as(fh=fout, source=source, format='gff3')
def main():
    """Write only those genes of a GFF3 file that keep at least one mRNA.

    The ID list file supplies the mRNA IDs to keep (lines of 2 characters or
    fewer are ignored).  Rows are grouped per ``gene`` row: an ``mRNA`` row
    switches keeping on or off depending on whether its ID is listed, and
    every following non-gene row inherits that decision until the next mRNA.
    A gene is written, with its kept rows, only if at least one row besides
    the gene row itself was kept.  Rows without 9 columns are skipped.

    Output goes to ``--output_file`` when given, otherwise to STDOUT.
    """
    # The text is the description; passed positionally it would be taken as
    # the program name shown in the usage line.
    parser = argparse.ArgumentParser(
        description='Filter the genes of a GFF3 file by mRNA child IDs')

    ## output file to be written
    parser.add_argument('-i', '--input_gff3', type=str, required=True,
                        help='GFF3 file of source molecules')
    parser.add_argument('-l', '--id_list', type=str, required=True,
                        help='List file of mRNA IDs to keep')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Optional output file path (else STDOUT)')
    args = parser.parse_args()

    def _flush(gene_lines, fh):
        """Write a buffered gene if it holds more than just the gene row."""
        if len(gene_lines) > 1:
            for li in gene_lines:
                fh.write("{0}\n".format(li))

    # a set gives constant-time membership tests for every mRNA row
    ids_to_keep = set()
    with open(args.id_list) as id_fh:
        for line in id_fh:
            line = line.rstrip()
            if len(line) > 2:
                ids_to_keep.add(line)

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        fout.write("##gff-version 3\n")

        current_gene_lines = list()
        keep = False

        with open(args.input_gff3) as gff_fh:
            for line in gff_fh:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    continue

                feat_id = gff.column_9_value(cols[8], 'ID')
                feat_type = cols[2]

                if feat_type == 'gene':
                    # purge the current gene, if any
                    _flush(current_gene_lines, fout)

                    # reset; the keep decision must not carry over from the
                    # last mRNA of the previous gene
                    current_gene_lines = [line]
                    keep = False
                else:
                    if feat_type == 'mRNA':
                        keep = feat_id in ids_to_keep

                    if keep:
                        current_gene_lines.append(line)

        # the final gene has no following gene row to trigger its output
        _flush(current_gene_lines, fout)
    finally:
        # never close STDOUT, only a file we opened ourselves
        if fout is not sys.stdout:
            fout.close()
def main():
    """Add ``locus_tag`` attributes to the gene and RNA rows of a GFF3 file.

    For each ``gene`` row a locus tag is chosen: the value from ``--id_file``
    if the gene's ID is mapped there, otherwise a generated one built from
    ``--prefix``, a zero-padded counter and (when ``--molecule_map`` is
    given) the molecule's token.  Rows whose type ends in 'RNA' inherit the
    tag of their parent gene.  All rows, modified or not, are written out;
    rows without 9 columns are copied through.

    Output goes to ``--output_file`` when given, otherwise to STDOUT.

    Raises:
        Exception: if a ``--custom`` mode lacks its required options, a
            mapped ID does not follow the bmicroti convention, a molecule is
            missing from the molecule map, an RNA row's parent has no locus
            tag yet, or a duplicate locus tag cannot be replaced by a
            different one.
    """
    parser = argparse.ArgumentParser(
        description='Adds locus tag identifiers to GFF3 features')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='TA file of source molecules')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Optional output file path (else STDOUT)')
    parser.add_argument('-p', '--prefix', type=str, required=True,
                        help='The prefix portion of IDs to be generated')
    parser.add_argument('-a', '--padding', type=int, required=True,
                        help='Specify the minimum with to reserve for the numeric portion of the IDs. Smaller numbers will be zero-padded.')
    parser.add_argument('-n', '--interval', type=int, required=False, default=1,
                        help='Interval between generated identifiers')
    parser.add_argument('-s', '--starting_id', type=int, required=False, default=0,
                        help='Initial numeric portion of IDs to be generated (do not zero-pad)')
    parser.add_argument('-d', '--id_file', type=str, required=False,
                        help='Pass a 2-column file of IDs to retain (in case you have mapped genes, for example)')
    parser.add_argument('-m', '--molecule_map', type=str, required=False,
                        help='Pass a 2-column file of molecule->token identifiers (see documentation)')
    parser.add_argument('-c', '--custom', type=str, required=False,
                        help='For custom parsing steps. Most should ignore this.')
    args = parser.parse_args()
    check_arguments(args)

    # used to store locus_tags associated with each gene (so children can inherit)
    gene_loci = dict()
    next_id = args.starting_id
    last_molecule = None

    id_mapping = parse_mapping_file(args.id_file)
    mol_mapping = parse_mapping_file(args.molecule_map)
    # a set gives constant-time duplicate checks
    loci_assigned = set()

    ## if using Joana's custom options, check assumptions
    if args.custom == 'joana':
        if args.molecule_map is None or args.id_file is None:
            raise Exception("ERROR: Expected --molecule_map and --id_file options when using --custom=joana")

        ## need to process the ID map to reformat IDs
        for mapped in id_mapping:
            # TP05_0002 -> TpMuguga_05g00002
            m = re.match(r'TP(\d\d)_(\d+)', id_mapping[mapped])
            if m:
                id_mapping[mapped] = "{0}_{1}g0{2}".format(args.prefix, m.group(1), m.group(2))

    elif args.custom == 'bmicroti':
        microti_map = {'I': '01', 'II': '02', 'III': '03', 'IV': '04'}

        if args.molecule_map is None or args.id_file is None:
            raise Exception("ERROR: Expected --molecule_map and --id_file options when using --custom=bmicroti")

        for mapped in id_mapping:
            m = re.match(r'BBM_(\D+)(\d+)', id_mapping[mapped])
            if m:
                print("Changing id from {0} to ".format(mapped))
                id_mapping[mapped] = "{0}_{1}g{2}".format(args.prefix, microti_map[m.group(1)], m.group(2))
                print(id_mapping[mapped])
            else:
                raise Exception("ERROR: id ({0}) didn't match expected convention.".format(id_mapping[mapped]))

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        # trailing digits of the most recently assigned locus tag
        last_number_portion_assigned = 0

        with open(args.input_file) as fin:
            for line in fin:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    fout.write(line + "\n")
                    continue

                if last_molecule is None or (
                        args.molecule_map is not None
                        and mol_mapping[cols[0]] != mol_mapping[last_molecule]):
                    print("Found molecule {0}, resetting id counter from {1}".format(cols[0], next_id))
                    next_id = args.starting_id
                    last_molecule = cols[0]

                # grab the ID column if any
                feat_id = gff.column_9_value(cols[8], 'ID')
                parent = gff.column_9_value(cols[8], 'Parent')
                feat_type = cols[2]

                if feat_type == 'gene':
                    previous_attempt = None

                    while True:
                        if feat_id in id_mapping:
                            locus_id = id_mapping[feat_id]
                        else:
                            if args.molecule_map is None:
                                locus_id = "{0}_{1}".format(args.prefix, str(next_id).zfill(args.padding))
                            else:
                                if cols[0] not in mol_mapping:
                                    raise Exception("ERROR: --molecule_map passed but {0} wasn't found in it.".format(cols[0]))

                                if args.custom == 'bmicroti':
                                    locus_id = "{0}_{2}g{1}".format(
                                        args.prefix,
                                        str(int(last_number_portion_assigned) + 1).zfill(args.padding),
                                        mol_mapping[cols[0]])
                                else:
                                    locus_id = "{0}_{2}g{1}".format(
                                        args.prefix,
                                        str(next_id).zfill(args.padding),
                                        mol_mapping[cols[0]])

                            # only generated IDs consume the counter
                            next_id += args.interval

                        cols[8] = gff.set_column_9_value(cols[8], 'locus_tag', locus_id)

                        ## make sure this wasn't generated already (possibly conflict
                        #   between --id_file and an auto-generated ID?
                        if locus_id not in loci_assigned:
                            break

                        # A retry that yields the same tag again (mapped IDs and
                        # the bmicroti scheme do not depend on the counter) would
                        # otherwise loop forever.
                        if locus_id == previous_attempt:
                            raise Exception(
                                "ERROR: locus tag {0} for gene {1} was already assigned and no alternative could be generated".format(locus_id, feat_id))

                        print("DEBUG: Duplicate ID assigned ({0}), trying again.".format(locus_id))
                        previous_attempt = locus_id

                    loci_assigned.add(locus_id)
                    gene_loci[feat_id] = locus_id

                    m = re.search(r"(\d+)$", locus_id)
                    if m:
                        last_number_portion_assigned = m.group(1)

                elif feat_type.endswith('RNA'):
                    if parent in gene_loci:
                        cols[8] = gff.set_column_9_value(cols[8], 'locus_tag', gene_loci[parent])
                    else:
                        raise Exception("ERROR: found RNA {0} whose parent {1} wasn't found yet".format(feat_id, parent))

                fout.write("\t".join(cols) + "\n")
    finally:
        # never close STDOUT, only a file we opened ourselves
        if fout is not sys.stdout:
            fout.close()
def main():
    """Convert StringTie GTF output into GFF3.

    Reads the GTF named by --input_file and writes GFF3 to --output_file, or
    to STDOUT when no output path is given.  Rows that do not have exactly 9
    tab-separated columns are reported with a "SKIPPING" message and ignored.

    Consecutive 'transcript' rows sharing a gene_id are grouped under one
    gene, which is located using the coordinates of its first transcript row.
    Every 'exon' row adds both a CDS and an exon feature to the most recent
    transcript.  A gene is printed when a transcript with a different gene_id
    is encountered, and the last gene is printed once the input is exhausted.

    Raises:
        Exception: if an 'exon' row appears before any 'transcript' row.
    """
    parser = argparse.ArgumentParser(
        description='A GTF -> GFF3 conversion script for StringTie output')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input GTF file')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output GFF file to be created')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    ofh = sys.stdout if args.output_file is None else open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        assemblies = dict()
        current_gene = None
        current_RNA = None
        current_CDS_phase = 0
        exon_count_by_RNA = defaultdict(int)

        with open(args.input_file, "r") as ifh:
            for line in ifh:
                cols = line.split("\t")

                if len(cols) != 9:
                    print("SKIPPING: {0}".format(line))
                    continue

                mol_id = cols[0]

                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)

                current_assembly = assemblies[mol_id]
                ftype = cols[2]
                # GTF coordinates are 1-based inclusive; fmin is shifted to 0-based
                fmin = int(cols[3]) - 1
                fmax = int(cols[4])
                strand = cols[6]

                # rewrite GTF 'key "value"' pairs as 'key="value"' so the GFF
                #  column 9 helper can be used to look values up
                col9 = cols[8].replace(' "', '="')
                gene_id = gff.column_9_value(col9, 'gene_id').replace('"', '')
                transcript_id = gff.column_9_value(col9, 'transcript_id').replace('"', '')

                if ftype == 'transcript':
                    if current_gene is None or current_gene.id != gene_id:
                        # a new gene starts here, so flush the previous one first
                        if current_gene is not None:
                            current_gene.print_as(fh=ofh, source='StringTie', format='gff3')

                        current_gene = things.Gene(id=gene_id)
                        current_gene.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)

                    mRNA = things.mRNA(id=transcript_id, parent=current_gene)
                    mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                    current_gene.add_mRNA(mRNA)
                    current_RNA = mRNA
                    exon_count_by_RNA[transcript_id] = 0
                    current_CDS_phase = 0

                elif ftype == 'exon':
                    if current_RNA is None:
                        raise Exception(
                            "ERROR: found an exon row before any transcript row: {0}".format(line.rstrip()))

                    exon_count_by_RNA[transcript_id] += 1

                    cds_id = "{0}.CDS.{1}".format(current_RNA.id, exon_count_by_RNA[current_RNA.id])
                    CDS = things.CDS(id=cds_id, parent=current_RNA)
                    CDS.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand,
                                  phase=current_CDS_phase)
                    current_RNA.add_CDS(CDS)

                    # calculate the starting phase for the next CDS feature (in case there is one)
                    current_CDS_phase = 3 - (((fmax - fmin) - current_CDS_phase) % 3)
                    if current_CDS_phase == 3:
                        current_CDS_phase = 0

                    exon_id = "{0}.exon.{1}".format(current_RNA.id, exon_count_by_RNA[current_RNA.id])
                    exon = things.Exon(id=exon_id, parent=current_RNA)
                    exon.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                    current_RNA.add_exon(exon)

        # don't forget to do the last gene, if there were any
        if current_gene is not None:
            current_gene.print_as(fh=ofh, source='StringTie', format='gff3')
    finally:
        # never close STDOUT, only a file this function opened itself
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Convert glimmerHMM GFF output into gene/mRNA/exon/CDS GFF3 rows.

    Lines starting with '#' are copied through unchanged.  Rows that do not
    have exactly 9 tab-separated columns, and rows whose type is neither
    'mRNA' nor 'CDS', are dropped.

    For each 'mRNA' row two rows are written: a 'gene' row that is a copy of
    the input row with only the type changed, followed by the mRNA row with
    its ID and Name rewritten to "<ID>.mRNA" and its Parent set to the gene's
    ID so the two rows are linked.

    For each 'CDS' row two rows are written: an 'exon' row (phase column set
    to '.') followed by the CDS row.  Both are re-parented to
    "<Parent>.mRNA".  Exon IDs are "<Parent>.exon.<n>" where n counts from 0
    per original Parent value; every CDS of a parent gets the ID
    "<Parent>.cds".
    """
    parser = argparse.ArgumentParser(
        description='Converts glimmerHMM GFF output to GFF3')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input file to parse')
    parser.add_argument('-o', '--output_file', type=str, required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    # next exon ordinal to hand out, keyed by the CDS row's original Parent value
    next_exon_num = defaultdict(int)

    with open(args.output_file, 'w') as fout, open(args.input_file, 'r') as fin:
        for line in fin:
            if line.startswith('#'):
                fout.write(line)
                continue

            cols = line.rstrip().split("\t")

            if len(cols) != 9:
                continue

            feat_type = cols[2]
            feat_id = gff.column_9_value(cols[8], 'ID')
            parent = gff.column_9_value(cols[8], 'Parent')

            if feat_type == 'mRNA':
                # the gene row keeps the input row's attributes, including its ID
                gene_cols = list(cols)
                gene_cols[2] = 'gene'

                mrna_id = "{0}.mRNA".format(feat_id)
                cols[8] = gff.set_column_9_value(cols[8], 'ID', mrna_id)
                cols[8] = gff.set_column_9_value(cols[8], 'Name', mrna_id)

                # link the mRNA to the gene row written just before it; without
                #  this the gene has no children in the GFF3 hierarchy
                if feat_id is not None:
                    cols[8] = gff.set_column_9_value(cols[8], 'Parent', feat_id)

                cols[8] = gff.order_column_9(cols[8])

                # print the gene and mRNA
                fout.write("{0}\n".format("\t".join(gene_cols)))
                fout.write("{0}\n".format("\t".join(cols)))

            elif feat_type == 'CDS':
                # copy before the CDS attributes are rewritten so the exon row
                #  starts from the input row's column 9
                exon_cols = list(cols)
                mrna_id = "{0}.mRNA".format(parent)
                cds_id = "{0}.cds".format(parent)

                cols[8] = gff.set_column_9_value(cols[8], 'ID', cds_id)
                cols[8] = gff.set_column_9_value(cols[8], 'Name', cds_id)
                cols[8] = gff.set_column_9_value(cols[8], 'Parent', mrna_id)
                cols[8] = gff.order_column_9(cols[8])

                exon_id = "{0}.exon.{1}".format(parent, next_exon_num[parent])
                next_exon_num[parent] += 1

                exon_cols[2] = 'exon'
                exon_cols[7] = '.'
                exon_cols[8] = gff.set_column_9_value(exon_cols[8], 'ID', exon_id)
                exon_cols[8] = gff.set_column_9_value(exon_cols[8], 'Name', exon_id)
                exon_cols[8] = gff.set_column_9_value(exon_cols[8], 'Parent', mrna_id)
                exon_cols[8] = gff.order_column_9(exon_cols[8])

                fout.write("{0}\n".format("\t".join(exon_cols)))
                fout.write("{0}\n".format("\t".join(cols)))
def main():
    """Copy a GFF3 file, dropping parentless features that nothing refers to.

    The input is read twice.  The first pass decides, per line, whether the
    line is kept unconditionally and collects every value seen in a Parent
    attribute.  The second pass writes the output.

    Kept unconditionally: lines starting with '#', lines that do not have
    exactly 9 tab-separated columns, and feature rows that have a Parent
    attribute (whether or not that parent exists in the file).

    A feature row without a Parent attribute is kept only if it has an ID
    and that ID was named as the Parent of some other row; otherwise it is
    omitted and a "WARN: removing this line" message is printed.  Retained
    parentless rows are written with trailing whitespace stripped.
    """
    parser = argparse.ArgumentParser(
        description='Removes orphaned features in a GFF3 file')

    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='Output GFF3 file to write')
    args = parser.parse_args()

    # Saving memory by tracking one keep/omit flag per line instead of storing
    #  the lines themselves.  Flags are positional (not keyed by ID) since it's
    #  technically legal for a feature to have no identifier at all.
    keep_line = list()

    # every value that appeared in some row's Parent attribute
    referenced_parents = set()

    with open(args.input) as infile:
        for line in infile:
            if line.startswith('#'):
                keep_line.append(True)
                continue

            cols = line.rstrip().split("\t")

            if len(cols) != 9:
                keep_line.append(True)
                continue

            parent = gff.column_9_value(cols[8], 'Parent')

            if parent is None:
                # provisional: decided in the second pass, once all children are known
                keep_line.append(False)
            else:
                keep_line.append(True)
                referenced_parents.add(parent)

        infile.seek(0)

        with open(args.output, 'wt') as outfh:
            for keep, line in zip(keep_line, infile):
                if keep:
                    outfh.write(line)
                    continue

                # only 9-column rows are ever flagged False, so column 9 exists here
                line = line.rstrip()
                cols = line.split("\t")
                feat_id = gff.column_9_value(cols[8], 'ID')

                if feat_id is not None and feat_id in referenced_parents:
                    outfh.write("{0}\n".format(line))
                else:
                    print("WARN: removing this line: {0}".format(line))
def main():
    """Add locus_tag attributes to the gene and RNA rows of a GFF3 file.

    Reads --input_file line by line and writes every line to --output_file
    (or STDOUT when it is omitted).  Lines that do not have exactly 9
    tab-separated columns are written back with trailing whitespace
    stripped.  For 9-column rows:

    * 'gene' rows receive a locus_tag.  If the gene's ID is a key of the
      --id_file mapping, the mapped value is used; otherwise one is generated
      from --prefix, a zero-padded counter (--padding, --starting_id,
      --interval) and, when --molecule_map is given, the token mapped to the
      row's molecule.
    * rows whose type ends in 'RNA' inherit the locus_tag of their Parent
      gene.
    * all other rows are written unchanged.

    Progress/debug messages are emitted with print(), i.e. to STDOUT, which
    is the same stream as the GFF3 output when --output_file is omitted.

    Raises:
        Exception: when --custom is 'joana' or 'bmicroti' without both
            --molecule_map and --id_file; when a 'bmicroti' mapped ID does
            not match the expected pattern; when a gene's molecule is missing
            from the molecule map; when an RNA row's Parent has not been seen
            as a gene yet.
    """
    parser = argparse.ArgumentParser(
        description='Adds locus tag identifiers to GFF3 features')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='TA file of source molecules')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Optional output file path (else STDOUT)')
    parser.add_argument('-p', '--prefix', type=str, required=True,
                        help='The prefix portion of IDs to be generated')
    parser.add_argument(
        '-a', '--padding', type=int, required=True,
        help=
        'Specify the minimum with to reserve for the numeric portion of the IDs. Smaller numbers will be zero-padded.'
    )
    parser.add_argument('-n', '--interval', type=int, required=False, default=1,
                        help='Interval between generated identifiers')
    parser.add_argument(
        '-s', '--starting_id', type=int, required=False, default=0,
        help='Initial numeric portion of IDs to be generated (do not zero-pad)'
    )
    parser.add_argument(
        '-d', '--id_file', type=str, required=False,
        help=
        'Pass a 2-column file of IDs to retain (in case you have mapped genes, for example)'
    )
    parser.add_argument(
        '-m', '--molecule_map', type=str, required=False,
        help=
        'Pass a 2-column file of molecule->token identifiers (see documentation)'
    )
    parser.add_argument(
        '-c', '--custom', type=str, required=False,
        help='For custom parsing steps. Most should ignore this.')
    args = parser.parse_args()

    # check_arguments() is defined outside this block
    check_arguments(args)

    # used to store locus_tags associated with each gene (so children can inherit)
    gene_loci = dict()
    next_id = args.starting_id
    last_molecule = None

    # NOTE(review): parse_mapping_file() is defined outside this block and is
    #  called even when the corresponding option is None -- presumably it
    #  returns an empty mapping in that case; confirm.
    id_mapping = parse_mapping_file(args.id_file)
    mol_mapping = parse_mapping_file(args.molecule_map)

    # every locus_tag handed out so far, used to detect duplicates
    loci_assigned = list()

    ## if using Joana's custom options, check assumptions
    if args.custom == 'joana':
        if args.molecule_map is None or args.id_file is None:
            raise Exception(
                "ERROR: Expected --molecule_map and --id_file options when using --custom=joana"
            )
        else:
            ## need to process the ID map to reformat IDs; values that do not
            #  match the pattern are left as they are
            for id in id_mapping:
                # TP05_0002 -> TpMuguga_05g00002
                m = re.match('TP(\d\d)_(\d+)', id_mapping[id])
                if m:
                    id_mapping[id] = "{0}_{1}g0{2}".format(
                        args.prefix, m.group(1), m.group(2))
    elif args.custom == 'bmicroti':
        # translates the non-digit portion of the mapped ID (expected to be a
        #  roman numeral I-IV) to a two-digit token; any other value is a KeyError
        microti_map = {'I': '01', 'II': '02', 'III': '03', 'IV': '04'}
        if args.molecule_map is None or args.id_file is None:
            raise Exception(
                "ERROR: Expected --molecule_map and --id_file options when using --custom=bmicroti"
            )
        else:
            # unlike the 'joana' branch, a non-matching mapped ID is fatal here
            for id in id_mapping:
                m = re.match('BBM_(\D+)(\d+)', id_mapping[id])
                if m:
                    print("Changing id from {0} to ".format(id))
                    id_mapping[id] = "{0}_{1}g{2}".format(
                        args.prefix, microti_map[m.group(1)], m.group(2))
                    print(id_mapping[id])
                else:
                    raise Exception(
                        "ERROR: id ({0}) didn't match expected convention.".
                        format(id_mapping[id]))

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    # trailing digits of the most recently assigned locus_tag; starts as the
    #  int 0 and later holds the matched digit string (see end of gene branch)
    last_number_portion_assigned = 0

    for line in open(args.input_file):
        line = line.rstrip()
        cols = line.split("\t")

        # pass through anything that isn't a 9-column feature row
        if len(cols) != 9:
            fout.write(line + "\n")
            continue

        # Reset the counter on the first feature row, or when a molecule map is
        #  in use and this row's molecule maps to a different token than the
        #  last recorded molecule.
        # NOTE(review): mol_mapping[cols[0]] is indexed here before the gene
        #  branch's explicit "wasn't found in it" check -- if mol_mapping is a
        #  plain dict an unmapped molecule would fail here first; confirm.
        if last_molecule is None or (
                args.molecule_map is not None
                and mol_mapping[cols[0]] != mol_mapping[last_molecule]):
            print("Found molecule {0}, resetting id counter from {1}".format(
                cols[0], next_id))
            next_id = args.starting_id
            last_molecule = cols[0]

        # grab the ID column if any
        id = gff.column_9_value(cols[8], 'ID')
        parent = gff.column_9_value(cols[8], 'Parent')
        type = cols[2]

        # issue
        #   66F4EEF2E3C863C251F831817FF71233
        #   7F1917E4D81A959078C9A38E15488BC0
        #   E22888670919A4A888572155F40F2654
        #   B9D9CF1F7A8E5A2E1124F0A6C68840DC -> BBM_I00232
        #   gene before is: 6DE6BCCE69CCDC39994A0940B2ED524A - novel
        #   errors on: BmicrotiR1_01g00233 -> BBM_I00233
        #   5800A4110A62E4EAE57AFAD1F8D65CB3 BBM_I00233

        if type == 'gene':
            # keep proposing a locus_tag until one is found that has not been
            #  assigned yet
            # NOTE(review): when the ID comes from id_mapping, or in 'bmicroti'
            #  mode, nothing that feeds locus_id changes between iterations, so
            #  a duplicate appears to loop forever -- possibly the issue noted
            #  above; confirm.
            while True:
                if id in id_mapping:
                    locus_id = id_mapping[id]
                else:
                    if args.molecule_map is None:
                        locus_id = "{0}_{1}".format(
                            args.prefix,
                            str(next_id).zfill(args.padding))
                    else:
                        if cols[0] in mol_mapping:
                            if args.custom == 'bmicroti':
                                # numbered from the previous locus_tag's digits
                                #  rather than from next_id
                                locus_id = "{0}_{2}g{1}".format(
                                    args.prefix,
                                    str(int(last_number_portion_assigned) +
                                        1).zfill(args.padding),
                                    mol_mapping[cols[0]])
                            else:
                                locus_id = "{0}_{2}g{1}".format(
                                    args.prefix,
                                    str(next_id).zfill(args.padding),
                                    mol_mapping[cols[0]])
                        else:
                            raise Exception(
                                "ERROR: --molecule_map passed but {0} wasn't found in it."
                                .format(cols[0]))

                    # NOTE(review): nesting reconstructed from whitespace-collapsed
                    #  source -- placed so that only generated (unmapped) IDs
                    #  advance the counter; confirm against the original file.
                    next_id += args.interval

                cols[8] = gff.set_column_9_value(cols[8], 'locus_tag',
                                                 locus_id)

                ## make sure this wasn't generated already (possibly conflict between --id_file and an
                #  auto-generated ID?
                if locus_id not in loci_assigned:
                    break
                else:
                    print("DEBUG: Duplicate ID assigned ({0}), trying again.".
                          format(locus_id))

            loci_assigned.append(locus_id)
            gene_loci[id] = locus_id

            # remember the trailing digits for the 'bmicroti' numbering above
            m = re.search(r"(\d+)$", locus_id)
            if m:
                last_number_portion_assigned = m.group(1)

        elif type.endswith('RNA'):
            # RNAs inherit the locus_tag of their parent gene, which must have
            #  appeared earlier in the file
            if parent in gene_loci:
                cols[8] = gff.set_column_9_value(cols[8], 'locus_tag',
                                                 gene_loci[parent])
            else:
                raise Exception(
                    "ERROR: found RNA {0} whose parent {1} wasn't found yet".
                    format(id, parent))

        fout.write("\t".join(cols) + "\n")