def get_loci(transcripts_genepred):
    """Group transcripts from a genePred file into overlapping loci.

    Each transcript's tx span becomes a single-member locus; Loci then merges
    overlapping ones. Returns [locus2name, name2locus]: a dict from 1-based
    locus number to the set of transcript names at that locus, and a dict from
    transcript name back to its locus number.
    """
    loci = Loci()
    loci.verbose = True
    with open(transcripts_genepred) as inf:
        for line in inf:
            if line[0] == '#':
                continue
            gpd = GenePredEntry(line.rstrip())
            span = Bed(gpd.value('chrom'), gpd.value('txStart'), gpd.value('txEnd'))
            span.set_payload(gpd.value('name'))
            single = Locus()
            single.add_member(span)
            loci.add_locus(single)
    sys.stderr.write("Organizing genepred data into overlapping loci\n")
    sys.stderr.write("Started with " + str(len(loci.loci)) + " loci\n")
    loci.update_loci()
    sys.stderr.write("Ended with " + str(len(loci.loci)) + " loci\n")
    locus2name = {}
    name2locus = {}
    # Number loci from 1 and record both directions of the mapping.
    for index, locus in enumerate(loci.loci, 1):
        for member in locus.members:
            transcript = member.get_payload()
            if index not in locus2name:
                locus2name[index] = set()
            locus2name[index].add(transcript)
            name2locus[transcript] = index
    return [locus2name, name2locus]
def get_loci(transcripts_genepred):
    """Cluster the transcripts of a genePred file into overlapping loci.

    Returns a two-element list: a dict mapping 1-based locus number to the
    set of transcript names in that locus, and a dict mapping each transcript
    name back to its locus number.
    """
    loci = Loci()
    loci.verbose = True
    with open(transcripts_genepred) as handle:
        for raw in handle:
            if raw[0] == '#':
                continue
            entry = GenePredEntry(raw.rstrip())
            bed = Bed(entry.value('chrom'), entry.value('txStart'), entry.value('txEnd'))
            bed.set_payload(entry.value('name'))
            solo = Locus()
            solo.add_member(bed)
            loci.add_locus(solo)
    sys.stderr.write("Organizing genepred data into overlapping loci\n")
    sys.stderr.write("Started with " + str(len(loci.loci)) + " loci\n")
    loci.update_loci()
    sys.stderr.write("Ended with " + str(len(loci.loci)) + " loci\n")
    locus2name = {}
    name2locus = {}
    counter = 0
    for locus in loci.loci:
        counter += 1
        for member in locus.members:
            label = member.get_payload()
            # setdefault keeps the grouping logic to a single line per member
            locus2name.setdefault(counter, set()).add(label)
            name2locus[label] = counter
    return [locus2name, name2locus]
def process_read(mpa, args):
    """Select the best alignment path (single or multi-part) for one read.

    Filters mpa.entries by args.minimum_alignment_coverage, builds the
    compatibility graph, scores every root path by quality-weighted coverage,
    and optionally falls back to the best single alignment when the multipath
    score does not beat it by args.multipath_score_improvement.

    Returns [report_line, alignment_lines] for the winning path, or None when
    no alignment survives the coverage filter.
    """
    # Filter entries by a minimum alignment coverage
    mpa.entries = [e for e in mpa.entries
                   if e.get_coverage() > args.minimum_alignment_coverage]
    # Find the best single alignment by quality-weighted coverage
    bestsingle = None
    bestsinglescore = -1
    for i in range(len(mpa.entries)):
        weightedcov = float(mpa.entries[i].get_coverage()) * float(
            mpa.entries[i].get_quality())
        if weightedcov > bestsinglescore:
            bestsinglescore = weightedcov
            bestsingle = i
    if bestsinglescore == -1:
        sys.stderr.write("failed to find a single path\n")
        return None
    my_max_intron = args.maximum_intron
    if args.fusion:
        my_max_intron = -1  # we can look any distance for a group
    mpa.compatible_graph(
        max_intron=my_max_intron,
        max_query_overlap=args.maximum_query_overlap,
        max_gap=args.maximum_query_gap,
        max_target_overlap=args.maximum_target_overlap,
        max_query_fraction_overlap=args.maximum_query_fraction_overlap)
    ps = mpa.get_root_paths()
    # Score every candidate path; start from the best single alignment.
    bestpath = [bestsingle]
    bestscore = 0
    besttotalcov = 0
    best_path_index = -1
    for zz, path in enumerate(ps):
        totalcov = sum([mpa.entries[i].get_coverage() for i in path])
        weightedcov = sum([
            float(mpa.entries[i].get_coverage()) * float(mpa.entries[i].get_quality())
            for i in path
        ])
        if weightedcov > bestscore:
            bestscore = weightedcov
            bestpath = path
            besttotalcov = totalcov
            best_path_index = zz
    # Summarize how much the losing paths overlap the winner (query/target).
    otherpaths = [ps[i] for i in range(len(ps)) if i != best_path_index]
    query_target_coverages = []
    for other_path in otherpaths:
        qcov = 0
        tcov = 0
        for other_entry in [mpa.entries[i] for i in other_path]:
            for entry in [mpa.entries[j] for j in bestpath]:
                qcov += other_entry.query_overlap_size(entry)
                tcov += other_entry.target_overlap_size(entry)
        query_target_coverages.append(str(qcov) + '/' + str(tcov))
    gapsizes = []
    if len(bestpath) > 1:
        gapsizes = [
            mpa.entries[bestpath[j + 1]].get_query_bed().start -
            mpa.entries[bestpath[j]].get_query_bed().end - 1
            for j in range(len(bestpath) - 1)
        ]
    # See if we should use the single path score instead.
    # BUGFIX: test the chosen bestpath, not the loop variable 'path' — 'path'
    # held whichever path was iterated last (and was undefined when ps was
    # empty, raising NameError).
    if len(bestpath) > 1 and bestsinglescore * (
            1 + args.multipath_score_improvement) > bestscore:
        bestpath = [bestsingle]
        besttotalcov = mpa.entries[bestsingle].get_coverage()
        bestscore = bestsinglescore
        gapsizes = []  # BUGFIX: a single-member path has no gaps to report
    query_span = mpa.entries[bestpath[0]].get_query_bed()
    # Count distinct target loci covered by the chosen path.
    loci = Loci()
    loci.set_use_direction(True)
    loci.set_minimum_distance(args.maximum_intron)
    for i in bestpath:
        r = mpa.entries[i].get_target_bed()
        locus = Locus()
        locus.set_use_direction(True)
        locus.add_member(r)
        loci.add_locus(locus)
    loci.update_loci()
    if len(bestpath) > 1:
        for i in bestpath[1:]:
            query_span = mpa.entries[i].get_query_bed().merge(query_span)
    # Tab-delimited report line (keeps the original trailing tab).
    fields = [
        mpa.entries[bestpath[0]].value('qName'),
        str(len(bestpath)),
        str(len(loci.loci)),
        query_span.get_range_string(),
        ','.join([mpa.entries[i].value('strand') for i in bestpath]),
        ','.join([mpa.entries[i].get_query_bed().get_range_string() for i in bestpath]),
        ','.join([mpa.entries[i].get_target_bed().get_range_string() for i in bestpath]),
        ','.join([str(mpa.entries[i].get_quality()) for i in bestpath]),
        ','.join([str(mpa.entries[i].get_coverage()) for i in bestpath]),
        ','.join([str(x) for x in gapsizes]),
        str(besttotalcov),
        str(bestscore),
        str(bestsinglescore),
        ','.join(query_target_coverages),
    ]
    report = "\t".join(fields) + "\t"
    return [report, [mpa.entries[i].get_line() for i in bestpath]]
def main():
    """Rename redundant gene and transcript names in a GenePred file.

    Three passes: (1) drop positional duplicates keeping the entry from the
    largest gene family, (2) uniquify duplicated transcript names, (3) split
    a gene name across loci that are farther apart than
    --minimum_locus_distance. Report files are written alongside --output.
    """
    parser = argparse.ArgumentParser(
        description=
        "Rename gene and transcript elements of GenePred file that are redundant. Please specify an output if you would like report files generated for the filters."
    )
    parser.add_argument('input', help="GENEPREDFILE or '-' for STDIN")
    parser.add_argument(
        '-o', '--output',
        help=
        "OUTPUT FILE default is STDOUT, but you need to specify an output file to get report files generated"
    )
    parser.add_argument(
        '--minimum_locus_distance', type=int, default=500000,
        help="Genes with the same name will be renamed if this far apart")
    parser.add_argument(
        '--keep_positional_duplicates', action='store_true',
        help="By default we remove one of the duplicate entries")
    parser.add_argument(
        '--keep_transcript_names', action='store_true',
        help="By default we rename duplicated transcript names")
    parser.add_argument(
        '--keep_gene_names', action='store_true',
        help="By default we rename genes located at different loci.")
    args = parser.parse_args()
    inf = sys.stdin
    if args.input != '-':
        inf = open(args.input)
    of = sys.stdout
    if args.output:
        of = open(args.output, 'w')
    # Catalog transcripts by exact exon structure, and gene families by name.
    txdef = {}
    gfams = {}
    for line in inf:
        if line[0] == '#':
            continue
        g = GenePredEntry(line)
        loc = g.value('chrom') + ':' + ','.join(
            [str(x) for x in g.value('exonStarts')]) + '-' + ','.join(
                [str(x) for x in g.value('exonEnds')]) + '/' + g.value('strand')
        if loc not in txdef:
            txdef[loc] = []
        txdef[loc].append(g)
        if g.value('gene_name') not in gfams:
            gfams[g.value('gene_name')] = []
        gfams[g.value('gene_name')].append(g.value('name'))
    # now we have cataloged all transcripts by unique locations
    omissions = []
    keepers = []
    for loc in sorted(txdef.keys()):
        if args.keep_positional_duplicates:  # We don't want to omit anything here
            for g in txdef[loc]:
                keepers.append(g)
            continue  # basically skipping this part by populating keepers
        num = len(txdef[loc])
        if num > 1:
            sys.stderr.write("Found " + str(num) + " entries at location\n")
            sys.stderr.write(loc + "\n")
            sys.stderr.write("They are:\n")
            # Keep the duplicate whose gene has the most transcripts.
            largest = 0
            keepgene = None
            keepindex = -1
            i = 0
            for e in txdef[loc]:
                famsize = len(gfams[e.value('gene_name')])
                sys.stderr.write(" " + e.value('gene_name') + "\t" +
                                 e.value('name') + "\t" + str(famsize) + "\n")
                if famsize > largest:
                    keepgene = e
                    largest = famsize
                    keepindex = i
                i += 1
            for j in range(0, len(txdef[loc])):
                if j != keepindex:
                    omissions.append(txdef[loc][j])
                else:
                    keepers.append(txdef[loc][j])
            sys.stderr.write(" Biggest gene family is " +
                             keepgene.value('gene_name') + " with " +
                             str(largest) + " transcripts\n")
            sys.stderr.write(" so keep that one.\n")
        else:
            keepers.append(txdef[loc][0])
    sys.stderr.write("Omitting " + str(len(omissions)) +
                     " entries for redundant positions\n")
    if args.output and not args.keep_positional_duplicates:
        of1 = open(args.output + '.positional_duplicate_omissions', 'w')
        for g in omissions:
            of1.write(g.get_line() + "\n")
        of1.close()
    # Now the keepers contains transcripts with unique locations
    # Lets provide unique names to remaining transcripts
    tnames = {}
    renametx = {}
    for g in keepers:
        tx = g.value('name')
        if tx not in tnames:
            tnames[tx] = []
        tnames[tx].append(g)
    for name in tnames:
        if args.keep_transcript_names:
            continue  # We don't want to rename them
        nsize = len(tnames[name])
        if nsize > 1:
            sys.stderr.write("Name: " + name + " has a family of size " +
                             str(nsize) + "\n")
            for i in range(0, len(tnames[name])):
                newname = name + '[' + str(i + 1) + '/' + str(nsize) + ']'
                renametx[newname] = name
                tnames[name][i].entry['name'] = newname
    sys.stderr.write("Renamed: " + str(len(renametx)) + " transcripts\n")
    if args.output and not args.keep_transcript_names:
        of1 = open(args.output + '.renamed_transcripts', 'w')
        for name in sorted(renametx.keys()):
            of1.write(name + "\t" + renametx[name] + "\n")
        of1.close()
    # now we need to arrange into gene families
    gnames = {}
    for name in tnames:
        for g in tnames[name]:
            gene = g.value('gene_name')
            if gene not in gnames:
                gnames[gene] = []
            gnames[gene].append(g)
    renamegene = {}
    finished = []
    for gene in gnames:
        if args.keep_gene_names:
            for g in gnames[gene]:
                finished.append(g)
            continue  # We don't want to rename genes
        if len(gnames[gene]) == 1:
            finished.append(gnames[gene][0])
            continue
        # Now we need to make sure these genes are really on the same locus.
        loci = Loci()
        loci.set_minimum_distance(args.minimum_locus_distance)
        for g in gnames[gene]:
            r = g.locus_range.copy()
            r.set_payload(g)
            loc = Locus()
            loc.add_member(r)
            loci.add_locus(loc)
        loci.update_loci()
        lcount = len(loci.loci)
        if lcount == 1:
            for g in gnames[gene]:
                finished.append(g)
            continue
        # need to rename some genes
        for i in range(0, lcount):
            newname = gene + '[' + str(i + 1) + '/' + str(lcount) + ']'
            rstr = loci.loci[i].range.get_range_string()
            renamegene[newname] = gene
            sys.stderr.write(newname + "\t" + rstr + "\n")
            for m in loci.loci[i].members:
                m.get_payload().entry['gene_name'] = newname
                finished.append(m.get_payload())
    sys.stderr.write("Renamed: " + str(len(renamegene)) + " genes\n")
    # BUGFIX: the renamed-genes report was gated on --keep_transcript_names
    # (copy-paste from the transcript section); gate on --keep_gene_names.
    if args.output and not args.keep_gene_names:
        of1 = open(args.output + '.renamed_genes', 'w')
        for name in sorted(renamegene.keys()):
            of1.write(name + "\t" + renamegene[name] + "\n")
        of1.close()
    # Now lets resort by genes
    bygene = {}
    for g in finished:
        gene = g.value('gene_name')
        if gene not in bygene:
            bygene[gene] = []
        bygene[gene].append(g)
    for gene in sorted(bygene.keys()):
        for g in bygene[gene]:
            of.write(g.get_line() + "\n")
    # Only close handles we opened ourselves; never close stdin/stdout.
    if args.output:
        of.close()
    if args.input != '-':
        inf.close()
def process_read(mpa, args):
    """Select the best alignment path (single or multi-part) for one read.

    Filters mpa.entries by args.minimum_alignment_coverage, builds the
    compatibility graph, scores every root path by quality-weighted coverage,
    and optionally falls back to the best single alignment when the multipath
    score does not beat it by args.multipath_score_improvement.

    Returns [report_line, alignment_lines] for the winning path, or None when
    no alignment survives the coverage filter.
    """
    # Filter entries by a minimum alignment coverage
    mpa.entries = [e for e in mpa.entries
                   if e.get_coverage() > args.minimum_alignment_coverage]
    # Find the best single alignment by quality-weighted coverage
    bestsingle = None
    bestsinglescore = -1
    for i in range(len(mpa.entries)):
        weightedcov = float(mpa.entries[i].get_coverage()) * float(
            mpa.entries[i].get_quality())
        if weightedcov > bestsinglescore:
            bestsinglescore = weightedcov
            bestsingle = i
    if bestsinglescore == -1:
        sys.stderr.write("failed to find a single path\n")
        return None
    my_max_intron = args.maximum_intron
    if args.fusion:
        my_max_intron = -1  # we can look any distance for a group
    mpa.compatible_graph(
        max_intron=my_max_intron,
        max_query_overlap=args.maximum_query_overlap,
        max_gap=args.maximum_query_gap,
        max_target_overlap=args.maximum_target_overlap,
        max_query_fraction_overlap=args.maximum_query_fraction_overlap)
    ps = mpa.get_root_paths()
    # Score every candidate path; start from the best single alignment.
    bestpath = [bestsingle]
    bestscore = 0
    besttotalcov = 0
    best_path_index = -1
    for zz, path in enumerate(ps):
        totalcov = sum([mpa.entries[i].get_coverage() for i in path])
        weightedcov = sum([
            float(mpa.entries[i].get_coverage()) * float(mpa.entries[i].get_quality())
            for i in path
        ])
        if weightedcov > bestscore:
            bestscore = weightedcov
            bestpath = path
            besttotalcov = totalcov
            best_path_index = zz
    # Summarize how much the losing paths overlap the winner (query/target).
    otherpaths = [ps[i] for i in range(len(ps)) if i != best_path_index]
    query_target_coverages = []
    for other_path in otherpaths:
        qcov = 0
        tcov = 0
        for other_entry in [mpa.entries[i] for i in other_path]:
            for entry in [mpa.entries[j] for j in bestpath]:
                qcov += other_entry.query_overlap_size(entry)
                tcov += other_entry.target_overlap_size(entry)
        query_target_coverages.append(str(qcov) + '/' + str(tcov))
    gapsizes = []
    if len(bestpath) > 1:
        gapsizes = [
            mpa.entries[bestpath[j + 1]].get_query_bed().start -
            mpa.entries[bestpath[j]].get_query_bed().end - 1
            for j in range(len(bestpath) - 1)
        ]
    # See if we should use the single path score instead.
    # BUGFIX: test the chosen bestpath, not the loop variable 'path' — 'path'
    # held whichever path was iterated last (and was undefined when ps was
    # empty, raising NameError).
    if len(bestpath) > 1 and bestsinglescore * (
            1 + args.multipath_score_improvement) > bestscore:
        bestpath = [bestsingle]
        besttotalcov = mpa.entries[bestsingle].get_coverage()
        bestscore = bestsinglescore
        gapsizes = []  # BUGFIX: a single-member path has no gaps to report
    query_span = mpa.entries[bestpath[0]].get_query_bed()
    # Count distinct target loci covered by the chosen path.
    loci = Loci()
    loci.set_use_direction(True)
    loci.set_minimum_distance(args.maximum_intron)
    for i in bestpath:
        r = mpa.entries[i].get_target_bed()
        locus = Locus()
        locus.set_use_direction(True)
        locus.add_member(r)
        loci.add_locus(locus)
    loci.update_loci()
    if len(bestpath) > 1:
        for i in bestpath[1:]:
            query_span = mpa.entries[i].get_query_bed().merge(query_span)
    # Tab-delimited report line (keeps the original trailing tab).
    fields = [
        mpa.entries[bestpath[0]].value('qName'),
        str(len(bestpath)),
        str(len(loci.loci)),
        query_span.get_range_string(),
        ','.join([mpa.entries[i].value('strand') for i in bestpath]),
        ','.join([mpa.entries[i].get_query_bed().get_range_string() for i in bestpath]),
        ','.join([mpa.entries[i].get_target_bed().get_range_string() for i in bestpath]),
        ','.join([str(mpa.entries[i].get_quality()) for i in bestpath]),
        ','.join([str(mpa.entries[i].get_coverage()) for i in bestpath]),
        ','.join([str(x) for x in gapsizes]),
        str(besttotalcov),
        str(bestscore),
        str(bestsinglescore),
        ','.join(query_target_coverages),
    ]
    report = "\t".join(fields) + "\t"
    return [report, [mpa.entries[i].get_line() for i in bestpath]]
def main():
    """Rename redundant gene and transcript names in a GenePred file.

    Three passes: (1) drop positional duplicates keeping the entry from the
    largest gene family, (2) uniquify duplicated transcript names, (3) split
    a gene name across loci that are farther apart than
    --minimum_locus_distance. Report files are written alongside --output.
    """
    parser = argparse.ArgumentParser(
        description=
        "Rename gene and transcript elements of GenePred file that are redundant. Please specify an output if you would like report files generated for the filters."
    )
    parser.add_argument('input', help="GENEPREDFILE or '-' for STDIN")
    parser.add_argument(
        '-o', '--output',
        help=
        "OUTPUT FILE default is STDOUT, but you need to specify an output file to get report files generated"
    )
    parser.add_argument(
        '--minimum_locus_distance', type=int, default=500000,
        help="Genes with the same name will be renamed if this far apart")
    parser.add_argument(
        '--keep_positional_duplicates', action='store_true',
        help="By default we remove one of the duplicate entries")
    parser.add_argument(
        '--keep_transcript_names', action='store_true',
        help="By default we rename duplicated transcript names")
    parser.add_argument(
        '--keep_gene_names', action='store_true',
        help="By default we rename genes located at different loci.")
    args = parser.parse_args()
    inf = sys.stdin
    if args.input != '-':
        inf = open(args.input)
    of = sys.stdout
    if args.output:
        of = open(args.output, 'w')
    # Catalog transcripts by exact exon structure, and gene families by name.
    txdef = {}
    gfams = {}
    for line in inf:
        if line[0] == '#':
            continue
        g = GenePredEntry(line)
        loc = g.value('chrom') + ':' + ','.join(
            [str(x) for x in g.value('exonStarts')]) + '-' + ','.join(
                [str(x) for x in g.value('exonEnds')]) + '/' + g.value('strand')
        if loc not in txdef:
            txdef[loc] = []
        txdef[loc].append(g)
        if g.value('gene_name') not in gfams:
            gfams[g.value('gene_name')] = []
        gfams[g.value('gene_name')].append(g.value('name'))
    # now we have cataloged all transcripts by unique locations
    omissions = []
    keepers = []
    for loc in sorted(txdef.keys()):
        if args.keep_positional_duplicates:  # We don't want to omit anything here
            for g in txdef[loc]:
                keepers.append(g)
            continue  # basically skipping this part by populating keepers
        num = len(txdef[loc])
        if num > 1:
            sys.stderr.write("Found " + str(num) + " entries at location\n")
            sys.stderr.write(loc + "\n")
            sys.stderr.write("They are:\n")
            # Keep the duplicate whose gene has the most transcripts.
            largest = 0
            keepgene = None
            keepindex = -1
            i = 0
            for e in txdef[loc]:
                famsize = len(gfams[e.value('gene_name')])
                sys.stderr.write(" " + e.value('gene_name') + "\t" +
                                 e.value('name') + "\t" + str(famsize) + "\n")
                if famsize > largest:
                    keepgene = e
                    largest = famsize
                    keepindex = i
                i += 1
            for j in range(0, len(txdef[loc])):
                if j != keepindex:
                    omissions.append(txdef[loc][j])
                else:
                    keepers.append(txdef[loc][j])
            sys.stderr.write(" Biggest gene family is " +
                             keepgene.value('gene_name') + " with " +
                             str(largest) + " transcripts\n")
            sys.stderr.write(" so keep that one.\n")
        else:
            keepers.append(txdef[loc][0])
    sys.stderr.write("Omitting " + str(len(omissions)) +
                     " entries for redundant positions\n")
    if args.output and not args.keep_positional_duplicates:
        of1 = open(args.output + '.positional_duplicate_omissions', 'w')
        for g in omissions:
            of1.write(g.get_line() + "\n")
        of1.close()
    # Now the keepers contains transcripts with unique locations
    # Lets provide unique names to remaining transcripts
    tnames = {}
    renametx = {}
    for g in keepers:
        tx = g.value('name')
        if tx not in tnames:
            tnames[tx] = []
        tnames[tx].append(g)
    for name in tnames:
        if args.keep_transcript_names:
            continue  # We don't want to rename them
        nsize = len(tnames[name])
        if nsize > 1:
            sys.stderr.write("Name: " + name + " has a family of size " +
                             str(nsize) + "\n")
            for i in range(0, len(tnames[name])):
                newname = name + '[' + str(i + 1) + '/' + str(nsize) + ']'
                renametx[newname] = name
                tnames[name][i].entry['name'] = newname
    sys.stderr.write("Renamed: " + str(len(renametx)) + " transcripts\n")
    if args.output and not args.keep_transcript_names:
        of1 = open(args.output + '.renamed_transcripts', 'w')
        for name in sorted(renametx.keys()):
            of1.write(name + "\t" + renametx[name] + "\n")
        of1.close()
    # now we need to arrange into gene families
    gnames = {}
    for name in tnames:
        for g in tnames[name]:
            gene = g.value('gene_name')
            if gene not in gnames:
                gnames[gene] = []
            gnames[gene].append(g)
    renamegene = {}
    finished = []
    for gene in gnames:
        if args.keep_gene_names:
            for g in gnames[gene]:
                finished.append(g)
            continue  # We don't want to rename genes
        if len(gnames[gene]) == 1:
            finished.append(gnames[gene][0])
            continue
        # Now we need to make sure these genes are really on the same locus.
        loci = Loci()
        loci.set_minimum_distance(args.minimum_locus_distance)
        for g in gnames[gene]:
            r = g.locus_range.copy()
            r.set_payload(g)
            loc = Locus()
            loc.add_member(r)
            loci.add_locus(loc)
        loci.update_loci()
        lcount = len(loci.loci)
        if lcount == 1:
            for g in gnames[gene]:
                finished.append(g)
            continue
        # need to rename some genes
        for i in range(0, lcount):
            newname = gene + '[' + str(i + 1) + '/' + str(lcount) + ']'
            rstr = loci.loci[i].range.get_range_string()
            renamegene[newname] = gene
            sys.stderr.write(newname + "\t" + rstr + "\n")
            for m in loci.loci[i].members:
                m.get_payload().entry['gene_name'] = newname
                finished.append(m.get_payload())
    sys.stderr.write("Renamed: " + str(len(renamegene)) + " genes\n")
    # BUGFIX: the renamed-genes report was gated on --keep_transcript_names
    # (copy-paste from the transcript section); gate on --keep_gene_names.
    if args.output and not args.keep_gene_names:
        of1 = open(args.output + '.renamed_genes', 'w')
        for name in sorted(renamegene.keys()):
            of1.write(name + "\t" + renamegene[name] + "\n")
        of1.close()
    # Now lets resort by genes
    bygene = {}
    for g in finished:
        gene = g.value('gene_name')
        if gene not in bygene:
            bygene[gene] = []
        bygene[gene].append(g)
    for gene in sorted(bygene.keys()):
        for g in bygene[gene]:
            of.write(g.get_line() + "\n")
    # Only close handles we opened ourselves; never close stdin/stdout.
    if args.output:
        of.close()
    if args.input != '-':
        inf.close()