def build_order(qbed_file, sbed_file):
    """Load the query and subject BED annotation files and their order maps.

    Parameters
    ----------
    qbed_file, sbed_file : str
        Paths to the query and subject BED files.

    Returns
    -------
    tuple
        ``(qbed, sbed, qorder, sorder)`` where ``qorder``/``sorder`` are the
        accession -> (index, feature) maps produced by ``Bed.get_order()``.
    """
    # BUG FIX: the original called print(sys.stderr, "...") which printed the
    # stderr file object itself to *stdout*.  Write the status message to
    # stderr, as the other functions in this file do.
    sys.stderr.write("Read annotation files %s and %s\n" % (qbed_file, sbed_file))
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)
    qorder = qbed.get_order()
    sorder = sbed.get_order()
    return qbed, sbed, qorder, sorder
def main(gff_file, bed_corrected):
    """Rewrite *gff_file* into ``<gff_file>.modified`` with disambiguated accessions.

    For each 'gene' feature whose ``accn`` holds two ';'-separated candidate
    accessions, the accession column of the corresponding BED record (matched
    by a running gene counter) decides which candidate to keep.  The chosen
    accession is remembered so following features can inherit it.

    Parameters
    ----------
    gff_file : str
        Path to the input GFF; output is written to ``gff_file + '.modified'``.
    bed_corrected : str
        Path to the corrected BED file whose records parallel the GFF genes.
    """
    gff = Gff(gff_file)
    bed = Bed(bed_corrected)
    gene_index = 0   # index into `bed`, advanced once per 'gene' feature
    before = ''      # accession most recently resolved for a gene
    # NOTE(review): structure reconstructed from a collapsed one-line source;
    # confirm branch nesting against the original script.
    with open(gff_file + '.modified', 'w') as fp:
        for index in range(len(gff)):          # len()/[] instead of __len__/__getitem__
            item = gff[index]
            if item.accn != 'UNDEF':
                if item.type == 'gene':
                    # Column 4 of the BED line holds the accession field.
                    gene_bed = str(bed[gene_index]).split('\t')[3]
                    gene_list = gene_bed.split('|')
                    paralist = item.accn.split(';')
                    print(gene_bed, paralist)  # progress/debug output (kept from original)
                    if len(gene_list) == 1 and len(paralist) == 2:
                        gene1 = paralist[0].split(',')[0]
                        gene2 = paralist[1].split(',')[0]
                        if gene_bed == gene1:
                            item.accn = paralist[0]
                            before = paralist[0]
                        elif gene_bed == gene2:
                            item.accn = paralist[1]
                            before = paralist[1]
                        else:
                            before = item.accn
                    else:
                        before = item.accn
                    gene_index += 1
                else:
                    # Non-gene features inherit the accession chosen for the
                    # preceding gene.
                    item.accn = before
            else:
                pass  # UNDEF accessions are written through unchanged
            fp.write(str(item) + '\n')
def write_nolocaldups(bed_path, localdups_file, out_name):
    """Write a copy of a BED file with tandem (local) duplicates removed.

    Parameters
    ----------
    bed_path : str
        Path to the input BED file.
    localdups_file : str
        File of DupLine records; every child accession listed there is
        dropped from the output.
    out_name : str
        Path of the tandem-filtered BED file to write.
    """
    bed = Bed(bed_path)
    # Collect duplicate children into a set for O(1) membership tests; the
    # original accumulated a list, making the filter below O(rows * dups).
    children = set()
    for line in open(localdups_file):
        children.update(DupLine(line).children)
    sys.stderr.write("write tandem-filtered bed file {0}\n".format(out_name))
    # `with` guarantees the output handle is closed even on error.
    with open(out_name, "w") as fh:
        for row in bed:
            if row['accn'] in children:
                continue
            fh.write(str(row) + "\n")
def main(qfasta, sfasta, options):
    """Run BLAST commands for anchor neighborhoods between two FASTA files.

    Builds an anchor set (position-based when BED files are absent, otherwise
    gene-based), generates BLAST command groups around each anchor, executes
    them on a multiprocessing pool, and streams the tab-separated hit lines
    to stdout.

    Parameters
    ----------
    qfasta, sfasta : str
        Paths to the query and subject FASTA files.
    options : optparse-style namespace
        Must provide ``qbed``, ``sbed``, ``anchors``, ``dist`` and ``cmd``.
    """
    qfasta = Fasta(qfasta)
    sfasta = Fasta(sfasta)
    if not (options.qbed and options.sbed):
        # No annotations available: anchor by raw coordinates.
        anchors = PositionAnchor(options.anchors)
    else:
        qbed = Bed(options.qbed)
        sbed = Bed(options.sbed)
        anchors = Anchor(options.anchors, qbed, sbed)
    cpus = cpu_count()
    pool = Pool(cpus)
    try:
        for i, command_group in enumerate(
                anchors.gen_cmds(qfasta, sfasta, options.dist, options.cmd)):
            # Coarse progress report every 500 command groups.
            if not (i - 1) % 500:
                sys.stderr.write(
                    "complete: %.5f\n" % (((i - 1.0) * cpus) / len(anchors)))
            for lines in pool.map(run_blast, command_group):
                for line in lines:
                    # Columns 6..9 are numeric coordinates; stringify for join.
                    line[6:10] = map(str, line[6:10])
                    sys.stdout.write("\t".join(line) + "\n")
    finally:
        # BUG FIX: the original never released the pool; reap the workers.
        pool.close()
        pool.join()
def main(blast_file, options):
    """Filter a BLAST tabular file against two BED annotations.

    Pipeline: read/sort BLAST hits by score, keep best hit per gene pair,
    then optionally apply the global-density, tandem (local dups), repeat
    and c-score filters, finally writing a ``.raw`` file of surviving pairs.
    ``options`` must provide: qbed, sbed, global_density_ratio, tandem_Nmax,
    filter_repeats, cscore, strip_names, tandems_only, write_filtered_blast.
    """
    qbed_file, sbed_file = options.qbed, options.sbed
    # is this a self-self blast?
    is_self = (qbed_file == sbed_file)
    if is_self:
        print >>sys.stderr, "... looks like a self-self BLAST to me"
    global_density_ratio = options.global_density_ratio
    tandem_Nmax = options.tandem_Nmax
    filter_repeats = options.filter_repeats
    cscore = options.cscore
    print >>sys.stderr, "read annotation files %s and %s" % (qbed_file, sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)
    qorder = qbed.get_order()
    sorder = sbed.get_order()
    fp = file(blast_file)
    # First pass only counts lines for the log message; rewind afterwards.
    print >>sys.stderr, "read BLAST file %s (total %d lines)" % \
        (blast_file, sum(1 for line in fp))
    fp.seek(0)
    # Drop '#' comment lines before parsing.
    fp2 = []
    for x in fp:
        if x[0] == '#':
            continue
        fp2.append(x)
    # Best-scoring hits first, so the `seen` check below keeps the top hit
    # for each (query, subject) pair.
    blasts = sorted([BlastLine(line) for line in fp2], \
                    key=lambda b: b.score, reverse=True)
    filtered_blasts = []
    seen = set()
    ostrip = options.strip_names
    for b in blasts:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)
        # Skip hits whose genes are absent from the annotations.
        if query not in qorder:
            print >>sys.stderr, "WARNING: %s not in %s" % (query, qbed.filename)
            continue
        if subject not in sorder:
            print >>sys.stderr, "WARNING: %s not in %s" % (subject, sbed.filename)
            continue
        qi, q = qorder[query]
        si, s = sorder[subject]
        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q
        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        # Annotate the hit with positional info for the downstream filters.
        b.query, b.subject = key
        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q['seqid'], s['seqid']
        filtered_blasts.append(b)
    if global_density_ratio:
        print >>sys.stderr, "running the global_density filter" + \
            "(global_density_ratio=%d)..." % options.global_density_ratio
        gene_count = len(qorder) + len(sorder)
        before_filter = len(filtered_blasts)
        filtered_blasts = filter_to_global_density(filtered_blasts, gene_count,
                                                   global_density_ratio)
        print >>sys.stderr, "after filter (%d->%d)..." \
            % (before_filter, len(filtered_blasts))
    if tandem_Nmax:
        # Group tandemly-arrayed genes; each cluster is collapsed onto a
        # single "mother" gene and the duplicates are logged to .localdups.
        print >>sys.stderr, "running the local dups filter (tandem_Nmax=%d)..." % tandem_Nmax
        qtandems = tandem_grouper(qbed, filtered_blasts,
                                  flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed, filtered_blasts,
                                  flip=False, tandem_Nmax=tandem_Nmax)
        qdups_fh = open(op.splitext(qbed_file)[0] + ".localdups", "w")
        if is_self:
            # Merge subject groups into the query grouper; one shared map.
            for s in standems:
                qtandems.join(*s)
            qdups_to_mother = write_localdups(qdups_fh, qtandems, qbed)
            sdups_to_mother = qdups_to_mother
        else:
            qdups_to_mother = write_localdups(qdups_fh, qtandems, qbed)
            sdups_fh = open(op.splitext(sbed_file)[0] + ".localdups", "w")
            sdups_to_mother = write_localdups(sdups_fh, standems, sbed)
        if options.tandems_only:
            # just want to use this script as a tandem finder.
            sys.exit()
        # write out new .bed after tandem removal
        write_new_bed(qbed, qdups_to_mother)
        if not is_self:
            write_new_bed(sbed, sdups_to_mother)
        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts, \
            qdups_to_mother, sdups_to_mother))
        print >>sys.stderr, "after filter (%d->%d)..." % \
            (before_filter, len(filtered_blasts))
        # Re-read the deduplicated .nolocaldups BED files so the order maps
        # used by write_raw reflect the collapsed gene set.
        qnew_name = "%s.nolocaldups%s" % op.splitext(qbed.filename)
        snew_name = "%s.nolocaldups%s" % op.splitext(sbed.filename)
        qbed_new = Bed(qnew_name)
        sbed_new = Bed(snew_name)
        qorder = qbed_new.get_order()
        sorder = sbed_new.get_order()
    if filter_repeats:
        before_filter = len(filtered_blasts)
        print >>sys.stderr, "running the repeat filter",
        filtered_blasts = list(filter_repeat(filtered_blasts))
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter,
                                                          len(filtered_blasts))
    if cscore:
        before_filter = len(filtered_blasts)
        print >>sys.stderr, "running the cscore filter (cscore>=%.2f)..." % cscore
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        print >>sys.stderr, "after filter (%d->%d)..." \
            % (before_filter, len(filtered_blasts))
    # this is the final output we will write to after BLAST filters
    raw_name = "%s.raw" % op.splitext(blast_file)[0]
    raw_fh = open(raw_name, "w")
    write_raw(qorder, sorder, filtered_blasts, raw_fh)
    if options.write_filtered_blast:
        write_new_blast(filtered_blasts)
# NOTE(review): the next five statements appear to be the tail of a plotting
# function whose `def` line lies outside this chunk (they reference `ax`,
# `formatter`, `root` and `image_name` defined there) — confirm against the
# full file before moving them.
ax.yaxis.set_major_formatter(formatter)
plt.setp(ax.get_xticklabels() + ax.get_yticklabels(), color='gray', size=10)
root.set_axis_off()
print >> sys.stderr, "print image to %s" % image_name
plt.savefig(image_name, dpi=600)


# Script entry point: parse --qbed/--sbed plus one .qa file argument and
# render the dotplot next to the input as a .png.
if __name__ == "__main__":
    import optparse
    parser = optparse.OptionParser(__doc__)
    parser.add_option("--qbed", dest="qbed", help="path to qbed")
    parser.add_option("--sbed", dest="sbed", help="path to sbed")
    (options, args) = parser.parse_args()
    # Both BED paths and exactly one positional argument are required.
    if not (len(args) == 1 and options.qbed and options.sbed):
        sys.exit(parser.print_help())
    qbed = Bed(options.qbed)
    sbed = Bed(options.sbed)
    qa_file = args[0]
    # Output image shares the input's basename.
    image_name = op.splitext(qa_file)[0] + ".png"
    dotplot(qa_file, qbed, sbed, image_name)
def __init__(self, bed):
    """Parse the BED file at *bed* and cache its accession-order lookup.

    Parameters
    ----------
    bed : str
        Path to a BED annotation file.
    """
    parsed = Bed(bed)
    self.bed = parsed
    self.order = parsed.get_order()
def dotplot(ax, anchors, qbed, sbed, topn, axes):
    # modified from Haibao Tang's original function
    '''function to plot merged topn anchors.

    ax      : matplotlib axes to draw into (normalized 0..1 coordinates)
    anchors : iterable of anchor records; indexes 0, 1 and 4 are used here
              (query id, subject chromosome label, subject coordinate)
    qbed, sbed : paths/handles accepted by Bed() for query/subject annotations
    topn    : keep at most this many anchors per query id
    axes    : pair of axis-label strings (x, y)
    '''
    # Wrap text in TeX roman font, escaping spaces for mathtext.
    _ = lambda x: r"$\rm{%s}$" % x.replace(" ", r"\ ")
    get_order = lambda bed: dict(
        (f['accn'], (i, f)) for (i, f) in enumerate(bed))
    get_len = lambda bed: sum([f.end for (i, f) in enumerate(bed)])
    qbed = Bed(qbed)
    sbed = Bed(sbed)
    # x axis is gene rank in qbed; y axis is cumulative coordinate in sbed.
    xmax, ymax = len(qbed), get_len(sbed)
    print xmax, ymax
    qorder = get_order(qbed)
    chr_len = [f.end for (i, f) in enumerate(sbed)]
    # get topn hits
    data = []
    cur_q = ""
    for anchor in anchors:
        # Reset the per-query counter whenever the query id changes.
        if anchor[0] != cur_q:
            n = 1
        if n > topn:
            continue
        try:
            # Strip a trailing ".suffix" from the query id before lookup.
            qgene = qorder[anchor[0].split('.')[0]]
        except:
            continue  # query not present in the annotation; skip
        if 'r' in anchor[0]:
            continue
        # Offset the subject coordinate by the lengths of all preceding
        # chromosomes; anchor[1] looks like 'chr07' -> index 7.
        anchor[4] += sum(chr_len[0:(int(anchor[1].lstrip('chr0')) - 1)])
        if qgene[0] < xmax and anchor[4] < ymax:
            data.append((qgene[0], anchor[4]))
        cur_q = anchor[0]
        n += 1
    print 'data length: ', len(data)
    # Normalize both coordinates into [0, 1] for plotting.
    x, y = zip(*data)
    x, y = np.array(x, 'f') / xmax, np.array(y, 'f') / ymax
    ax.scatter(x, y, c='b', s=.5, lw=0, alpha=.8)
    ax.get_xaxis().set_ticks([])
    ax.get_yaxis().set_ticks([])
    xchr_labels, ychr_labels = [], []
    cbreaks = {}
    # plot the chromosome breaks
    for (seqid, beg, end) in get_breaks(qbed):
        if "random" in seqid:
            continue
        cbreaks[("query", seqid)] = (beg, end)
        xchr_labels.append((seqid, (beg + end) / 2))
        # Vertical break line at each chromosome start.
        x, y = np.array([beg, beg], 'f') / xmax, [0, 1]
        ax.plot(x, y, "-", color='y', alpha=.8, zorder=10)
    # Right-hand border strip.
    ax.add_patch(
        Rectangle((.998, 0), .002, 1, lw=.2, color='y',
                  fc='y', fill=True, alpha=.8, zorder=10))
    get_breaks_subject = lambda bed: [[f.accn, f.start, f.end]
                                      for (i, f) in enumerate(bed)]
    # Accumulate subject coordinates into one running axis.
    chr_cum = 0
    for items in get_breaks_subject(sbed):
        seqid, beg, end = items
        beg += chr_cum
        end += chr_cum
        if "random" in seqid:
            continue
        cbreaks[("subject", seqid)] = (beg, end)
        ychr_labels.append((seqid, (beg + end) / 2))
        # Horizontal break line at each chromosome start.
        x, y = [0, 1], np.array([beg, beg], 'f') / ymax
        ax.plot(x, y, "-", color='y', alpha=.8, zorder=10)
        chr_cum = end
    # Top border strip.
    ax.add_patch(
        Rectangle((0, 1), 1, .002, lw=.2, color='y',
                  fc='y', fill=True, alpha=.8, zorder=10))
    # plot the chromosome labels
    for label, pos in xchr_labels:
        x, y = pos * 1. / xmax - .015, 1.02
        # Drop leading-zero padding from chromosome names, e.g. '07' -> '7'.
        if label[0] == "0":
            label = label.replace("0", "")
        ax.text(x, y, _("%s" % label))
    for label, pos in ychr_labels:
        x, y = -.065, pos * 1. / ymax - .0065
        if label[0] == "0":
            label = label.replace("0", "")
        ax.text(x, y, _("%s" % label))
    # plot axis labels
    ax.text(.5, 1.06, _("%s" % axes[0]))
    ax.text(-.1, .5, _("%s" % axes[1]), rotation="vertical")
def main(anchor_file, blast_file, options):
    """Lift additional BLAST pairs that lie near existing anchors.

    Reads an anchor file and a BLAST tabular file, keeps the best hit per
    (query, subject) gene pair, then for each chromosome pair uses a KD-tree
    over the anchor positions to find BLAST hits within ``options.dist``
    (Manhattan distance) of an anchor; those are printed to stdout as
    "query<TAB>subject<TAB>lifted".
    """
    qbed_file, sbed_file = options.qbed, options.sbed
    # is this a self-self blast?
    is_self = (qbed_file == sbed_file)
    if is_self:
        print >> sys.stderr, "... looks like a self-self BLAST to me"
    print >> sys.stderr, "read annotation files %s and %s" % (qbed_file,
                                                              sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)
    qorder = qbed.get_order()
    sorder = sbed.get_order()
    # Strip a trailing ".suffix" from gene ids before annotation lookup.
    _ = lambda x: x.rsplit(".", 1)[0]
    fp = file(blast_file)
    # First pass only counts lines for the log message; rewind afterwards.
    print >>sys.stderr, "read BLAST file %s (total %d lines)" % \
        (blast_file, sum(1 for line in fp))
    fp.seek(0)
    # Best-scoring hits first so the `seen` check keeps the top hit per pair.
    blasts = sorted([BlastLine(line) for line in fp], \
                    key=lambda b: b.score, reverse=True)
    filtered_blasts = []
    seen = set()
    for b in blasts:
        query, subject = _(b.query), _(b.subject)
        if query not in qorder or subject not in sorder:
            continue
        qi, q = qorder[query]
        si, s = sorder[subject]
        if is_self and qi > si:
            # remove redundant a<->b to one side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q
        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        # Annotate the hit with positional info used below.
        b.query, b.subject = key
        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q.seqid, s.seqid
        filtered_blasts.append(b)
    # Anchor positions bucketed by (query seqid, subject seqid).
    all_anchors = collections.defaultdict(list)
    fp = file(anchor_file)
    for row in fp:
        if row[0] == '#':
            continue
        a, b = row.split()
        if a not in qorder or b not in sorder:
            continue
        qi, q = qorder[a]
        si, s = sorder[b]
        all_anchors[(q.seqid, s.seqid)].append((qi, si))
    # grouping the hits based on chromosome pair for sending in find_nearby
    all_hits = collections.defaultdict(list)
    for b in filtered_blasts:
        all_hits[(b.qseqid, b.sseqid)].append((b.qi, b.si))
    # select hits that are close to the anchor list
    j = 0
    fw = sys.stdout
    for chr_pair in sorted(all_hits.keys()):
        hits = np.array(all_hits[chr_pair])
        anchors = np.array(all_anchors[chr_pair])
        print >> sys.stderr, chr_pair, len(anchors)
        if len(anchors) == 0:
            continue
        tree = cKDTree(anchors, leafsize=16)
        #print tree.data
        # p=1 -> Manhattan distance; hits farther than options.dist from any
        # anchor come back with index == tree.n ("no neighbour found").
        dists, idxs = tree.query(hits, p=1,
                                 distance_upper_bound=options.dist)
        #print [(d, idx) for (d, idx) in zip(dists, idxs) if idx!=tree.n]
        for i, (dd, idx) in enumerate(zip(dists, idxs)):
            if dd == 0:
                continue  # same anchors
            if idx != tree.n:
                qi, si = hits[i]
                query, subject = qbed[qi]["accn"], sbed[si]["accn"]
                print >> fw, "\t".join((query, subject, "lifted"))
                j += 1
    print >> sys.stderr, j, "new pairs found"