def main():
    p = argparse.ArgumentParser(description=__doc__,
                                formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument("-p", dest="pvals", help="BED containing all the p values"
                   " used to generate `regions`")
    p.add_argument("-r", dest="regions", help="BED containing all the regions")
    p.add_argument("-s", "--step", dest="step", type=int, default=50,
                   help="step size for acf calculation. should be the same"
                   " value as the step sent to -d arg for acf")
    p.add_argument("-c", dest="c", help="column number containing the p-value"
                   " of interest", type=str, default=-1)
    p.add_argument("-z", dest="z", help="use z-score correction",
                   action="store_true")
    args = p.parse_args()
    if not (args.regions and args.pvals):
        import sys
        sys.exit(not p.print_help())

    header = ts.nopen(args.regions).next()
    if header.startswith("#") or (not header.split("\t")[2].isdigit()):
        print "%s\tslk_p\tslk_sidak_p" % (header.rstrip("\r\n"),)
    header = ts.header(args.pvals)
    # -c may be given as a column name or a 1-based column number
    if args.c in header:
        args.c = header.index(args.c) + 1
    else:
        args.c = int(args.c)
    return run(args)
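
# A hypothetical way to exercise main() without a shell; the file names
# below are placeholders and `run` is defined elsewhere in this module.
# -c accepts either a 1-based column number or a header column name:
import sys
sys.argv[1:] = ["-p", "pvals.bed", "-r", "regions.bed", "-s", "50",
                "-c", "p.value"]
main()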
def main(tests, xref, head):
    df = pandas.read_table(xref, index_col=[0], compression=get_comp(xref))
    a, b = get_ab(tests)
    out_order = header(tests)
    if head:
        print "\t".join(out_order)
    for l in reader(tests):
        try:
            if (df[a][l['SiteA']] or df[b][l['SiteA']]
                    or df[a][l['SiteB']] or df[b][l['SiteB']]):
                print "\t".join(l[h] for h in out_order)
        except KeyError:
            # definitely not a primary site
            pass
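
# `get_comp` (and `get_ab`) are defined elsewhere; a minimal sketch of
# get_comp, assuming it maps a file extension to the `compression`
# argument that pandas.read_table expects:
def get_comp(path):
    if path.endswith(".gz"):
        return "gzip"
    if path.endswith(".bz2"):
        return "bz2"
    return None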
def main(snpeff, descriptions):
    snpeff_header = header(snpeff)
    additional_columns = header(descriptions)
    full_header = snpeff_header + additional_columns
    descriptions_dict = descriptions_to_dict(descriptions)
    print "\t".join(full_header)
    for l in reader(snpeff):
        # look up the annotation by Gene_ID, falling back to Gene_name
        add = dict()
        try:
            add = descriptions_dict[l['Gene_ID']]
        except KeyError:
            try:
                add = descriptions_dict[l['Gene_name']]
            except KeyError:
                # no annotation found; print the original columns only
                print "\t".join(l[h] for h in snpeff_header)
                continue
        full_line = dict(l.items() + add.items())
        # guarantee a non-empty 'status' field exists
        if not full_line.get('status'):
            full_line['status'] = ""
        print "\t".join(full_line[h] for h in full_header)
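
# `descriptions_to_dict` is not shown; a minimal sketch, assuming the
# descriptions file is keyed on its first header column (the gene id/name):
def descriptions_to_dict(descriptions):
    key = header(descriptions)[0]
    return dict((d[key], d) for d in reader(descriptions))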
def uniprot(args):
    """Add Uniprot annotation to gene list.

    Args: genes, uniprotdb, column
    """
    uniprot_db = {}
    uniprot_header = header(args.uniprotdb)
    # index every alias in 'Gene names' to its full Uniprot entry
    for entry in reader(args.uniprotdb):
        for gene in entry['Gene names'].split():
            uniprot_db[gene] = entry
    for entry in reader(args.genes, header=False):
        uniprot_fields = []
        for gene in entry[int(args.column) - 1].split(","):
            hit = uniprot_db.get(gene)
            if hit:
                for h in uniprot_header:
                    uniprot_fields.append(hit[h])
        # join with an explicit tab between the original row and the
        # appended annotation columns
        print "\t".join(entry) + "\t" + "\t".join(map(str, uniprot_fields))
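
# A hypothetical call, building the namespace argparse would normally
# supply (the file names and column number are placeholders):
from argparse import Namespace
uniprot(Namespace(genes="genes.txt", uniprotdb="uniprot.tsv", column="1"))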
def get_col_num(c, bed_file=None):
    """
    adjust the col number so it does intuitive stuff
    for command-line interface
    >>> get_col_num(4)
    3
    >>> get_col_num(-1)
    -1
    """
    if isinstance(c, basestring) and c.isdigit():
        c = int(c)
    if isinstance(c, (int, long)):
        # negative indexes count from the end; positive ones are 1-based
        return c if c < 0 else (c - 1)
    # otherwise c is a column name; look it up in the file's header
    header = ts.header(bed_file)
    assert c in header
    return header.index(c)
def main(result_files):
    common_header = header(result_files[0])[:-1]
    samples = set()
    for file in result_files:
        # leaves one entry per unique sample
        samples.add(op.basename(file).rsplit("_", 2)[0])
    for sample in samples:
        # open a new file to facilitate the join
        out = gzip.open("{sample}.combined.txt.gz".format(sample=sample), "wb")
        i = 1
        print >>out, "\t".join(common_header)
        for result_file in result_files:
            if not op.basename(result_file).startswith(sample):
                continue
            for toks in reader(result_file, header=common_header):
                if not toks['Functionality'] == 'productive':
                    continue
                toks['Sequence number'] = str(i)
                print >>out, "\t".join(toks[h] for h in common_header)
                i += 1
        out.close()
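
# A hypothetical driver, assuming the per-sample results files are named
# <sample>_<x>_<y>.txt so that rsplit("_", 2)[0] recovers the sample name
# (the glob pattern is a placeholder):
import glob
main(sorted(glob.glob("results/*_*_*.txt")))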
def main(args):
    annotation_dict = make_dict(args.annotated_peaks)
    gtf = gtf2dict(args.gtf)
    blast_header = header(args.blast)
    # only append these columns from the annotation
    annotation_save = "baseMeanA baseMeanB pval padj sequence".split()
    blast_header.extend(annotation_save)
    # stuff one may want from the gtf
    gtf_save = "symbol ensg enst".split()
    blast_header.extend(gtf_save)
    print "\t".join(blast_header)
    for b in reader(args.blast, header=True):
        annotation_lookup = annotation_dict[b['qseqid']]
        # sseqid looks like ENST00000576218_-; keep only the transcript id
        enst = b['sseqid'].split("_")[0]
        try:
            gtf_lookup = gtf[enst]
        except KeyError:
            gtf_lookup = {'enst': enst, 'ensg': '-', 'symbol': '-'}
        d = dict(b.items() + annotation_lookup.items() + gtf_lookup.items())
        print "\t".join([d[i] for i in blast_header])
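
# `make_dict` is not shown; a minimal sketch, assuming the annotated peaks
# file carries an id column matching blast's qseqid (the column name
# "name" here is a placeholder):
def make_dict(annotated_peaks):
    return dict((d['name'], d) for d in reader(annotated_peaks))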
def main(deseq_output, ensembl_gtf):
    # cache the gtf translation table as a pickle next to the gtf
    p = "%s.pickle" % ensembl_gtf
    if os.path.exists(p):
        with open(p, 'rb') as handle:
            ensembl_translation = pickle.load(handle)
    else:
        ensembl_translation = make_ens_table(ensembl_gtf)
        with open(p, 'wb') as handle:
            pickle.dump(ensembl_translation, handle)
    h = header(deseq_output, sep=",")
    # the first (unnamed) DESeq column holds the gene id
    h[0] = 'ensg'
    h = [i.strip('"') for i in h]
    h.append('type')
    print ",".join(h)
    for toks in reader(deseq_output, sep=","):
        toks['ensg'] = toks['']
        toks['type'] = ensembl_translation[toks['ensg']]
        print ",".join(toks[i] for i in h)
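
# `make_ens_table` is not shown; a minimal sketch, assuming it maps
# gene_id -> gene_biotype by parsing the attributes column of the GTF
# (nopen from toolshed is assumed, for transparent gzip handling):
def make_ens_table(ensembl_gtf):
    table = {}
    for line in nopen(ensembl_gtf):
        if line.startswith("#"):
            continue
        attrs = line.rstrip("\r\n").split("\t")[8]
        d = dict(kv.strip().replace('"', '').split(" ", 1)
                 for kv in attrs.rstrip(";").split(";") if kv.strip())
        table[d["gene_id"]] = d.get("gene_biotype", "")
    return table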
def filter(p_bed, region_bed, max_p=None, region_p=None,
           p_col_name="P.Value", coef_col_name="logFC"):
    ph = ts.header(p_bed)
    if (ph[1] + ph[2]).isdigit():
        raise Exception("need header in p-value file to run filter")
    assert ph[1] == "start" and ph[2] == "end" and ph[0] == "chrom", \
        ("must have chrom, start, end header for", p_bed)
    ph = ["p" + h for h in ph]
    rh = ts.header(region_bed)
    header = not (rh[1] + rh[2]).isdigit()
    # allow p_col_name as a 1-based column number or a column name
    if isinstance(p_col_name, str) and p_col_name.isdigit():
        p_col_name = int(p_col_name) - 1
    if isinstance(p_col_name, (int, long)):
        p_col_name = ph[p_col_name][1:]
    a = dict(p_bed=p_bed, region_bed=region_bed)
    a["p_bed"] = fix_bed(a["p_bed"])
    a["header"] = ""
    j = 0
    for group, plist in groupby(
            ts.reader("|bedtools intersect -b %(p_bed)s "
                      "-a %(region_bed)s -wo %(header)s" % a,
                      header=rh + ph),
            itemgetter("chrom", "start", "end")):
        plist = list(plist)
        if region_p:
            r = plist[0]  # first cols are all the same
            region_p_key = ("slk_sidak_p" if "slk_sidak_p" in r
                            else "z_sidak_p" if "z_sidak_p" in r
                            else None)
            if region_p_key is None:
                raise Exception
            if float(r[region_p_key]) > region_p:
                continue
        try:
            # keep only p-value intervals contained within the region
            plist = [x for x in plist
                     if (int(x["start"]) <= int(x["pstart"]) <= int(x["pend"]))
                     and (int(x["start"]) <= int(x["pend"]) <= int(x["end"]))]
        except:
            print plist
            raise

        tscores = [float(row["pt"]) for row in plist if "pt" in row]

        if max_p:
            if any(float(row["p" + p_col_name]) > max_p for row in plist):
                continue

        ngt05 = sum(1 for row in plist
                    if float(row["p" + p_col_name]) > 0.05)
        # logic to try to find t and coef headers and skip if not found
        extra_header = []
        extra = []
        if tscores:
            tpos = sum(1 for t in tscores if t > 0)
            tneg = sum(1 for t in tscores if t < 0)
            tpn = "%i/%i" % (tpos, tneg)
            tsum = str(sum(t for t in tscores))
            extra_header += ["t.pos/t.neg", "t.sum"]
            extra += [tpn, tsum]
        else:
            tsum = tpn = "NA"
        if "p" + coef_col_name not in plist[0] and "pcoefficient" in plist[0]:
            coef_col_name = "coefficient"
        if "p" + coef_col_name in plist[0]:
            coef = (sum(float(row["p" + coef_col_name]) for row in plist)
                    / len(plist))
            # since we probably had the data logit transformed, here we
            # do the inverse and subtract 0.5 since ilogit(0) == 0.5
            icoef = (sum(ilogit(float(row["p" + coef_col_name]))
                         for row in plist) / len(plist)) - 0.5
            extra_header += ["avg.diff", "ilogit.diff"]
            extra += ["%.3f" % coef, "%.3f" % icoef]
        else:
            coef = icoef = float("nan")
        frow = [plist[0][h] for h in rh] + extra
        if j == 0:
            yield rh + extra_header
            j = 1
        yield frow
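
# `filter` is a generator whose first yield is the header row; a minimal
# driver, with placeholder file names and cutoff, might look like:
for toks in filter("pvals.bed", "regions.bed", region_p=0.05):
    print "\t".join(map(str, toks))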
def pipeline(col_num, step, dist, acf_dist, prefix, threshold, seed,
             bed_files, mlog=True, region_filter_p=1, region_filter_n=None,
             genome_control=False, db=None, use_fdr=True):
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    from cpv._common import genome_control_adjust, genomic_control, bediter
    import operator

    if step is None:
        step = min(acf_dist, stepsize.stepsize(bed_files, col_num))
        print >>sys.stderr, "calculated stepsize as: %i" % step

    lags = range(1, acf_dist, step)
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")
    putative_acf_vals = acf.acf(bed_files, lags, col_num, simple=False,
                                mlog=mlog)
    acf_vals = []
    # go out to the max requested distance, but stop once the
    # autocorrelation drops below 0.04. this heuristic seems to work.
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N))
        if a[1][0] < 0.04 and len(acf_vals) > 2:
            break
        acf_vals.append(a)
        if a[1][0] < 0.04 and len(acf_vals):
            break

    # save the arguments that this was called with.
    with open(prefix + ".args.txt", "w") as fh:
        print >>fh, " ".join(sys.argv[1:]) + "\n"
        import datetime
        print >>fh, "date: %s" % datetime.datetime.today()
        from .__init__ import __version__
        print >>fh, "version:", __version__

    with open(prefix + ".acf.txt", "w") as fh:
        acf_vals = acf.write_acf(acf_vals, fh)
        print >>sys.stderr, "wrote: %s" % fh.name
    print >>sys.stderr, "ACF:\n", open(prefix + ".acf.txt").read()

    spvals, opvals = [], []
    with ts.nopen(prefix + ".slk.bed.gz", "w") as fhslk:
        fhslk.write('#chrom\tstart\tend\tp\tregion-p\n')
        for row in slk.adjust_pvals(bed_files, col_num, acf_vals):
            fhslk.write("%s\t%i\t%i\t%.4g\t%.4g\n" % row)
            opvals.append(row[-2])
            spvals.append(row[-1])

    print >>sys.stderr, "# original lambda: %.2f" % genomic_control(opvals)
    del opvals

    gc_lambda = genomic_control(spvals)
    print >>sys.stderr, "wrote: %s with lambda: %.2f" % (fhslk.name,
                                                         gc_lambda)

    if genome_control:
        fhslk = ts.nopen(prefix + ".slk.gc.bed.gz", "w")
        adj = genome_control_adjust(
            [d['p'] for d in bediter(prefix + ".slk.bed.gz", -1)])
        for i, line in enumerate(ts.nopen(prefix + ".slk.bed.gz")):
            print >>fhslk, "%s\t%.5g" % (line.rstrip("\r\n"), adj[i])
        fhslk.close()
        print >>sys.stderr, "wrote: %s" % fhslk.name

    with ts.nopen(prefix + ".fdr.bed.gz", "w") as fh:
        fh.write('#chrom\tstart\tend\tp\tregion-p\tregion-q\n')
        for bh, l in fdr.fdr(fhslk.name, -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
        print >>sys.stderr, "wrote: %s" % fh.name

    fregions = prefix + ".regions.bed.gz"
    with ts.nopen(fregions, "w") as fh:
        list(peaks.peaks(prefix + ".fdr.bed.gz", -1 if use_fdr else -2,
                         threshold, seed, dist, fh, operator.le))
    n_regions = sum(1 for _ in ts.nopen(fregions))
    print >>sys.stderr, "wrote: %s (%i regions)" % (fregions, n_regions)
    if n_regions == 0:
        sys.exit()

    with ts.nopen(prefix + ".regions-p.bed.gz", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tz_p\tz_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        for region_line, slk_p, slk_sidak_p, sim_p in region_p.region_p(
                prefix + ".slk.bed.gz",
                prefix + ".regions.bed.gz", -2, step):
            fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
            fh.flush()
            N += int(slk_sidak_p < 0.05)
        print >>sys.stderr, \
            "wrote: %s, (regions with corrected-p < 0.05: %i)" % (fh.name, N)

    regions_bed = fh.name
    header = ts.header(bed_files[0])
    #if all(h in header for h in ('t', 'start', 'end')):
    if region_filter_n is None:
        region_filter_n = 0
    with ts.nopen(prefix + ".regions-t.bed", "w") as fh:
        N = 0
        for i, toks in enumerate(filter.filter(bed_files[0], regions_bed,
                                               p_col_name=col_num)):
            if i == 0:
                toks[0] = "#" + toks[0]
            else:
                if float(toks[6]) > region_filter_p:
                    continue
                if int(toks[4]) < region_filter_n:
                    continue
                #if region_filter_t and "/" in toks[7]:
                #    # t-pos/t-neg. if the lower one is > region_filter_t?
                #    vals = map(int, toks[7].split("/"))
                #    if min(vals) > region_filter_t: continue
                N += 1
            print >>fh, "\t".join(toks)
        print >>sys.stderr, ("wrote: %s, (regions with region-p "
                             "< %.3f and n-probes >= %i: %i)") \
            % (fh.name, region_filter_p, region_filter_n, N)

    try:
        from cpv import manhattan
        regions = manhattan.read_regions(fh.name)
        manhattan.manhattan(prefix + ".slk.bed.gz", 3,
                            prefix.rstrip(".") + ".manhattan.png",
                            False, ['#959899', '#484B4C'], "", False, None,
                            regions=regions, bonferonni=False)
    except ImportError:
        pass  # they don't have matplotlib

    if db is not None:
        from cruzdb import Genome
        g = Genome(db)
        lastf = fh.name
        with open(prefix + ".anno.%s.bed" % db, "w") as fh:
            fh.write('#')
            g.annotate(lastf, ("refGene", "cpgIslandExt"), out=fh,
                       feature_strand=True, parallel=len(spvals) > 500)
        print >>sys.stderr, "wrote: %s annotated with %s" % (fh.name, db)
import sys
import toolshed as ts
from sklearn import preprocessing
import numpy as np

regions, weighted = [], []
totlen = 0.0
for d in ts.reader(sys.argv[1]):
    totlen += int(d['end']) - int(d['start'])
    regions.append(d)

header = ts.header(sys.argv[1])
print "\t".join(header) + "\t" + "weighted_pct"

# walk the regions, dropping the percentile by each run's share of the
# total length whenever the residual percentile changes
pct = 100.0
regionlength = 0
opct = regions[0]['resid_pctile']
for d in regions:
    regionlength += int(d['end']) - int(d['start'])
    if d['resid_pctile'] != opct:
        pct -= regionlength / totlen * 100
        regionlength = 0
        opct = d['resid_pctile']
    weighted.append(pct)

# rescale the weighted percentiles onto [0, 100]
X_train = np.array(weighted).reshape(len(weighted), 1)
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 100))
resid_pctile = min_max_scaler.fit_transform(X_train).ravel()

for i, d in enumerate(regions):
    print "\t".join(d[h] for h in header) + "\t" + "%.9f" % resid_pctile[i]
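
# For intuition: MinMaxScaler linearly rescales so the smallest weighted
# value maps to 0 and the largest to 100. A tiny standalone check:
import numpy as np
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler(feature_range=(0, 100))
print scaler.fit_transform(np.array([[2.0], [3.0], [6.0]])).ravel()
# -> [0., 25., 100.]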