def generatePeakSets(infile, outfiles):
    """Write conservative and optimum IDR peak sets for one experiment.

    Parameters
    ----------
    infile : str
        Gzipped narrowPeak-style file; its basename encodes the sample as
        ``<tissue>-<condition>-...`` before the first ``_VS_``.
    outfiles : sequence
        Two output paths: (conservative peak set, optimum peak set), both
        written gzip-compressed.

    The conservative set keeps the top ``max_numPeaks`` peaks (the maximum
    peak count across inter-replicate IDR runs); the optimum set keeps the
    top ``max(max_numPeaks, numPeaks_Rep0)`` peaks, where ``numPeaks_Rep0``
    comes from the pooled-pseudoreplicate IDR run.
    """
    outf_con, outf_opt = outfiles

    # retrieve maximum number of peaks obtained from inter-replicate IDR
    statement = ("SELECT"
                 " experiment,"
                 " max(n_peaks) AS nPeaks"
                 " FROM individual_replicates_nPeaks"
                 " GROUP BY experiment")
    df = PU.fetch_DataFrame(statement)
    # reassign experiment as index
    df = df.set_index("experiment")

    # retrieve number of peaks obtained from pooled_pseudoreplicate IDR
    statement = ("SELECT"
                 " experiment,"
                 " n_peaks AS nPeaks"
                 " FROM pooled_pseudoreplicates_nPeaks")
    df2 = PU.fetch_DataFrame(statement)
    # reassign experiment as index
    df2 = df2.set_index("experiment")

    # split the infile name to obtain experiment
    # (first two '-'-separated fields joined by '_')
    sample_id = os.path.basename(infile).split("_VS_")[0]
    sample = sample_id.split("-")
    experiment = "_".join([sample[0], sample[1]])

    # retrieve max_numPeaks for experiment
    nPeaks = int(df.loc[experiment])
    # retrieve numPeaks_Rep0 for experiment
    nPeaks_rep0 = int(df2.loc[experiment])
    # retrieve maximum of the two
    nPeaks_max = max(nPeaks, nPeaks_rep0)

    # establish which column to sort by (narrowPeak columns 7/8/9 are
    # signalValue, pValue, qValue)
    if PARAMS["idr_options_ranking_measure"] == "signal.value":
        sort_statement = "sort -k7nr,7nr"
    elif PARAMS["idr_options_ranking_measure"] == "p.value":
        sort_statement = "sort -k8nr,8nr"
    elif PARAMS["idr_options_ranking_measure"] == "q.value":
        sort_statement = "sort -k9nr,9nr"
    else:
        raise ValueError("Unrecognised ranking_measure"
                         " %s don't know which column"
                         " to sort on" % PARAMS["idr_options_ranking_measure"])

    # sort infile by column and write top nPeaks to outfile (conservative)
    # head closes the pipe early, so SIGPIPE from zcat/sort must be ignored
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks)s |"
                 " gzip > %(outf_con)s")
    P.run()

    # sort infile by column and write top nPeaks_max to outfile (optimum)
    # BUG FIX: this previously truncated at nPeaks, so the optimum set was
    # identical to the conservative set; use nPeaks_max as intended
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks_max)s |"
                 " gzip > %(outf_opt)s")
    P.run()
def interactions2BedGraph(track, probe, db, outfile, chrom=None,
                          start=None, end=None, region_size=1000000,
                          window=2000, step=200):
    """Export smoothed interaction counts around *probe* as bedGraph.

    Counts of fragments interacting with the probe fragment are summed in
    sliding windows of width ``window`` placed every ``step`` bp across the
    region, and written to *outfile* as ``chr<TAB>start<TAB>end<TAB>count``
    with no header.

    Parameters
    ----------
    track : str
        Track name to select in ``interaction_counts``.
    probe : str
        Probe name resolved to a fragment via ``probe_fragments_lookup``.
    db : str
        Database to query.
    outfile : str
        Output path, opened via ``IOTools.openFile`` (so ``.gz`` works).
    chrom, start, end : optional
        Region to export; default to a ``region_size`` window centred on
        the probe fragment.
    """
    import CGATPipelines.PipelineUtilities as PUtils
    import pandas
    import numpy as np

    E.debug("Get Probe fragment")
    # get probe fragment
    statement = '''SELECT chr, start, end, fragment
                   FROM probe_fragments
                   INNER JOIN probe_fragments_lookup as lu
                   ON probe_fragments.name = lu.probe
                   WHERE name = '%(probe)s' '''
    probes = PUtils.fetch_DataFrame(statement % locals(), database=db)
    probe_frag = probes.fragment.iloc[0]

    # BUG FIX: probe_centre is needed by the query below to mask fragments
    # near the probe, so compute it unconditionally (the query previously
    # referenced an undefined 'probe_start' via an incomplete placeholder);
    # likewise honour an explicitly supplied chrom instead of leaving
    # probe_chr unset.
    probe_centre = 0.5 * (probes.start.iloc[0] + probes.end.iloc[0])
    if chrom is None:
        probe_chr = probes.chr.iloc[0]
    else:
        probe_chr = chrom
    if start is None:
        start = probe_centre - region_size / 2
    if end is None:
        end = probe_centre + region_size / 2

    E.debug("Fetch data")
    # 'count as Count' keeps the DataFrame column name consistent with the
    # 'Count' accesses below regardless of how the driver cases expression
    # names; abs(... - Frag2) > 1 drops fragments adjacent to the probe.
    statement = '''SELECT (start+end)/2 as centre, count as Count
                   FROM interaction_counts as ic
                   INNER JOIN fragments ON fragments.name = ic.Frag2
                   WHERE abs(%(probe_frag)s - ic.Frag2) > 1
                   AND Frag1 = '%(probe_frag)s'
                   AND track = '%(track)s'
                   AND chr='%(probe_chr)s'
                   AND centre > %(start)s AND centre <= %(end)s
                   AND NOT ABS(centre - %(probe_centre)s) < 2000
                   ORDER BY centre '''
    E.debug(statement % locals())
    interactions = PUtils.fetch_DataFrame(statement % locals(), db)
    E.debug("Got %i interacting fragments" % interactions.shape[0])

    # index by fragment centre so windows can be taken with .loc slices
    interactions["centre"] = interactions["centre"].astype("float64")
    interactions.set_index("centre", inplace=True)
    interactions = interactions.sort_index()

    # one window start every `step` bp; index mirrors the values so that
    # reset_index() later yields a 'start' column
    window_starts = pandas.Series(np.arange(start, end, step))
    window_starts.index = window_starts.values
    window_starts.index.name = "start"

    def _applyToWindow(val):
        # sum counts of fragments whose centre lies within the window
        return interactions.loc[(val - window / 2):(val + window / 2)]["Count"].sum()

    E.debug("rolling")
    windowed = window_starts.apply(_applyToWindow)
    # overlapping windows can only count each fragment one or more times,
    # so the windowed total must be at least the raw total
    assert windowed.sum() >= interactions["Count"].sum(), \
        "windowed sum %s, total count %s" % (windowed.sum(),
                                             interactions["Count"].sum())

    E.debug("format output")
    windowed = windowed.reset_index()
    windowed["end"] = windowed["start"] + step
    windowed["chr"] = probe_chr
    windowed = windowed[["chr", "start", "end", 0]]
    windowed.to_csv(IOTools.openFile(outfile, "w"),
                    sep="\t", index=False, header=False)
# NOTE(review): this is a byte-for-byte duplicate (modulo whitespace) of the
# interactions2BedGraph defined above and, at module level, shadows it.
# Confirm which copy is intended and delete the other.
def interactions2BedGraph(track, probe, db, outfile, chrom=None,
                          start=None, end=None, region_size=1000000,
                          window=2000, step=200):
    """Write windowed interaction counts around *probe* in bedGraph format.

    Fragment interaction counts for the probe fragment are accumulated in
    overlapping windows (``window`` wide, ``step`` apart) over the chosen
    region and saved to *outfile* as headerless tab-separated
    ``chr, start, end, count`` rows.

    Parameters
    ----------
    track : str
        Track selected in ``interaction_counts``.
    probe : str
        Probe name, mapped to its fragment through ``probe_fragments_lookup``.
    db : str
        Database to query.
    outfile : str
        Destination path, opened with ``IOTools.openFile``.
    chrom, start, end : optional
        Region of interest; defaults to ``region_size`` bp centred on the
        probe fragment.
    """
    import CGATPipelines.PipelineUtilities as PUtils
    import pandas
    import numpy as np

    E.debug("Get Probe fragment")
    # get probe fragment
    statement = '''SELECT chr, start, end, fragment
                   FROM probe_fragments
                   INNER JOIN probe_fragments_lookup as lu
                   ON probe_fragments.name = lu.probe
                   WHERE name = '%(probe)s' '''
    probes = PUtils.fetch_DataFrame(statement % locals(), database=db)
    probe_frag = probes.fragment.iloc[0]

    # BUG FIX: compute probe_centre unconditionally — the SQL below needs it
    # to exclude fragments near the probe (it previously used an incomplete
    # '%(probe_start)' placeholder naming an undefined variable) — and fall
    # back to the probes table only when chrom was not supplied.
    probe_centre = 0.5 * (probes.start.iloc[0] + probes.end.iloc[0])
    probe_chr = probes.chr.iloc[0] if chrom is None else chrom
    if start is None:
        start = probe_centre - region_size / 2
    if end is None:
        end = probe_centre + region_size / 2

    E.debug("Fetch data")
    # alias count -> Count so the DataFrame column matches the 'Count'
    # lookups below; abs(probe_frag - Frag2) > 1 removes adjacent fragments
    statement = '''SELECT (start+end)/2 as centre, count as Count
                   FROM interaction_counts as ic
                   INNER JOIN fragments ON fragments.name = ic.Frag2
                   WHERE abs(%(probe_frag)s - ic.Frag2) > 1
                   AND Frag1 = '%(probe_frag)s'
                   AND track = '%(track)s'
                   AND chr='%(probe_chr)s'
                   AND centre > %(start)s AND centre <= %(end)s
                   AND NOT ABS(centre - %(probe_centre)s) < 2000
                   ORDER BY centre '''
    E.debug(statement % locals())
    interactions = PUtils.fetch_DataFrame(statement % locals(), db)
    E.debug("Got %i interacting fragments" % interactions.shape[0])

    # index fragments by centre position to allow .loc window slicing
    interactions["centre"] = interactions["centre"].astype("float64")
    interactions.set_index("centre", inplace=True)
    interactions = interactions.sort_index()

    # window start positions, indexed by themselves so reset_index()
    # produces a 'start' column later
    window_starts = pandas.Series(np.arange(start, end, step))
    window_starts.index = window_starts.values
    window_starts.index.name = "start"

    def _applyToWindow(val):
        # total count of fragments whose centre falls inside this window
        return interactions.loc[(val - window / 2):(val + window / 2)]["Count"].sum()

    E.debug("rolling")
    windowed = window_starts.apply(_applyToWindow)
    # windows overlap, so each fragment is counted at least once: the
    # windowed total can never fall below the raw total
    assert windowed.sum() >= interactions["Count"].sum(), \
        "windowed sum %s, total count %s" % (windowed.sum(),
                                             interactions["Count"].sum())

    E.debug("format output")
    windowed = windowed.reset_index()
    windowed["end"] = windowed["start"] + step
    windowed["chr"] = probe_chr
    windowed = windowed[["chr", "start", "end", 0]]
    windowed.to_csv(IOTools.openFile(outfile, "w"),
                    sep="\t", index=False, header=False)