def test_cobble(self):
    """Check `cobble` on three overlapping chr1 features in three modes.

    The same A/B/C intervals are run through the default call,
    ``stranded=True`` and ``scored=True``.  (More tests below.)
    """
    strand_fields = ['chr', 'start', 'end', 'name', 'strand']
    overlapping = [('chr1', 10, 20, 'A', 1),
                   ('chr1', 12, 22, 'B', -1),
                   ('chr1', 15, 25, 'C', -1)]

    # Default call: the expected output has one piece per stretch between
    # consecutive boundaries, names joined with '|', and strand 0 where
    # the contributing strands disagree.
    pieces = list(cobble(fstream(list(overlapping),
                                 fields=list(strand_fields))))
    self.assertEqual(pieces, [
        ('chr1', 10, 12, 'A', 1),
        ('chr1', 12, 15, 'A|B', 0),
        ('chr1', 15, 20, 'A|B|C', 0),
        ('chr1', 20, 22, 'B|C', -1),
        ('chr1', 22, 25, 'C', -1),
    ])

    # stranded = True: 'A' (strand 1) is expected back untouched, while
    # 'B' and 'C' (strand -1) are only split against each other.
    pieces = list(cobble(fstream(list(overlapping),
                                 fields=list(strand_fields)),
                         stranded=True))
    self.assertEqual(pieces, [
        ('chr1', 10, 20, 'A', 1),
        ('chr1', 12, 15, 'B', -1),
        ('chr1', 15, 22, 'B|C', -1),
        ('chr1', 22, 25, 'C', -1),
    ])

    # scored = True: a trailing 'score' field is added and 'C' now ends at
    # 65.  The expected scores match each input score scaled by the
    # fraction of its feature covered by the piece, summed over the
    # contributing features.
    scored_features = [('chr1', 10, 20, 'A', 1, 50.0),
                       ('chr1', 12, 22, 'B', -1, 100.0),
                       ('chr1', 15, 65, 'C', -1, 20.0)]
    pieces = list(cobble(fstream(scored_features,
                                 fields=strand_fields + ['score']),
                         scored=True))
    self.assertEqual(pieces, [
        ('chr1', 10, 12, 'A', 1, 10.0),
        ('chr1', 12, 15, 'A|B', 0, 45.0),
        ('chr1', 15, 20, 'A|B|C', 0, 77.0),
        ('chr1', 20, 22, 'B|C', -1, 20.8),
        ('chr1', 22, 65, 'C', -1, 17.2),
    ])
def test_cobble(self):
    """Check `cobble` on three overlapping chr1 features in three modes.

    The same A/B/C intervals are run through the default call,
    ``stranded=True`` and ``scored=True``.  (More tests below.)
    """
    strand_fields = ['chr', 'start', 'end', 'name', 'strand']
    overlapping = [('chr1', 10, 20, 'A', 1),
                   ('chr1', 12, 22, 'B', -1),
                   ('chr1', 15, 25, 'C', -1)]

    # Default call: the expected output has one piece per stretch between
    # consecutive boundaries, names joined with '|', and strand 0 where
    # the contributing strands disagree.
    pieces = list(cobble(fstream(list(overlapping),
                                 fields=list(strand_fields))))
    self.assertEqual(pieces, [
        ('chr1', 10, 12, 'A', 1),
        ('chr1', 12, 15, 'A|B', 0),
        ('chr1', 15, 20, 'A|B|C', 0),
        ('chr1', 20, 22, 'B|C', -1),
        ('chr1', 22, 25, 'C', -1),
    ])

    # stranded = True: 'A' (strand 1) is expected back untouched, while
    # 'B' and 'C' (strand -1) are only split against each other.
    pieces = list(cobble(fstream(list(overlapping),
                                 fields=list(strand_fields)),
                         stranded=True))
    self.assertEqual(pieces, [
        ('chr1', 10, 20, 'A', 1),
        ('chr1', 12, 15, 'B', -1),
        ('chr1', 15, 22, 'B|C', -1),
        ('chr1', 22, 25, 'C', -1),
    ])

    # scored = True: a trailing 'score' field is added and 'C' now ends at
    # 65.  The expected scores match each input score scaled by the
    # fraction of its feature covered by the piece, summed over the
    # contributing features.
    scored_features = [('chr1', 10, 20, 'A', 1, 50.0),
                       ('chr1', 12, 22, 'B', -1, 100.0),
                       ('chr1', 15, 65, 'C', -1, 20.0)]
    pieces = list(cobble(fstream(scored_features,
                                 fields=strand_fields + ['score']),
                         scored=True))
    self.assertEqual(pieces, [
        ('chr1', 10, 12, 'A', 1, 10.0),
        ('chr1', 12, 15, 'A|B', 0, 45.0),
        ('chr1', 15, 20, 'A|B|C', 0, 77.0),
        ('chr1', 20, 22, 'B|C', -1, 20.8),
        ('chr1', 22, 65, 'C', -1, 17.2),
    ])
def combine(trackList, fn, win_size=1000, aggregate=None):
    """
    Applies a custom function to a list of tracks, such as union, intersection,
    etc., and return a single result track. The input streams need to be ordered
    w.r.t 'chr', 'start' and 'end'. To be applied chromosome by chromosome.

    Only fields of the first track are kept. Values for a common field are
    merged by default according to `common.strand_merge`, `common.no_merge` and
    `common.generic_merge`, respectively for strand, chromosome and all others.

    :param trackList: list of FeatureStream objects (must not be empty: the
        output fields are taken from the first one).
    :param fn: boolean function to apply, such as bbcflib.gfminer.stream.union.
        A string is also accepted and is passed to ``eval`` to obtain the
        function (e.g. ``fn='intersection'``).
    :param win_size: (int) window size, in bp.
    :param aggregate: (dict) for each field name given as a key, its value is
        the function to apply to the vector containing all trackList's values
        for this field in order to merge them. E.g.
        ``{'score': lambda x: sum(x)/len(x)}`` will return the average of all
        *trackList*'s scores in the output. Defaults to no custom merger; the
        given dict is copied, never modified.
    :rtype: FeatureStream
    """
    # Work on a private copy: a mutable default ({}) would be shared between
    # calls, and setdefault() below would otherwise write into the caller's
    # own dictionary.
    aggregate = {} if aggregate is None else dict(aggregate)
    aggregate.setdefault('strand', common.strand_merge)
    aggregate.setdefault('chr', common.no_merge)

    # Sort keys for reordering; 'chr' is only used when every track has it.
    _f = ['start', 'end']
    if all('chr' in t.fields for t in trackList):
        _f += ['chr']

    if isinstance(fn, str):
        # NOTE(security): eval() executes arbitrary code. Only pass function
        # names coming from trusted code, never raw user input.
        fn = eval(fn)  # can type "combine(...,fn='intersection')"

    trackList = [common.cobble(common.reorder(t, fields=_f)) for t in trackList]
    combined = _combine(trackList, fn, win_size, aggregate)
    return common.fusion(FeatureStream(combined, fields=trackList[0].fields))
def fimo(motifs, fasta, qval=True):
    """Run the `fimo` command line tool and convert its output to 'fimo.bed'.

    The raw output ('fimo_out/fimo.txt') is sorted numerically on columns
    2-4 into 'fimo.txt' in the current directory, read back as a track,
    cobbled, and written as a BED track named "TSS_motifs". The 'fimo_out'
    directory is removed at the end.

    :param motifs: path of the motifs file given to fimo.
    :param fasta: path of the sequences file given to fimo.
    :param qval: if True, run with ``--thresh 0.01 --qv-thresh``; otherwise
        with ``--thresh 0.000001``.
    :raises RuntimeError: if the `fimo` or `sort` command exits with a
        non-zero status (previously such failures were silently ignored and
        an empty BED file was produced).
    """
    # Run Fimo
    if qval:
        options = "--max-stored-scores 1000000 --verbosity 1 --thresh 0.01 --qv-thresh"
    else:
        options = "--max-stored-scores 1000000 --verbosity 1 --thresh 0.000001"
    # NOTE(security): the command goes through the shell unquoted, so
    # `motifs` and `fasta` must come from a trusted source.
    cmd = "fimo " + options + " %s %s" % (motifs, fasta)
    print("Running >> %s" % cmd)
    status = os.system(cmd)
    if status != 0:
        raise RuntimeError("Command failed with status %s: %s" % (status, cmd))
    sort_cmd = "sort -k2,2n -k3,3n -k4,4n fimo_out/fimo.txt > fimo.txt"
    status = os.system(sort_cmd)
    if status != 0:
        raise RuntimeError("Command failed with status %s: %s" % (status, sort_cmd))

    # Bed output
    t = track('fimo.txt',
              fields=["name", "chr", "start", "end", "strand",
                      "score", "p-value", "q-value", "sequence"])
    # The field names are reassigned right away, so that 'score' designates
    # the 7th column (declared as 'p-value' above) instead of the 6th.
    t.fields = ["name", "chr", "start", "end", "strand",
                "a", "score", "q", "sequence"]
    s = t.read()
    s = select(s, ['chr', 'start', 'end', 'name', 'score', 'strand'])
    # Keep only the second '|'-separated part of the sequence identifier.
    s = apply(s, 'chr', lambda x: x.split('|')[1])
    s = sorted_stream(s)
    s = cobble(s)
    # Drop duplicated names from the '|'-joined names produced by cobble.
    s = apply(s, 'name', lambda x: '|'.join(list(set(x.split('|')))))
    outname = 'fimo.bed'
    bed = track(outname, fields=s.fields)
    bed.make_header(name="TSS_motifs",
                    description="Motifs +-XKb around TSS",
                    mode='overwrite')
    bed.write(s)
    if os.path.exists("fimo_out"):
        shutil.rmtree("fimo_out")
def fimo(motifs, fasta, qval=True):
    """Run the `fimo` command line tool and convert its output to 'fimo.bed'.

    The raw output ('fimo_out/fimo.txt') is sorted numerically on columns
    2-4 into 'fimo.txt' in the current directory, read back as a track,
    cobbled, and written as a BED track named "TSS_motifs". The 'fimo_out'
    directory is removed at the end.

    :param motifs: path of the motifs file given to fimo.
    :param fasta: path of the sequences file given to fimo.
    :param qval: if True, run with ``--thresh 0.01 --qv-thresh``; otherwise
        with ``--thresh 0.000001``.
    :raises RuntimeError: if the `fimo` or `sort` command exits with a
        non-zero status (previously such failures were silently ignored and
        an empty BED file was produced).
    """
    # Run Fimo
    if qval:
        options = "--max-stored-scores 1000000 --verbosity 1 --thresh 0.01 --qv-thresh"
    else:
        options = "--max-stored-scores 1000000 --verbosity 1 --thresh 0.000001"
    # NOTE(security): the command goes through the shell unquoted, so
    # `motifs` and `fasta` must come from a trusted source.
    cmd = "fimo " + options + " %s %s" % (motifs, fasta)
    print("Running >> %s" % cmd)
    status = os.system(cmd)
    if status != 0:
        raise RuntimeError("Command failed with status %s: %s" % (status, cmd))
    sort_cmd = "sort -k2,2n -k3,3n -k4,4n fimo_out/fimo.txt > fimo.txt"
    status = os.system(sort_cmd)
    if status != 0:
        raise RuntimeError("Command failed with status %s: %s" % (status, sort_cmd))

    # Bed output
    t = track('fimo.txt',
              fields=["name", "chr", "start", "end", "strand",
                      "score", "p-value", "q-value", "sequence"])
    # The field names are reassigned right away, so that 'score' designates
    # the 7th column (declared as 'p-value' above) instead of the 6th.
    t.fields = ["name", "chr", "start", "end", "strand",
                "a", "score", "q", "sequence"]
    s = t.read()
    s = select(s, ['chr', 'start', 'end', 'name', 'score', 'strand'])
    # Keep only the second '|'-separated part of the sequence identifier.
    s = apply(s, 'chr', lambda x: x.split('|')[1])
    s = sorted_stream(s)
    s = cobble(s)
    # Drop duplicated names from the '|'-joined names produced by cobble.
    s = apply(s, 'name', lambda x: '|'.join(list(set(x.split('|')))))
    outname = 'fimo.bed'
    bed = track(outname, fields=s.fields)
    bed.make_header(name="TSS_motifs",
                    description="Motifs +-XKb around TSS",
                    mode='overwrite')
    bed.write(s)
    if os.path.exists("fimo_out"):
        shutil.rmtree("fimo_out")
def __call__(self, **kw):
    """Compute group sizes for a Venn diagram and write diagram + summary.

    Two input types are supported, selected by ``kw['input_type']``
    (default "Table"):

    * "Table": ``kw['table']`` is a text table with a header,
      ``kw['id_columns']`` a comma-separated list of 1-based column numbers
      and ``kw['filters']`` a comma-separated list of conditions (one per
      column, e.g. ``>2 AND <5``).  Rows are counted per group of columns
      whose condition holds.
    * "Tracks": ``kw['files']`` is one track file name or a list of them.
      Groups are measured by covered length (as a percentage of the total
      coverage) or, if ``kw['type'] == 'score'``, by summed score.

    For each group (e.g. "A|B"), ``c1`` holds what belongs to exactly that
    group and ``c2`` what belongs to at least that group.  A diagram is
    drawn from ``c2`` when there are at most 4 samples (file format from
    ``kw['output']``, default 'pdf'), and a tab-separated
    'venn_summary.txt' is always written.

    :returns: ``self.display_time()``, or None when the tracks have no
        coverage at all (only a short note is written in that case).
    :raises ValueError: if the input type is neither "Table" nor "Tracks".
    """
    def _parse_logic(string):
        # Turn a filter such as ">2 AND <5" into the template
        # "(%f >2)and(%f <5)", where each %f stands for the tested value.
        # Characters outside [A-Za-z0-9_!=><. ] are stripped first.
        s = re.sub(r'[^\w\d!=><\. ]', '', string)
        s = re.sub(r' OR ', ')or(%f ', s)
        s = re.sub(r' AND ', ')and(%f ', s)
        return "(%f " + s + ")"

    def _run_test(row, indx, cond):
        # Evaluate condition template `cond` on the value found in the
        # `indx`-th selected column of `row`.
        num = float(row[col_ind[indx]])
        # Clamp so that infinities are still formatted as plain numbers.
        num = max(-sys.maxint, min(sys.maxint, num))
        # Use this function's own `cond`: reading the enclosing-scope `c`
        # only worked through Python 2's list-comprehension variable leak.
        num = (num,) * cond.count("%f")
        # NOTE(security): eval() on a filter string; it is only protected
        # by the character stripping done in _parse_logic.
        return eval(cond % num)

    def _add_label(s, x):
        # Append the constant label `x` as a 'track_name' field.
        _f = s.fields + ['track_name']
        return FeatureStream((y + (x,) for y in s), fields=_f)

    venn_options = {}  # tune it here
    tracks = []
    intype = kw.get("input_type") or "Table"
    if intype == "Table":
        s_cols = kw.get('id_columns', '')
        s_filters = kw.get('filters', '')
        infile = track(kw.get('table', ''), format='txt', header=True)
        col_ind = [int(i) - 1 for i in s_cols.split(",")]
        legend = [infile.fields[i] if i < len(infile.fields) else str(i)
                  for i in col_ind]
        conds = [_parse_logic(x) for x in s_filters.split(",")]
        tlabels = [chr(k + 65) for k in range(len(col_ind))]  # 'A', 'B', ...
        # Columns without a filter get an always-true condition.
        conds += ["1"] * (len(col_ind) - len(conds))
        combn = [tuple(sorted(x))
                 for k in range(len(tlabels))
                 for x in combinations(tlabels, k + 1)]
        c1 = dict(("|".join(c), 0) for c in combn)
        c2 = dict(("|".join(c), 0) for c in combn)
        indx = dict((c, [tlabels.index(x) for x in c]) for c in combn)
        for row in infile:
            tests = [_run_test(row, i, cond) for i, cond in enumerate(conds)]
            # Count the row once in the group of exactly the columns it
            # passes.  Doing this inside the loop over `combn` counted it
            # len(combn) times, and a row passing no test raised a
            # KeyError on the empty key.
            exact = "|".join([tlabels[n] for n, t in enumerate(tests) if t])
            if exact:
                c1[exact] += 1
            for c in combn:
                c2["|".join(c)] += all([tests[i] for i in indx[c]])
        nsamples = len(col_ind)
        # One summary key per group.  Flattening the tuples label by label
        # lost every multi-label group such as 'A|B'.
        combn = ['|'.join(x) for x in combn]
    elif intype == "Tracks":
        #filenames = kw['TrMulti']['files']
        filenames = kw['files']
        if not isinstance(filenames, (list, tuple)):
            filenames = [filenames]
        for f in filenames:
            assert os.path.exists(f), "File not found: %s ." % f
        tracks = [track(f, chrmeta='guess') for f in filenames]
        nsamples = len(tracks)
        tlabels = [chr(k + 65) for k in range(len(tracks))]  # 'A', 'B', ...
        combn = [combinations(tlabels, k + 1) for k in range(len(tlabels))]
        combn = ['|'.join(sorted(y)) for x in combn for y in x]
        c1 = dict(zip(combn, [0] * len(combn)))
        c2 = dict(zip(combn, [0] * len(combn)))
        total_cov = 0.0
        _scored = (kw.get('type') == 'score')
        chromset = set([c for t in tracks for c in t.chrmeta])
        for chrom in chromset:
            streams = [_add_label(t.read(chrom), tlabels[n])
                       for n, t in enumerate(tracks)]
            s = cobble(concatenate(streams), scored=_scored)
            name_idx = s.fields.index('track_name')
            start_idx = s.fields.index('start')
            end_idx = s.fields.index('end')
            if _scored:
                score_idx = s.fields.index('score')
            for x in s:
                length = x[end_idx] - x[start_idx]
                total_cov += length
                sub = sorted(list(set(x[name_idx].split('|'))))  # avoid 'A|A'
                # Every non-empty sub-group of the tracks covering x.
                cb = [combinations(sub, k) for k in range(1, len(sub) + 1)]
                cb = ['|'.join(sorted(y)) for c in cb for y in c]
                if _scored:
                    c1['|'.join(sub)] += x[score_idx]
                    for c in cb:
                        c2[c] += x[score_idx]
                else:
                    c1['|'.join(sub)] += length
                    for c in cb:
                        c2[c] += length
        if total_cov < 1:
            # Nothing is covered: only leave a note and stop here.
            output = self.temporary_path(fname='venn_summary.txt')
            with open(output, 'w') as summary:
                summary.write("Empty content (no coverage) on %s."
                              % (",".join(chromset)))
            self.new_file(output, 'venn_summary')
            return
        legend = [t.name for t in tracks]
        if _scored:
            for c in combn:
                c2[c] = round(c2[c])
        else:
            # Express covered lengths as percentages of the total coverage.
            for c in combn:
                c2[c] = round((100 * c2[c]) / total_cov)
                c1[c] = (100 * c1[c]) / total_cov
    else:
        raise ValueError("Input type '%s' not supported." % intype)

    if nsamples <= 4:
        format = kw.get('output') or 'pdf'
        output = self.temporary_path(fname='venn_diagram.' + format)
        venn(c2, legend=legend, options=venn_options,
             output=output, format=format)
        self.new_file(output, 'venn_diagram')

    # Text summary
    output = self.temporary_path(fname='venn_summary.txt')
    with open(output, 'w') as summary:
        summary.write("%s\t%s\t%s\n"
                      % ("Group", "Coverage", "Cumulative coverage"))
        record = "%s\t%.2f\t%d\n"
        for c in sorted(combn, key=lambda x: (len(x), x)):
            summary.write(record % (c, c1[c], c2[c]))
    self.new_file(output, 'venn_summary')
    return self.display_time()
def commonTest(self, X, R):
    """Cobble the features `X` and check that the result equals `R`.

    `X` is wrapped in a stream with fields chr, start, end and score; the
    cobbled result is printed before being compared.
    """
    source = fstream(X, fields=['chr', 'start', 'end', 'score'])
    result = list(cobble(source))
    print(result)
    self.assertEqual(result, R)
def __call__(self, **kw):
    """Compute group sizes for a Venn diagram and write diagram + summary.

    Two input types are supported, selected by ``kw['input_type']``
    (default "Table"):

    * "Table": ``kw['table']`` is a text table with a header,
      ``kw['id_columns']`` a comma-separated list of 1-based column numbers
      and ``kw['filters']`` a comma-separated list of conditions (one per
      column, e.g. ``>2 AND <5``).  Rows are counted per group of columns
      whose condition holds.
    * "Tracks": ``kw['TrMulti']['files']`` is one track file name or a list
      of them.  Groups are measured by covered length (as a percentage of
      the total coverage) or, if ``kw['type'] == 'score'``, by summed score.

    For each group (e.g. "A|B"), ``c1`` holds what belongs to exactly that
    group and ``c2`` what belongs to at least that group.  A diagram is
    drawn from ``c2`` when there are at most 4 samples (file format from
    ``kw['format']``, default 'pdf'), and a tab-separated
    'venn_summary.txt' is always written.

    :returns: ``self.display_time()``, or None when the tracks have no
        coverage at all (only a short note is written in that case).
    :raises ValueError: if the input type is neither "Table" nor "Tracks".
    """
    def _parse_logic(string):
        # Turn a filter such as ">2 AND <5" into the template
        # "(%f >2)and(%f <5)", where each %f stands for the tested value.
        # Characters outside [A-Za-z0-9_!=><. ] are stripped first.
        s = re.sub(r'[^\w\d!=><\. ]', '', string)
        s = re.sub(r' OR ', ')or(%f ', s)
        s = re.sub(r' AND ', ')and(%f ', s)
        return "(%f " + s + ")"

    def _run_test(row, indx, cond):
        # Evaluate condition template `cond` on the value found in the
        # `indx`-th selected column of `row`.
        num = float(row[col_ind[indx]])
        # Clamp so that infinities are still formatted as plain numbers.
        num = max(-sys.maxint, min(sys.maxint, num))
        # Use this function's own `cond`: reading the enclosing-scope `c`
        # only worked through Python 2's list-comprehension variable leak.
        num = (num,) * cond.count("%f")
        # NOTE(security): eval() on a filter string; it is only protected
        # by the character stripping done in _parse_logic.
        return eval(cond % num)

    def _add_label(s, x):
        # Append the constant label `x` as a 'track_name' field.
        _f = s.fields + ['track_name']
        return FeatureStream((y + (x,) for y in s), fields=_f)

    venn_options = {}  # tune it here
    tracks = []
    intype = kw.get("input_type") or "Table"
    if intype == "Table":
        s_cols = kw.get('id_columns', '')
        s_filters = kw.get('filters', '')
        infile = track(kw.get('table', ''), format='txt', header=True)
        col_ind = [int(i) - 1 for i in s_cols.split(",")]
        legend = [infile.fields[i] if i < len(infile.fields) else str(i)
                  for i in col_ind]
        conds = [_parse_logic(x) for x in s_filters.split(",")]
        tlabels = [chr(k + 65) for k in range(len(col_ind))]  # 'A', 'B', ...
        # Columns without a filter get an always-true condition.
        conds += ["1"] * (len(col_ind) - len(conds))
        combn = [tuple(sorted(x))
                 for k in range(len(tlabels))
                 for x in combinations(tlabels, k + 1)]
        c1 = dict(("|".join(c), 0) for c in combn)
        c2 = dict(("|".join(c), 0) for c in combn)
        indx = dict((c, [tlabels.index(x) for x in c]) for c in combn)
        for row in infile:
            tests = [_run_test(row, i, cond) for i, cond in enumerate(conds)]
            # Count the row once in the group of exactly the columns it
            # passes.  Doing this inside the loop over `combn` counted it
            # len(combn) times, and a row passing no test raised a
            # KeyError on the empty key.
            exact = "|".join([tlabels[n] for n, t in enumerate(tests) if t])
            if exact:
                c1[exact] += 1
            for c in combn:
                c2["|".join(c)] += all([tests[i] for i in indx[c]])
        nsamples = len(col_ind)
        # One summary key per group.  Flattening the tuples label by label
        # lost every multi-label group such as 'A|B'.
        combn = ['|'.join(x) for x in combn]
    elif intype == "Tracks":
        filenames = kw['TrMulti']['files']
        if not isinstance(filenames, (list, tuple)):
            filenames = [filenames]
        for f in filenames:
            assert os.path.exists(f), "File not found: %s ." % f
        tracks = [track(f, chrmeta='guess') for f in filenames]
        nsamples = len(tracks)
        tlabels = [chr(k + 65) for k in range(len(tracks))]  # 'A', 'B', ...
        combn = [combinations(tlabels, k + 1) for k in range(len(tlabels))]
        combn = ['|'.join(sorted(y)) for x in combn for y in x]
        c1 = dict(zip(combn, [0] * len(combn)))
        c2 = dict(zip(combn, [0] * len(combn)))
        total_cov = 0.0
        _scored = (kw.get('type') == 'score')
        chromset = set([c for t in tracks for c in t.chrmeta])
        for chrom in chromset:
            streams = [_add_label(t.read(chrom), tlabels[n])
                       for n, t in enumerate(tracks)]
            s = cobble(concatenate(streams), scored=_scored)
            name_idx = s.fields.index('track_name')
            start_idx = s.fields.index('start')
            end_idx = s.fields.index('end')
            if _scored:
                score_idx = s.fields.index('score')
            for x in s:
                length = x[end_idx] - x[start_idx]
                total_cov += length
                sub = sorted(list(set(x[name_idx].split('|'))))  # avoid 'A|A'
                # Every non-empty sub-group of the tracks covering x.
                cb = [combinations(sub, k) for k in range(1, len(sub) + 1)]
                cb = ['|'.join(sorted(y)) for c in cb for y in c]
                if _scored:
                    c1['|'.join(sub)] += x[score_idx]
                    for c in cb:
                        c2[c] += x[score_idx]
                else:
                    c1['|'.join(sub)] += length
                    for c in cb:
                        c2[c] += length
        if total_cov < 1:
            # Nothing is covered: only leave a note and stop here.
            output = self.temporary_path(fname='venn_summary.txt')
            with open(output, 'w') as summary:
                summary.write("Empty content (no coverage) on %s."
                              % (",".join(chromset)))
            self.new_file(output, 'venn_summary')
            return
        legend = [t.name for t in tracks]
        if _scored:
            for c in combn:
                c2[c] = round(c2[c])
        else:
            # Express covered lengths as percentages of the total coverage.
            for c in combn:
                c2[c] = round((100 * c2[c]) / total_cov)
                c1[c] = (100 * c1[c]) / total_cov
    else:
        raise ValueError("Input type '%s' not supported." % intype)

    if nsamples <= 4:
        format = kw.get('format') or 'pdf'
        output = self.temporary_path(fname='venn_diagram.' + format)
        venn(c2, legend=legend, options=venn_options,
             output=output, format=format)
        self.new_file(output, 'venn_diagram')

    # Text summary
    output = self.temporary_path(fname='venn_summary.txt')
    with open(output, 'w') as summary:
        summary.write("%s\t%s\t%s\n"
                      % ("Group", "Coverage", "Cumulative coverage"))
        record = "%s\t%.2f\t%d\n"
        for c in sorted(combn, key=lambda x: (len(x), x)):
            summary.write(record % (c, c1[c], c2[c]))
    self.new_file(output, 'venn_summary')
    return self.display_time()
def commonTest(self, X, R):
    """Cobble the features `X` and check that the result equals `R`.

    `X` is wrapped in a stream with fields chr, start, end and score; the
    cobbled result is printed before being compared.
    """
    source = fstream(X, fields=['chr', 'start', 'end', 'score'])
    result = list(cobble(source))
    print(result)
    self.assertEqual(result, R)