def __call__(self, **kw):
    """Annotate every feature of the input track with its nearest gene.

    Options read from *kw*: ``assembly``, ``track`` and the integer
    thresholds ``promoter``, ``intergenic`` and ``UTR`` (each falls back to
    its module default when missing or not convertible to int).

    Writes a text track holding the input fields plus 'gene',
    'location_type' and 'distance', registers it as a 'table' output and
    returns ``self.display_time()``.
    """
    def _int_option(key):
        # None signals "use the default"; the default itself is looked up
        # by the caller only when it is actually needed.
        try:
            return int(kw.get(key))
        except (ValueError, TypeError):
            return None

    assembly = genrep.Assembly(kw.get('assembly') or None)
    source = track(kw.get('track'), chrmeta=assembly.chrmeta)

    thPromot = _int_option("promoter")
    if thPromot is None:
        thPromot = prom_def
    thInter = _int_option("intergenic")
    if thInter is None:
        thInter = inter_def
    thUTR = _int_option("UTR")
    if thUTR is None:
        thUTR = utr_def

    out_path = self.temporary_path(fname=source.name+'_annotated.txt')
    out_fields = source.fields+['gene', 'location_type', 'distance']
    annotated = track(out_path, format='txt', fields=out_fields)
    annotated.make_header("#"+"\t".join(annotated.fields))
    # One chromosome at a time, appended below the header.
    for chrom in assembly.chrnames:
        nearest = getNearestFeature(source.read(selection=chrom),
                                    assembly.gene_track(chrom),
                                    thPromot, thInter, thUTR)
        annotated.write(nearest, mode='append')
    annotated.close()
    self.new_file(out_path, 'table')
    return self.display_time()
def __call__(self, **kw):
    """Apply a numeric function to the scores of one or several tracks.

    Options read from *kw*:
      * ``function``: name of the function applied to each score
        (default "log2").
      * ``SigMulti``: dict whose ``track`` entry is one path or a list.
      * ``format``: output format (default: the input track's format).

    Tracks without a 'score' field are skipped.  One result is registered
    as 'converted_track'; several results are packed into a gzipped tar
    registered as 'converted_track_tar'.  Returns ``self.display_time()``.
    """
    func = kw.get('function', "log2")
    l_track = kw.get('SigMulti', {}).get('track', [])
    if not isinstance(l_track, list):
        l_track = [l_track]
    outall = []
    for tname in l_track:
        tinput = track(tname)
        if 'score' not in tinput.fields:
            # Skipped inputs must be released too.
            tinput.close()
            continue
        out_format = kw.get('format', tinput.format)
        out_name = tinput.name + '_' + func + '.' + out_format
        outtemp = self.temporary_path(out_name)
        out_track = track(outtemp, chrmeta=tinput.chrmeta)
        # Log-like functions cannot take zeros, hence the strict threshold.
        filtered = score_threshold(tinput, strict=(func[:3] == "log"))
        # SECURITY NOTE(review): `func` is a request parameter and is passed
        # to eval(); restrict it to a whitelist of known function names if
        # this can ever be reached with untrusted input.
        out_track.write(apply(filtered, 'score', eval(func)), mode='write')
        out_track.close()
        outall.append(outtemp)
        tinput.close()
    if len(outall) == 1:
        self.new_file(outall[0], 'converted_track')
    elif len(outall) > 1:
        tar_name = self.temporary_path(fname="numeric_operation_out.tgz")
        tar = tarfile.open(tar_name, "w:gz")
        try:
            for f in outall:
                tar.add(f, arcname=os.path.basename(f))
        finally:
            tar.close()
        self.new_file(tar_name, 'converted_track_tar')
    return self.display_time()
def __call__(self, **kw):
    """Smooth a track with a sliding window and register the result.

    Options read from *kw*: ``track``, ``assembly``, ``format`` (default:
    the input format), ``window_size``, ``window_step`` and ``by_feature``
    (boolean or one of the strings '1', 'true', 't', 'on').

    The output is 'qualitative' with the input fields when smoothing by
    feature, otherwise 'quantitative' with chr/start/end/score.
    Returns ``self.display_time()``.
    """
    source = track(kw.get('track'), chrmeta=kw.get('assembly') or None)
    outformat = kw.get('format', source.format)
    wsize = int(kw.get('window_size', size_def) or 10)
    wstep = int(kw.get('window_step', step_def) or 1)
    featurewise = kw.get('by_feature', False)
    if isinstance(featurewise, basestring):
        featurewise = featurewise.lower() in ['1', 'true', 't', 'on']
    output = self.temporary_path(fname=source.name+'_smoothed', ext=outformat)
    if featurewise:
        outfields, datatype = source.fields, "qualitative"
    else:
        outfields, datatype = ["chr", "start", "end", "score"], "quantitative"
    tout = track(output, format=outformat, fields=outfields,
                 chrmeta=source.chrmeta, info={'datatype': datatype})
    for chrom in tout.chrmeta.keys():
        smoothed = window_smoothing(
            source.read(selection=chrom, fields=outfields),
            window_size=wsize, step_size=wstep, featurewise=featurewise)
        tout.write(smoothed, chrom=chrom, clip=True)
    tout.close()
    self.new_file(output, 'smoothed_track')
    return self.display_time()
def __call__(self, **kw):
    """Combine the input tracks together with a whole-chromosome track.

    A temporary track with one feature per chromosome (from 0 to the
    chromosome length, every other field set to '0') is written and put
    first in ``kw['TrackMulti']['tracks']`` before calling ``_combine``
    with ``self._func``.  The result is registered as 'combined'.
    Returns ``self.display_time()``.
    """
    # Create a track with the whole chromosome
    chrmeta = _get_chrmeta(**kw)
    first = track(kw['TrackMulti']['tracks'][0])
    fields = first.fields
    ext = first.format
    has_chr = 'chr' in fields
    coord_fields = ('chr', 'start', 'end') if has_chr else ('start', 'end')
    # One '0' placeholder for every non-coordinate field.
    padding = ('0',)*len([f for f in fields if f not in coord_fields])
    if has_chr:
        rows = [(c, 0, chrmeta[c]['length'])+padding for c in chrmeta]
    else:
        # Coordinates go first to match the row layout built here.
        fields = ['start', 'end']+[f for f in fields if f not in ['start', 'end']]
        rows = [(0, chrmeta[c]['length'])+padding for c in chrmeta]
    whole_chr = FeatureStream(rows, fields=fields)
    temp = self.temporary_path()+'.'+ext
    with track(temp, fields=fields) as wc:
        wc.write(whole_chr)
    kw['TrackMulti']['tracks'] = [temp] + kw['TrackMulti']['tracks']
    output = self.temporary_path(fname='combined.')
    output = _combine(self._func, output, **kw)
    self.new_file(output, 'combined')
    return self.display_time()
def test_chr_loop(self):
    """Read three chromosomes in turn from the BED fixture and write each one out."""
    target = os.path.join(path, 'temp6.txt')
    source = track(self.bed)
    out = track(target, fields=source.fields)
    for chrom in ('chrII', 'chrIII', 'chrIV'):
        out.write(source.read(chrom))
def __call__(self, **kw):
    """Combine the input tracks together with a whole-chromosome track.

    A temporary track with one feature per chromosome (from 0 to the
    chromosome length, every other field set to '0') is written and put
    first in ``kw['TrackMulti']['tracks']`` before calling ``_combine``
    with ``self._func``.  The result is registered as 'combined'.
    Returns ``self.display_time()``.
    """
    # Create a track with the whole chromosome
    chrmeta = _get_chrmeta(**kw)
    first = track(kw['TrackMulti']['tracks'][0])
    fields = first.fields
    ext = first.format
    has_chr = 'chr' in fields
    coord_fields = ('chr', 'start', 'end') if has_chr else ('start', 'end')
    # One '0' placeholder for every non-coordinate field.
    padding = ('0',) * len([f for f in fields if f not in coord_fields])
    if has_chr:
        rows = [(c, 0, chrmeta[c]['length']) + padding for c in chrmeta]
    else:
        # Coordinates go first to match the row layout built here.
        fields = ['start', 'end'] + [f for f in fields if f not in ['start', 'end']]
        rows = [(0, chrmeta[c]['length']) + padding for c in chrmeta]
    whole_chr = FeatureStream(rows, fields=fields)
    temp = self.temporary_path() + '.' + ext
    with track(temp, fields=fields) as wc:
        wc.write(whole_chr)
    kw['TrackMulti']['tracks'] = [temp] + kw['TrackMulti']['tracks']
    output = self.temporary_path(fname='combined.')
    output = _combine(self._func, output, **kw)
    self.new_file(output, 'combined')
    return self.display_time()
def run_wellington(ex, tests, names, assembly, via, logfile):
    """Run Wellington on every test sample, one job per chromosome.

    Each item of *tests* is indexed as (bed, bam[, margin]).  For every
    chromosome of *assembly* the bed regions (widened by ``margin`` on both
    sides when given) are fused and written to a temporary bed file; a
    non-blocking Wellington job is submitted for each non-empty file.
    All jobs are then waited for and the results handed to
    ``save_wellington``, whose return value is returned.
    """
    def _log(message):
        logfile.write(message)
        logfile.flush()

    futures = {}
    _log("Running Wellington:\n")
    wellout = {}
    for nbam, bed_bam in enumerate(tests):
        name = names['tests'][nbam]
        wellout[name] = []
        tbed = track(bed_bam[0])
        for chrom in assembly.chrnames:
            _chrombed = unique_filename_in()
            with track(_chrombed, format="bed", fields=tbed.fields) as _tt:
                regions = tbed.read(chrom)
                if len(bed_bam) > 2:
                    regions = neighborhood(regions,
                                           before_start=bed_bam[2],
                                           after_end=bed_bam[2])
                _tt.write(fusion(regions), clip=True)
            # Nothing to submit for chromosomes without any region.
            if os.path.getsize(_chrombed) > 0:
                futures[(chrom, name)] = wellington.nonblocking(
                    ex, _chrombed, bed_bam[1], via=via, memory=8)
    for (chrom, name), _fut in futures.iteritems():
        _log(name[1]+" "+chrom+", ")
        wellout[name].append(_fut.wait())
    _log("\n")
    return save_wellington(ex, wellout, assembly.chrmeta)
def test_chr_loop(self):
    """Read three chromosomes in turn from the BED fixture and write each one out."""
    target = os.path.join(path, 'temp6.txt')
    source = track(self.bed)
    out = track(target, fields=source.fields)
    for chrom in ('chrII', 'chrIII', 'chrIV'):
        out.write(source.read(chrom))
def __call__(self, **kw):
    """Apply a numeric function to the scores of one or several tracks.

    Options read from *kw*:
      * ``function``: name of the function applied to each score
        (default "log2").
      * ``track``: one path or a list of paths.
      * ``output``: output format (default: the input track's format).

    Tracks without a 'score' field are skipped.  One result is registered
    as 'converted_track'; several results are packed into a gzipped tar
    registered as 'converted_track_tar'.  Returns ``self.display_time()``.
    """
    func = kw.get('function', "log2")
    l_track = kw.get('track', [])
    if not isinstance(l_track, list):
        l_track = [l_track]
    outall = []
    for tname in l_track:
        tinput = track(tname)
        if 'score' not in tinput.fields:
            # Skipped inputs must be released too.
            tinput.close()
            continue
        out_format = kw.get('output', tinput.format)
        out_name = tinput.name+'_'+func+'.'+out_format
        outtemp = self.temporary_path(out_name)
        out_track = track(outtemp, chrmeta=tinput.chrmeta)
        # Log-like functions cannot take zeros, hence the strict threshold.
        filtered = score_threshold(tinput, strict=(func[:3] == "log"))
        # SECURITY NOTE(review): `func` is a request parameter and is passed
        # to eval(); restrict it to a whitelist of known function names if
        # this can ever be reached with untrusted input.
        out_track.write(apply(filtered, 'score', eval(func)), mode='write')
        out_track.close()
        outall.append(outtemp)
        tinput.close()
    if len(outall) == 1:
        self.new_file(outall[0], 'converted_track')
    elif len(outall) > 1:
        tar_name = self.temporary_path(fname="numeric_operation_out.tgz")
        tar = tarfile.open(tar_name, "w:gz")
        try:
            for f in outall:
                tar.add(f, arcname=os.path.basename(f))
        finally:
            tar.close()
        self.new_file(tar_name, 'converted_track_tar')
    return self.display_time()
def __call__(self, **kw):
    """Write the overlap of a features track with a filter track.

    Options read from *kw*: ``assembly`` (optional; chromosome metadata is
    guessed without it), ``features``, ``filter`` and ``format`` (default:
    the format of the features track).

    For each chromosome the result of ``overlap(features, filter)`` is
    written to a 'qualitative' track registered as 'filtered'.
    Returns ``self.display_time()``.
    """
    # Set assembly
    chrmeta = "guess"
    assembly_id = kw.get('assembly')
    if assembly_id:
        chrmeta = genrep.Assembly(assembly_id).chrmeta
    # Set features track
    features = track(kw['features'], chrmeta=chrmeta or None)
    chrmeta = features.chrmeta
    # Set filter track
    filter_track = track(kw.get('filter'), chrmeta=chrmeta or None)
    # Main
    out_format = kw.get('format', features.format)
    output = self.temporary_path(fname=features.name + '_filtered.' + out_format)
    tout = track(output, out_format, fields=filter_track.fields,
                 chrmeta=chrmeta, info={'datatype': 'qualitative'})
    for chrom in chrmeta:
        kept = overlap(features.read(chrom), filter_track.read(chrom))
        tout.write(kept, chrom=chrom, clip=True)
    tout.close()
    self.new_file(output, 'filtered')
    return self.display_time()
def test_get_chrmeta(self):
    """chrmeta comes from the given assembly, or is guessed from the file."""
    from_assembly = track(self.bed, chrmeta=self.assembly)
    self.assertEqual(from_assembly.chrmeta['chrV'],
                     {'length': 576869, 'ac': '2508_NC_001137.2'})
    # "guess"
    guessed = track(self.bed, chrmeta="guess")
    expected = {
        'chrII': {'length': 607135},
        'chrIII': {'length': 178216},
        'chrIV': {'length': 1402556},
    }
    self.assertEqual(guessed.chrmeta, expected)
def create_tracks(ex, outall, sample_names, assembly):
    """Write BED tracks showing SNPs found in each sample.

    *outall* is read as a text table with the columns chromosome, position,
    reference, one column per sample, then gene, location_type, distance.
    For every row, each sample whose call starts with a character other
    than the reference gets a one-base feature named "<ref>><call>".
    Each sample's '.bed.gz' file is added to *ex* with a description.
    """
    infields = ['chromosome', 'position', 'reference']+sample_names \
               + ['gene', 'location_type', 'distance']
    intrack = track(outall, format='text', fields=infields,
                    chrmeta=assembly.chrmeta, intypes={'position': int})
    # The three annotation columns at the end are not needed here.
    instream = intrack.read(fields=infields[:-3])

    outtracks = {}
    for sample_name in sample_names:
        out = unique_filename_in()+'.bed.gz'
        bed = track(out, fields=['name'])
        bed.make_header(name=sample_name+"_SNPs")
        outtracks[sample_name] = (bed, out)

    def _variant(row, ref, column):
        # Only the first character of the sample's call is compared.
        if row[3+column][0] == ref:
            return None
        return "%s>%s" % (ref, row[3+column][0])

    for row in instream:
        coord = (row[0], row[1]-1, row[1])
        ref = row[2]
        calls = {}
        for column, sample in enumerate(sample_names):
            calls[sample] = _variant(row, ref, column)
        for name, (bed, _) in outtracks.iteritems():
            if calls[name]:
                bed.write([coord+(calls[name],)], mode='append')

    for name, (bed, filename) in outtracks.iteritems():
        bed.close()
        description = set_file_descr(name+"_SNPs.bed.gz", type='bed',
                                     step='tracks', gdv='1', ucsc='1')
        ex.add(filename, description=description)
def run_wellington(ex, tests, names, assembly, via, logfile):
    """Run Wellington on every test sample, one job per chromosome.

    Each item of *tests* is indexed as (bed, bam[, margin]).  For every
    chromosome of *assembly* the bed regions (widened by ``margin`` on both
    sides when given) are fused and written to a temporary bed file; a
    non-blocking Wellington job is submitted for each non-empty file.
    All jobs are then waited for and the results handed to
    ``save_wellington``, whose return value is returned.
    """
    def _log(message):
        logfile.write(message)
        logfile.flush()

    futures = {}
    _log("Running Wellington:\n")
    wellout = {}
    for nbam, bed_bam in enumerate(tests):
        name = names['tests'][nbam]
        wellout[name] = []
        tbed = track(bed_bam[0])
        for chrom in assembly.chrnames:
            _chrombed = unique_filename_in()
            with track(_chrombed, format="bed", fields=tbed.fields) as _tt:
                regions = tbed.read(chrom)
                if len(bed_bam) > 2:
                    regions = neighborhood(regions,
                                           before_start=bed_bam[2],
                                           after_end=bed_bam[2])
                _tt.write(fusion(regions), clip=True)
            # Nothing to submit for chromosomes without any region.
            if os.path.getsize(_chrombed) > 0:
                futures[(chrom, name)] = wellington.nonblocking(
                    ex, _chrombed, bed_bam[1], via=via, memory=8)
    for (chrom, name), _fut in futures.iteritems():
        _log(name[1] + " " + chrom + ", ")
        wellout[name].append(_fut.wait())
    _log("\n")
    return save_wellington(ex, wellout, assembly.chrmeta)
def __call__(self, **kw):
    """Smooth a track with a sliding window and register the result.

    Options read from *kw*: ``track``, ``assembly``, ``output`` (output
    format, default: the input format), ``window_size``, ``window_step``
    and ``by_feature`` (boolean or one of '1', 'true', 't', 'on').

    The output is 'qualitative' with the input fields when smoothing by
    feature, otherwise 'quantitative' with chr/start/end/score.
    Returns ``self.display_time()``.
    """
    source = track(kw.get("track"), chrmeta=kw.get("assembly") or None)
    outformat = kw.get("output", source.format)
    wsize = int(kw.get("window_size", size_def) or 10)
    wstep = int(kw.get("window_step", step_def) or 1)
    featurewise = kw.get("by_feature", False)
    if isinstance(featurewise, basestring):
        featurewise = featurewise.lower() in ["1", "true", "t", "on"]
    output = self.temporary_path(fname=source.name + "_smoothed", ext=outformat)
    if featurewise:
        outfields, datatype = source.fields, "qualitative"
    else:
        outfields, datatype = ["chr", "start", "end", "score"], "quantitative"
    tout = track(output, format=outformat, fields=outfields,
                 chrmeta=source.chrmeta, info={"datatype": datatype})
    for chrom in tout.chrmeta.keys():
        smoothed = window_smoothing(
            source.read(selection=chrom, fields=outfields),
            window_size=wsize, step_size=wstep, featurewise=featurewise)
        tout.write(smoothed, chrom=chrom, clip=True)
    tout.close()
    self.new_file(output, "smoothed_track")
    return self.display_time()
def fimo(motifs,fasta,qval=True): # Run Fimo if qval: options = "--max-stored-scores 1000000 --verbosity 1 --thresh 0.01 --qv-thresh" else: options = "--max-stored-scores 1000000 --verbosity 1 --thresh 0.000001" cmd = "fimo " + options + " %s %s" % (motifs, fasta) print "Running >>",cmd os.system(cmd) os.system("sort -k2,2n -k3,3n -k4,4n fimo_out/fimo.txt > fimo.txt") # Bed output t = track('fimo.txt', fields=["name","chr","start","end","strand","score","p-value","q-value","sequence"]) t.fields = ["name","chr","start","end","strand","a","score","q","sequence"] s = t.read() s = select(s,['chr','start','end','name','score','strand']) s = apply(s,'chr',lambda x:x.split('|')[1]) s = sorted_stream(s) s = cobble(s) s = apply(s,'name',lambda x:'|'.join(list(set(x.split('|'))))) outname = 'fimo.bed' bed = track(outname,fields=s.fields) bed.make_header(name="TSS_motifs", description="Motifs +-XKb around TSS", mode='overwrite') bed.write(s) if os.path.exists("fimo_out"): shutil.rmtree("fimo_out")
def fimo(motifs, fasta, qval=True): # Run Fimo if qval: options = "--max-stored-scores 1000000 --verbosity 1 --thresh 0.01 --qv-thresh" else: options = "--max-stored-scores 1000000 --verbosity 1 --thresh 0.000001" cmd = "fimo " + options + " %s %s" % (motifs, fasta) print "Running >>", cmd os.system(cmd) os.system("sort -k2,2n -k3,3n -k4,4n fimo_out/fimo.txt > fimo.txt") # Bed output t = track('fimo.txt', fields=[ "name", "chr", "start", "end", "strand", "score", "p-value", "q-value", "sequence" ]) t.fields = [ "name", "chr", "start", "end", "strand", "a", "score", "q", "sequence" ] s = t.read() s = select(s, ['chr', 'start', 'end', 'name', 'score', 'strand']) s = apply(s, 'chr', lambda x: x.split('|')[1]) s = sorted_stream(s) s = cobble(s) s = apply(s, 'name', lambda x: '|'.join(list(set(x.split('|'))))) outname = 'fimo.bed' bed = track(outname, fields=s.fields) bed.make_header(name="TSS_motifs", description="Motifs +-XKb around TSS", mode='overwrite') bed.write(s) if os.path.exists("fimo_out"): shutil.rmtree("fimo_out")
def __call__(self, **kw):
    """Draw a genome-wide graph of signal and feature tracks into a PDF.

    Options read from *kw*: ``assembly`` (default 'guess') and the dicts
    ``SigMultiP``, ``SigMultiM`` and ``FeatMulti`` holding respectively
    ``signals_plus``, ``signals_minus`` and ``features`` (one path or a
    list of paths).  Paths that do not exist on disk are ignored.

    Raises ValueError("No data provided") when no input file could be
    opened.  Registers the PDF as 'genome_graph' and returns
    ``self.display_time()``.
    """
    def _as_list(value):
        return value if isinstance(value, list) else [value]

    assembly = kw.get('assembly') or 'guess'
    signals_plus = _as_list(kw.get('SigMultiP',{}).get('signals_plus', []))
    signals_minus = _as_list(kw.get('SigMultiM',{}).get('signals_minus', []))
    features = _as_list(kw.get('FeatMulti',{}).get('features', []))
    sptracks = [track(sig,chrmeta=assembly) for sig in signals_plus if os.path.exists(sig)]
    smtracks = [track(sig,chrmeta=assembly) for sig in signals_minus if os.path.exists(sig)]
    ftracks = [track(feat,chrmeta=assembly) for feat in features if os.path.exists(feat)]
    snames = [t.name for t in sptracks+smtracks+ftracks]
    # Chromosome metadata comes from the first track actually opened.  The
    # test is on the opened tracks, not on the requested paths: requested
    # feature files that do not exist leave `ftracks` empty.
    for group in (sptracks, smtracks, ftracks):
        if group:
            chrmeta = group[0].chrmeta
            break
    else:
        raise ValueError("No data provided")
    if assembly in [x[0] for x in genrep.GenRep().assemblies_available()]:
        chrnames = genrep.Assembly(assembly).chrnames
    else:
        # Unknown assembly: order chromosomes by decreasing length.
        by_length = sorted([(v['length'],c) for c,v in chrmeta.iteritems()],reverse=True)
        chrnames = [x[1] for x in by_length]
    pdf = self.temporary_path(fname='genome_graph.pdf')
    _fs = ['chr','start','end','score']
    _ff = ['chr','start','end','name']
    genomeGraph([(c,chrmeta[c]['length']) for c in chrnames],
                [sig.read(fields=_fs) for sig in sptracks],
                [sig.read(fields=_fs) for sig in smtracks],
                [feat.read(fields=_ff) for feat in ftracks],
                output=pdf, new=True, last=True, legend=snames)
    self.new_file(pdf, 'genome_graph')
    return self.display_time()
def test_make_header(self):
    """A track rewritten with the source's header line reads back identically."""
    source = track(self.bed)
    copy = track("temp.bed",fields=source.fields)
    source.open()
    header_line = source.filehandle.readline()
    copy.make_header(header_line) # copy the header
    source.filehandle.seek(0)
    copy.write(source.read())
    copy.close()
    expected = list(source.read())
    self.assertEqual(expected,list(copy.read()))
def __call__(self, **kw):
    """Plot paired-end fragment statistics and optionally write fragment tracks.

    Options read from *kw*: ``format`` (default "sql"), ``BamMulti`` (dict
    whose ``bamfiles`` entry is one path or a list/tuple of paths),
    ``midpoint`` and ``plot_only`` (booleans or one of the strings '1',
    'true', 't', 'on').

    For each BAM file the statistics are computed chromosome by chromosome
    and plotted into a single PDF; unless ``plot_only`` is set, a
    'quantitative' fragment track is written too.  One track is registered
    as 'fragment_track', several as a gzipped tar 'fragment_track_tar';
    the PDF is registered as 'statistics_plot'.

    Raises ValueError when a BAM file yields at most one fragment.
    Returns ``self.display_time()``.
    """
    def _as_bool(value):
        if isinstance(value, basestring):
            return value.lower() in ['1', 'true', 't', 'on']
        return value

    _f = ['start', 'end', 'score']
    format = kw.get("format") or "sql"
    bamfiles = kw.get('BamMulti', {}).get('bamfiles', [])
    if not isinstance(bamfiles, (tuple, list)):
        bamfiles = [bamfiles]
    bamfiles = [track(bam) for bam in bamfiles]
    all_tracks = []
    pdf = self.temporary_path(fname='Paired_end_plots.pdf')
    robjects.r('pdf("%s",paper="a4",height=11,width=8)' % pdf)
    midpoint = _as_bool(kw.get("midpoint", False))
    plot_only = _as_bool(kw.get("plot_only", False))
    try:
        for bam in bamfiles:
            if not plot_only:
                tname = "%s_frags.%s" % (bam.name, format)
                outname = self.temporary_path(fname=tname)
                all_tracks.append(outname)
                trout = track(outname, fields=_f, chrmeta=bam.chrmeta,
                              info={'datatype': 'quantitative',
                                    'PE_midpoint': midpoint})
            # Statistics are accumulated per BAM file by _compute_stats.
            self.frag_rep = {}
            self.frag_size = {}
            self.nb_frag = 0
            for chrom, cval in bam.chrmeta.iteritems():
                self._compute_stats(bam.fetch(chrom, 0, cval['length']))
                if not plot_only:
                    trout.write(bam.PE_fragment_size(chrom, midpoint=midpoint),
                                fields=_f, chrom=chrom)
            if not plot_only:
                trout.close()
            if self.nb_frag > 1:
                self._plot_stats(bam.name)
            else:
                raise ValueError("No paired-end found in %s" % bam.name)
    finally:
        # Always release the R graphics device, even when a file fails.
        robjects.r('dev.off()')
    if not plot_only:
        if len(all_tracks) > 1:
            tarname = self.temporary_path(fname='PE_fragment_tracks.tgz')
            tar_tracks = tarfile.open(tarname, "w:gz")
            try:
                for f in all_tracks:
                    tar_tracks.add(f, arcname=os.path.basename(f))
            finally:
                tar_tracks.close()
            self.new_file(tarname, 'fragment_track_tar')
        elif all_tracks:
            # No BAM file given means there is no track to register.
            self.new_file(all_tracks[0], 'fragment_track')
    self.new_file(pdf, 'statistics_plot')
    return self.display_time()
def test_make_header(self):
    """A track rewritten with the source's header line reads back identically."""
    source = track(self.bed)
    copy = track("temp.bed", fields=source.fields)
    source.open()
    header_line = source.filehandle.readline()
    copy.make_header(header_line)  # copy the header
    source.filehandle.seek(0)
    copy.write(source.read())
    copy.close()
    expected = list(source.read())
    self.assertEqual(expected, list(copy.read()))
def read(*args, **kw):
    """Print the content, or a description, of one or several track files.

    Positional arguments are the input file paths.  Options read from *kw*:
      * ``selection``: a JSON dict, a ``chr:start-end`` region, or a
        comma-separated list passed on as the read selection.
      * ``fields``: comma-separated field names.
      * ``output``: output file path, or None to write to stdout as text.
      * ``format``: input format.
      * ``description``: when true, write a short summary of each file
        instead of its content.
    *kw* is also passed to ``_get_chrmeta``.

    Returns 0.  Raises ``Usage`` when no input file is given.
    """
    if len(args) < 1:
        raise Usage("No input file provided")

    selection = None
    raw_selection = kw['selection']
    if raw_selection:
        if raw_selection.count("{"):
            jsonargs = json.loads(raw_selection)
            # Plain `str` keys and string values instead of unicode.
            selection = dict(
                (str(k), str(v) if isinstance(v, basestring) else v)
                for k, v in jsonargs.iteritems())
        elif raw_selection.count(":"):
            chrom, coord = raw_selection.split(':')
            start, end = coord.split('-')
            bounds = (int(start), int(end))
            selection = {'chr': chrom, 'start': bounds, 'end': bounds}
        else:
            selection = str(raw_selection).split(",")

    fields = None
    outformat = None
    if kw['fields']:
        fields = str(kw['fields']).split(",")

    to_stdout = kw['output'] is None
    if to_stdout:
        output = sys.stdout
        outformat = "txt"
    else:
        output = open(kw['output'], 'w')

    try:
        chrmeta = _get_chrmeta(**kw)
        for infile in args:
            intrack = track.track(infile, format=kw['format'], chrmeta=chrmeta)
            try:
                if kw['description']:
                    if intrack.info:
                        fileinfo = ",".join(
                            ["%s=%s" % (k, v) for k, v in intrack.info.iteritems()])
                    else:
                        fileinfo = 'None'
                    chromlist = ",".join(sorted(intrack.chrmeta.keys())) or "None"
                    # Separate name: the `fields` option must stay intact.
                    fieldlist = ",".join(intrack.fields)
                    output.write(
                        "# *****************************************\n"
                        "# File '%s' (%s):\n"
                        "# Infos: %s\n"
                        "# Chromosomes: %s\n"
                        "# Fields: %s\n"
                        "# *****************************************\n"
                        % (os.path.basename(infile), intrack.format,
                           fileinfo, chromlist, fieldlist))
                else:
                    with track.track(output, format=outformat, fields=fields) as _tout:
                        _tout.write(intrack.read(selection=selection, fields=fields))
            finally:
                intrack.close()
    finally:
        # Only close what was opened here; stdout belongs to the caller.
        if not to_stdout:
            output.close()
    return 0
def merge_junc_files(trackList,assembly):
    """Merge several junction files into the text track 'all.junc'.

    For each chromosome of *assembly* (selected by a name built as
    "<c[0]>_<c[1]>.<c[2]>" from the entries of ``Assembly.chromosomes``),
    the matching rows of every file in *trackList* are concatenated,
    grouped by (chr, start, end) with their scores summed, and appended
    to the output.
    """
    from bbcflib.genrep import Assembly
    fields = ['chr','start','end','strand','score']
    out = track('all.junc',format='txt',fields=list(fields))
    for c in Assembly(assembly).chromosomes:
        chrom = str(c[0])+'_'+c[1]+'.'+str(c[2])
        # Each track gets its own copy of the field list.
        streams = [track(t,fields=list(fields),format='txt').read(chrom)
                   for t in trackList]
        merged = concatenate(streams,group_by=['chr','start','end'],
                             aggregate={'score':sum})
        out.write(merged,mode='append')
    # Release the output track once every chromosome has been appended.
    out.close()
def sort(*args,**kw):
    """Write a sorted copy of each input track.

    Positional arguments are the input file paths.  Options read from *kw*:
    ``format`` (input format), ``output`` (output path; default
    "<name>_sorted.<format>") and ``chromosomes`` (JSON text giving the
    chromosome names handed to ``sorted_stream``).  *kw* is also passed to
    ``_get_chrmeta``.

    Returns 0.  Raises ``Usage`` when no input file is given.
    """
    if len(args) < 1:
        raise Usage("No input file provided")
    chrmeta = _get_chrmeta(**kw)
    for infile in args:
        intrack = track.track(infile,format=kw['format'],chrmeta=chrmeta)
        outname = kw['output'] or intrack.name+'_sorted.'+intrack.format
        outtrack = track.track(outname, chrmeta=intrack.chrmeta)
        try:
            instream = intrack.read()
            s = sorted_stream(instream, chrnames=json.loads(kw['chromosomes']))
            outtrack.write(s)
        finally:
            # Both tracks are released, even if sorting or writing fails.
            outtrack.close()
            intrack.close()
    return 0
def sort(*args, **kw):
    """Write a sorted copy of each input track.

    Positional arguments are the input file paths.  Options read from *kw*:
    ``format`` (input format), ``output`` (output path; default
    "<name>_sorted.<format>") and ``chromosomes`` (JSON text giving the
    chromosome names handed to ``sorted_stream``).  *kw* is also passed to
    ``_get_chrmeta``.

    Returns 0.  Raises ``Usage`` when no input file is given.
    """
    if len(args) < 1:
        raise Usage("No input file provided")
    chrmeta = _get_chrmeta(**kw)
    for infile in args:
        intrack = track.track(infile, format=kw['format'], chrmeta=chrmeta)
        outname = kw['output'] or intrack.name + '_sorted.' + intrack.format
        outtrack = track.track(outname, chrmeta=intrack.chrmeta)
        try:
            instream = intrack.read()
            s = sorted_stream(instream, chrnames=json.loads(kw['chromosomes']))
            outtrack.write(s)
        finally:
            # Both tracks are released, even if sorting or writing fails.
            outtrack.close()
            intrack.close()
    return 0
def test_read(self):
    """read() yields a FeatureStream, for plain and gzipped files alike."""
    plain = track(self.bed)
    stream = plain.read()
    self.assertIsInstance(stream, FeatureStream)
    first_streamed = stream.next()
    first_line = plain.readline()
    self.assertEqual(first_streamed,first_line)
    # zipped file
    zipped = track(self.bed+'.gz')
    stream = zipped.read()
    stream.next()
    self.assertIsInstance(stream, FeatureStream)
def __call__(self, **kw):
    """Draw an MA-plot comparing two groups of samples.

    With ``input_type == 'Table'`` the scores come from the file given in
    ``table``; its first line holds the column names and its first column
    is not a score.  Two score columns are taken as one sample each;
    otherwise the score columns must be named <group_name>.<run_id> with
    exactly two group names.

    For any other input type a table is built with QuantifyTablePlugin
    from the tracks in ``signals1`` and ``signals2`` (one path or a
    list/tuple each), using 'sum' as score operation.

    Column numbers passed to ``MAplot`` count from 1, as in the
    three-column case where the scores are columns 2 and 3.

    Raises ValueError when the table columns cannot be split into two
    groups.  Registers the plot as 'MA-plot' and returns
    ``self.display_time()``.
    """
    if kw.get('input_type') == 'Table':
        table = kw.get('table')
        assert os.path.exists(str(table)), "File not found: '%s'" % table
        with open(table) as t:
            colnames = t.readline()
        _f = colnames.strip().split()
        nscores = len(_f)-1
        # Group prefixes of the score columns, in order of first appearance.
        groups = []
        for x in _f[1:]:
            prefix = x.split('.')[0]
            if prefix not in groups:
                groups.append(prefix)
        if nscores == 2: # 3 columns, cols 2 and 3 contain the scores
            sample1 = [2]
            sample2 = [3]
        elif len(groups) == 2: # more columns, look if there are two groups of prefixes
            # The first score column is column 2.
            sample1 = [col for col, x in enumerate(_f[1:], 2) if x.split('.')[0] == groups[0]]
            sample2 = [col for col, x in enumerate(_f[1:], 2) if x.split('.')[0] == groups[1]]
        else: # not implemented yet, ask the user to choose the columns he wants? Checkboxes...
            raise ValueError("For the moment, either have only 2 columns of scores, "
                             "or use names of the form <group_name>.<run_id>")
    else:
        # Use QuantifyTablePlugin to build a table from score tracks
        from QuantifyTable import QuantifyTablePlugin
        # Set QuantifyTablePlugin options
        kw['score_op'] = 'sum'
        kw['format'] = 'txt'
        signals1 = kw['signals1']
        signals2 = kw['signals2']
        if not isinstance(signals1,(list,tuple)): signals1 = [signals1]
        if not isinstance(signals2,(list,tuple)): signals2 = [signals2]
        # list() lets a tuple and a list be concatenated.
        kw['signals'] = list(signals1) + list(signals2)
        signals = kw['signals']
        nscores = len(signals)
        qtable = QuantifyTablePlugin().quantify(**kw)
        # Remove useless fields and add header based on file names
        _f = ['score'+str(i) for i in range(nscores)]
        qtable = track(qtable, format='txt', fields=['chr','start','end','name']+_f)
        table = self.temporary_path('scores_table.txt')
        strack = track(table, fields=['name']+_f)
        signames = [track(s).name for s in signals]
        # One header cell per score column, whatever the number of signals.
        strack.write([tuple(['Name']+signames)])
        strack.write(qtable.read(fields=strack.fields))
        # Column 1 is the name; group 1 follows, then group 2.
        sample1 = range(2, 2+len(signals1))
        sample2 = range(2+len(signals1), 2+nscores)
    output_filename = MAplot(table, cols={1:sample1, 2:sample2})
    output = self.temporary_path(fname='maplot.png')
    shutil.move(output_filename,output)
    self.new_file(output, 'MA-plot')
    return self.display_time()
def merge(*args,**kw): if not(kw['forward'] and os.path.exists(kw['forward'])): raise Usage("Specify a valid forward strand density file with -f.") if not(kw['reverse'] and os.path.exists(kw['reverse'])): raise Usage("Specify a valid reverse strand density file with -r.") if not(kw['output']): raise Usage("Specify the output file name.") def _shift(stream,shift): istart = stream.fields.index('start') iend = stream.fields.index('end') i1 = min(istart,iend) i2 = max(istart,iend) def _apply_shift(x): return x[:i1]+(x[i1]+shift,)+x[i1+1:i2]+(x[i2]+shift,)+x[i2+1:] return track.FeatureStream((_apply_shift(x) for x in stream), fields=stream.fields) fields = ['chr','start','end','score'] chrmeta = _get_chrmeta(**kw) tfwd = track.track(kw['forward'],format=kw['formatf'],chrmeta=chrmeta) trev = track.track(kw['reverse'],format=kw['formatr'],chrmeta=chrmeta) if tfwd.chrmeta: chrmeta = tfwd.chrmeta elif trev.chrmeta: chrmeta = trev.chrmeta else: raise Usage("Specify an assembly with -a.") shiftval = int(kw['shift']) if shiftval < 0: slim = 300 chrsize,chrom = sorted([(v['length'],k) for k,v in chrmeta.iteritems()],reverse=True)[0] xcor = correlation([tfwd.read(chrom),trev.read(chrom)], (1,chrsize),limits=(-slim,slim)) shiftval = (xcor.argmax()-slim-1)/2 print "Autocorrelation shift=%i, correlation is %f." %(shiftval,xcor.max()) tout = track.track(kw['output'],fields=fields, chrmeta=chrmeta,info={'datatype':'quantitative'}) mode = 'write' method = kw.get("method","mean") for chrom in chrmeta.keys(): tout.write(merge_scores([_shift(tfwd.read(chrom), shiftval), _shift(trev.read(chrom),-shiftval)], method=method), chrom=chrom,mode=mode,clip=True) mode = 'append' tout.close() trev.close() tfwd.close() return 0
def __call__(self,**kw):
    """Write the ratio of two score tracks, optionally with its distribution.

    Options read from *kw*: ``assembly`` (default 'guess'), ``numerator``,
    ``denominator``, ``output`` (format, default: the numerator's format),
    ``window_size``, ``log``, ``pseudo``, ``threshold`` and
    ``distribution``.  ``log`` and ``distribution`` accept a boolean or one
    of '1', 'true', 't', 'on'; ``pseudo`` and ``threshold`` fall back to
    the module defaults when missing or not convertible to float.

    Both tracks are smoothed when the window size exceeds 1, then merged
    with ``self._divide``.  The ratios track is registered as 'ratios';
    with ``distribution`` a density boxplot PDF is registered as 'boxplot'.
    Returns ``self.display_time()``.
    """
    def _as_bool(value):
        if isinstance(value, basestring):
            return value.lower() in ['1', 'true', 't','on']
        return value

    assembly = kw.get('assembly') or 'guess'
    t1 = track(kw['numerator'],chrmeta=assembly)
    t2 = track(kw['denominator'],chrmeta=assembly)
    out_format = kw.get('output') or t1.format
    wsize = int(kw.get('window_size') or size_def)
    self.log = _as_bool(kw.get('log',False))
    # Only conversion failures select the default; anything else propagates.
    try:
        self.pseudo = float(kw.get('pseudo'))
    except (ValueError, TypeError):
        self.pseudo = pseudo_def
    self.baseline = -log(self.pseudo,2)
    try:
        self.threshold = float(kw.get('threshold'))
    except (ValueError, TypeError):
        self.threshold = threshold_def
    distribution = _as_bool(kw.get('distribution',False))
    if distribution:
        genome_length = sum((v['length'] for v in t1.chrmeta.values()))
        self.shifts = list(poisson(float(genome_length)/float(self.sample_num),self.sample_num))
        self.ratios = []
    output = self.temporary_path(fname='ratios_%s-%s.%s'%(t1.name,t2.name,out_format))
    with track(output, chrmeta=t1.chrmeta, fields=t1.fields,
               info={'datatype': 'quantitative',
                     'log': self.log,
                     'pseudocounts': self.pseudo,
                     'threshold': self.threshold,
                     'window_size': wsize}) as tout:
        for chrom,vchr in t1.chrmeta.iteritems():
            if wsize > 1:
                s1 = window_smoothing(t1.read(chrom),window_size=wsize,step_size=1,featurewise=False)
                s2 = window_smoothing(t2.read(chrom),window_size=wsize,step_size=1,featurewise=False)
            else:
                s1 = t1.read(chrom)
                s2 = t2.read(chrom)
            s3 = merge_scores([s1,s2],method=self._divide)
            if distribution:
                s3 = FeatureStream(self._sample_stream(s3,vchr['length']),fields=s3.fields)
            tout.write(s3, chrom=chrom, clip=True)
    self.new_file(output, 'ratios')
    if distribution:
        pdf = self.temporary_path(fname='%s-%s_ratios_distribution.pdf'%(t1.name,t2.name))
        density_boxplot(self.ratios,output=pdf,
                        name="%s/%s (median=%.2f)" %(t1.name,t2.name,median(self.ratios)))
        self.new_file(pdf, 'boxplot')
    return self.display_time()
def test_read(self):
    """read() yields a FeatureStream, for plain and gzipped files alike."""
    plain = track(self.bed)
    stream = plain.read()
    self.assertIsInstance(stream, FeatureStream)
    first_streamed = stream.next()
    first_line = plain.readline()
    self.assertEqual(first_streamed, first_line)
    # zipped file
    zipped = track(self.bed + '.gz')
    stream = zipped.read()
    stream.next()
    self.assertIsInstance(stream, FeatureStream)
def __call__(self, **kw):
    """Merge a forward and a reverse strand density into one track.

    Options read from *kw*: ``assembly`` (default 'guess'), ``forward``,
    ``reverse``, ``shift`` (default 0), ``format`` (default: the forward
    track's format) and ``method`` (default "mean").

    The forward density is shifted by +shift and the reverse one by
    -shift before merging.  A negative ``shift`` asks for automatic
    detection from the cross-correlation of both strands; ValueError is
    raised when no usable shift is found.  The output keeps the fields
    common to both inputs and is registered as 'density_merged'.
    Returns ``self.display_time()``.
    """
    def _shift(stream, shift):
        # Add `shift` to the start and end of every feature of `stream`.
        istart = stream.fields.index('start')
        iend = stream.fields.index('end')
        lo, hi = min(istart, iend), max(istart, iend)
        def _moved(rows):
            for x in rows:
                yield x[:lo] + (x[lo] + shift,) + x[lo + 1:hi] + (x[hi] + shift,) + x[hi + 1:]
        return FeatureStream(_moved(stream), fields=stream.fields)

    assembly = kw.get('assembly') or 'guess'
    tfwd = track(kw.get('forward'), chrmeta=assembly)
    trev = track(kw.get('reverse'), chrmeta=assembly)
    chrmeta = tfwd.chrmeta

    shiftval = int(kw.get('shift', 0))
    if shiftval < 0:  # Determine shift automatically
        shiftval = None
        xcor_lim = 300
        for chrom, v in chrmeta.iteritems():
            chrsize = v['length']
            xcor_lim = min(xcor_lim, 0.01 * chrsize)
            xcor = correlation([tfwd.read(chrom), trev.read(chrom)],
                               regions=(1, chrsize),
                               limits=(-xcor_lim, xcor_lim))
            max_xcor_idx = xcor.argmax()
            # The first chromosome with a clear correlation peak decides.
            if xcor[max_xcor_idx] > 0.2:
                shiftval = (max_xcor_idx - xcor_lim - 1)/2
                break
        if not shiftval:
            raise ValueError("Unable to detect shift automatically. Must specify a shift value.")

    output = self.temporary_path(fname=tfwd.name+'-'+trev.name+'_merged',
                                 ext=kw.get('format',tfwd.format))
    outfields = [f for f in tfwd.fields if f in trev.fields]
    tout = track(output, chrmeta=chrmeta, fields=outfields,
                 info={'datatype': 'quantitative', 'shift': shiftval})
    method = kw.get("method","mean")
    # The first chromosome creates the file, the following ones append.
    for n, chrom in enumerate(chrmeta.keys()):
        mode = 'append' if n else 'write'
        shifted = [_shift(tfwd.read(selection=chrom), shiftval),
                   _shift(trev.read(selection=chrom), -shiftval)]
        tout.write(merge_scores(shifted, method=method),
                   chrom=chrom, mode=mode, clip=True)
    tout.close()
    trev.close()
    tfwd.close()
    self.new_file(output, 'density_merged')
    return self.display_time()
def __call__(self, **kw):
    """Draw an MA-plot comparing two groups of samples.

    With ``input_type == 'Table'`` the scores come from the file given in
    ``table``; its first line holds the column names and its first column
    is not a score.  Two score columns are taken as one sample each;
    otherwise the score columns must be named <group_name>.<run_id> with
    exactly two group names.

    For any other input type a table is built with QuantifyTablePlugin
    from the tracks in ``kw['Group1']['signals1']`` and
    ``kw['Group2']['signals2']`` (one path or a list/tuple each), using
    'sum' as score operation.

    Column numbers passed to ``MAplot`` count from 1, as in the
    three-column case where the scores are columns 2 and 3.

    Raises ValueError when the table columns cannot be split into two
    groups.  Registers the plot as 'MA-plot' and returns
    ``self.display_time()``.
    """
    if kw.get('input_type') == 'Table':
        table = kw.get('table')
        assert os.path.exists(str(table)), "File not found: '%s'" % table
        with open(table) as t:
            colnames = t.readline()
        _f = colnames.strip().split()
        nscores = len(_f)-1
        # Group prefixes of the score columns, in order of first appearance.
        groups = []
        for x in _f[1:]:
            prefix = x.split('.')[0]
            if prefix not in groups:
                groups.append(prefix)
        if nscores == 2: # 3 columns, cols 2 and 3 contain the scores
            sample1 = [2]
            sample2 = [3]
        elif len(groups) == 2: # more columns, look if there are two groups of prefixes
            # The first score column is column 2.
            sample1 = [col for col, x in enumerate(_f[1:], 2) if x.split('.')[0] == groups[0]]
            sample2 = [col for col, x in enumerate(_f[1:], 2) if x.split('.')[0] == groups[1]]
        else: # not implemented yet, ask the user to choose the columns he wants? Checkboxes...
            raise ValueError("For the moment, either have only 2 columns of scores, "
                             "or use names of the form <group_name>.<run_id>")
    else:
        # Use QuantifyTablePlugin to build a table from score tracks
        from QuantifyTable import QuantifyTablePlugin
        # Set QuantifyTablePlugin options
        kw['score_op'] = 'sum'
        kw['format'] = 'txt'
        signals1 = kw['Group1']['signals1']
        signals2 = kw['Group2']['signals2']
        if not isinstance(signals1,(list,tuple)): signals1 = [signals1]
        if not isinstance(signals2,(list,tuple)): signals2 = [signals2]
        # list() lets a tuple and a list be concatenated.
        kw['signals'] = list(signals1) + list(signals2)
        signals = kw['signals']
        nscores = len(signals)
        qtable = QuantifyTablePlugin().quantify(**kw)
        # Remove useless fields and add header based on file names
        _f = ['score'+str(i) for i in range(nscores)]
        qtable = track(qtable, format='txt', fields=['chr','start','end','name']+_f)
        table = self.temporary_path('scores_table.txt')
        strack = track(table, fields=['name']+_f)
        signames = [track(s).name for s in signals]
        # One header cell per score column, whatever the number of signals.
        strack.write([tuple(['Name']+signames)])
        strack.write(qtable.read(fields=strack.fields))
        # Column 1 is the name; group 1 follows, then group 2.
        sample1 = range(2, 2+len(signals1))
        sample2 = range(2+len(signals1), 2+nscores)
    output_filename = MAplot(table, cols={1:sample1, 2:sample2})
    output = self.temporary_path(fname='maplot.png')
    shutil.move(output_filename,output)
    self.new_file(output, 'MA-plot')
    return self.display_time()
def __call__(self, **kw):
    """Merge a forward and a reverse strand density into one track.

    Options read from *kw*: ``assembly`` (default 'guess'), ``forward``,
    ``reverse``, ``shift`` (default 0), ``format`` (default: the forward
    track's format) and ``method`` (default "mean").

    The forward density is shifted by +shift and the reverse one by
    -shift before merging.  A negative ``shift`` asks for automatic
    detection from the cross-correlation of both strands; ValueError is
    raised when no usable shift is found.  The output is registered as
    'density_merged'.  Returns ``self.display_time()``.
    """
    def _shift(stream, shift):
        # Add `shift` to the start and end of every feature of `stream`.
        istart = stream.fields.index('start')
        iend = stream.fields.index('end')
        lo, hi = min(istart, iend), max(istart, iend)
        def _moved(rows):
            for x in rows:
                yield x[:lo] + (x[lo] + shift,) + x[lo + 1:hi] + (x[hi] + shift,) + x[hi + 1:]
        return FeatureStream(_moved(stream), fields=stream.fields)

    assembly = kw.get('assembly') or 'guess'
    tfwd = track(kw.get('forward'), chrmeta=assembly)
    trev = track(kw.get('reverse'), chrmeta=assembly)
    chrmeta = tfwd.chrmeta

    shiftval = int(kw.get('shift', 0))
    if shiftval < 0:  # Determine shift automatically
        shiftval = None
        xcor_lim = 300
        for chrom, v in chrmeta.iteritems():
            chrsize = v['length']
            xcor_lim = min(xcor_lim, 0.01 * chrsize)
            xcor = correlation([tfwd.read(chrom), trev.read(chrom)],
                               regions=(1, chrsize),
                               limits=(-xcor_lim, xcor_lim))
            max_xcor_idx = xcor.argmax()
            # The first chromosome with a clear correlation peak decides.
            if xcor[max_xcor_idx] > 0.2:
                shiftval = (max_xcor_idx - xcor_lim - 1)/2
                break
        if not shiftval:
            raise ValueError("Unable to detect shift automatically. Must specify a shift value.")

    output = self.temporary_path(fname=tfwd.name+'-'+trev.name+'_merged',
                                 ext=kw.get('format',tfwd.format))
    tout = track(output, chrmeta=chrmeta,
                 info={'datatype': 'quantitative', 'shift': shiftval})
    method = kw.get("method","mean")
    # The first chromosome creates the file, the following ones append.
    for n, chrom in enumerate(chrmeta.keys()):
        mode = 'append' if n else 'write'
        shifted = [_shift(tfwd.read(selection=chrom), shiftval),
                   _shift(trev.read(selection=chrom), -shiftval)]
        tout.write(merge_scores(shifted, method=method),
                   chrom=chrom, mode=mode, clip=True)
    tout.close()
    trev.close()
    tfwd.close()
    self.new_file(output, 'density_merged')
    return self.display_time()
def __call__(self, **kw):
    """Draw a genome-wide graph of signal and feature tracks into a PDF.

    Options read from *kw*: ``assembly`` (default 'guess') and the dicts
    ``SigMultiP``, ``SigMultiM`` and ``FeatMulti`` holding respectively
    ``signals_plus``, ``signals_minus`` and ``features`` (one path or a
    list of paths).  Paths that do not exist on disk are ignored.

    Raises ValueError("No data provided") when no input file could be
    opened.  Registers the PDF as 'genome_graph' and returns
    ``self.display_time()``.
    """
    def _as_list(value):
        return value if isinstance(value, list) else [value]

    assembly = kw.get('assembly') or 'guess'
    signals_plus = _as_list(kw.get('SigMultiP', {}).get('signals_plus', []))
    signals_minus = _as_list(kw.get('SigMultiM', {}).get('signals_minus', []))
    features = _as_list(kw.get('FeatMulti', {}).get('features', []))
    sptracks = [
        track(sig, chrmeta=assembly) for sig in signals_plus
        if os.path.exists(sig)
    ]
    smtracks = [
        track(sig, chrmeta=assembly) for sig in signals_minus
        if os.path.exists(sig)
    ]
    ftracks = [
        track(feat, chrmeta=assembly) for feat in features
        if os.path.exists(feat)
    ]
    snames = [t.name for t in sptracks + smtracks + ftracks]
    # Chromosome metadata comes from the first track actually opened.  The
    # test is on the opened tracks, not on the requested paths: requested
    # feature files that do not exist leave `ftracks` empty.
    for group in (sptracks, smtracks, ftracks):
        if group:
            chrmeta = group[0].chrmeta
            break
    else:
        raise ValueError("No data provided")
    if assembly in [x[0] for x in genrep.GenRep().assemblies_available()]:
        chrnames = genrep.Assembly(assembly).chrnames
    else:
        # Unknown assembly: order chromosomes by decreasing length.
        by_length = sorted([(v['length'], c) for c, v in chrmeta.iteritems()],
                           reverse=True)
        chrnames = [x[1] for x in by_length]
    pdf = self.temporary_path(fname='genome_graph.pdf')
    _fs = ['chr', 'start', 'end', 'score']
    _ff = ['chr', 'start', 'end', 'name']
    genomeGraph([(c, chrmeta[c]['length']) for c in chrnames],
                [sig.read(fields=_fs) for sig in sptracks],
                [sig.read(fields=_fs) for sig in smtracks],
                [feat.read(fields=_ff) for feat in ftracks],
                output=pdf, new=True, last=True, legend=snames)
    self.new_file(pdf, 'genome_graph')
    return self.display_time()
def quantify(self, **kw):
    """Score a set of features with one or several signal tracks.

    Keyword arguments read from `kw`:
      * 'feature_type': selects an entry of the module-level `ftypes`
        (entries 0, 1, 2 use the assembly's genes, promoters derived from
        the genes, and exons; entry 3 uses the file given by 'features');
      * 'score_op': scoring method passed to `score_by_feature` ('mean');
      * 'assembly': assembly identifier, required unless entry 3 is used;
      * 'output': output format ('txt' when missing or empty);
      * 'signals': path or list of paths of the signal tracks;
      * 'upstream' / 'downstream': promoter extent for entry 1;
      * 'features': path of the custom features file for entry 3.

    Returns the path of the written quantification track.

    Raises ValueError when an assembly is needed but missing, or when
    `feature_type` matches no entry of `ftypes`.
    """
    feature_type = kw.get('feature_type', 0)
    if str(feature_type) in [str(x[0]) for x in ftypes]:
        feature_type = int(feature_type)
    func = str(kw.get('score_op', 'mean'))
    assembly_id = kw.get('assembly')
    out_format = kw.get('output') or 'txt'
    chrmeta = "guess"
    if assembly_id:
        assembly = genrep.Assembly(assembly_id)
        chrmeta = assembly.chrmeta
        genes = assembly.gene_track
        exons = assembly.exon_track
    elif feature_type not in ftypes[3]:
        raise ValueError("Please specify an assembly")

    signals = kw.get('signals', [])
    if not isinstance(signals, list):
        signals = [signals]
    signals = [track(sig, chrmeta=chrmeta) for sig in signals]

    # `features` is a callable mapping a chromosome name to a stream.
    if feature_type in ftypes[0]:
        features = genes
    elif feature_type in ftypes[1]:
        prom_pars = {'before_start': int(kw.get('upstream') or prom_up_def),
                     'after_start': int(kw.get('downstream') or prom_down_def),
                     'on_strand': True}
        features = lambda c: neighborhood(genes(c), **prom_pars)
    elif feature_type in ftypes[2]:
        features = exons
    elif feature_type in ftypes[3]:
        assert os.path.exists(str(kw.get('features'))), "Features file not found: '%s'" % kw.get("features")
        _t = track(kw['features'], chrmeta=chrmeta)
        chrmeta = _t.chrmeta
        features = _t.read
    else:
        raise ValueError("Take feature_type in %s." %ftypes)

    output = self.temporary_path(fname='quantification.'+out_format)
    if len(signals) > 1:
        _f = ["score%i"%i for i in range(len(signals))]
    else:
        _f = ["score"]
    tout = track(output, out_format, fields=['chr','start','end','name']+_f,
                 chrmeta=chrmeta, info={'datatype':'qualitative'})
    if out_format == 'txt':
        header = ['#chr','start','end','name']+[s.name for s in signals]
        tout.make_header("\t".join(header))
    for chrom in chrmeta:
        sread = [sig.read(chrom) for sig in signals]
        tout.write(score_by_feature(sread, features(chrom), method=func),
                   chrom=chrom, clip=True, mode="append")
    # Close the output before handing its path back to the caller; it was
    # previously left open.
    tout.close()
    return output
def __call__(self, **kw):
    """Plot paired-end fragment statistics and optionally write fragment tracks.

    For every BAM file given by 'bamfiles', statistics are accumulated via
    `self._compute_stats` and plotted via `self._plot_stats` into one PDF
    (registered as 'statistics_plot').  Unless 'plot_only' is set, one
    fragment track per BAM is also written in the 'output' format ("sql"
    by default) and registered alone ('fragment_track') or as a tar
    archive ('fragment_track_tar').

    Raises ValueError when a BAM file yields at most one fragment.
    """
    def _flag(value):
        """Interpret string values such as 'true' or 'on' as booleans."""
        if isinstance(value, basestring):
            return value.lower() in ['1', 'true', 't','on']
        return value

    _f = ['start','end','score']
    format = kw.get('output') or "sql"
    bamfiles = kw.get('bamfiles',[])
    if not isinstance(bamfiles, (tuple,list)):
        bamfiles = [bamfiles]
    bamfiles = [track(bam) for bam in bamfiles]
    all_tracks = []
    pdf = self.temporary_path(fname='Paired_end_plots.pdf')
    robjects.r('pdf("%s",paper="a4",height=11,width=8)' %pdf)
    midpoint = _flag(kw.get("midpoint",False))
    plot_only = _flag(kw.get("plot_only",False))
    write_tracks = not plot_only

    for bam in bamfiles:
        if write_tracks:
            tname = "%s_frags.%s" %(bam.name.split("/")[-1], format)
            outname = self.temporary_path(fname=tname)
            all_tracks.append(outname)
            trout = track(outname, fields=_f, chrmeta=bam.chrmeta,
                          info={'datatype': 'quantitative',
                                'PE_midpoint': midpoint})
        # Reset the per-file accumulators used by _compute_stats.
        self.frag_rep = {}
        self.frag_size = {}
        self.nb_frag = 0
        for chrom,cval in bam.chrmeta.iteritems():
            self._compute_stats(bam.fetch(chrom, 0, cval['length']))
            if write_tracks:
                trout.write(bam.PE_fragment_size(chrom,midpoint=midpoint),
                            fields=_f, chrom=chrom)
        if write_tracks:
            trout.close()
        if not self.nb_frag > 1:
            raise ValueError("No paired-end found in %s" %bam.name)
        self._plot_stats(bam.name)

    robjects.r('dev.off()')
    if write_tracks:
        if len(all_tracks)>1:
            tarname = self.temporary_path(fname='PE_fragment_tracks.tgz')
            tar_tracks = tarfile.open(tarname, "w:gz")
            for f in all_tracks:
                tar_tracks.add(f,arcname=os.path.basename(f))
            tar_tracks.close()
            self.new_file(tarname, 'fragment_track_tar')
        else:
            self.new_file(all_tracks[0], 'fragment_track')
    self.new_file(pdf,'statistics_plot')
    return self.display_time()
def read(*args, **kw):
    """Print the content, or a short description, of one or more track files.

    Positional arguments are the input file paths.  Keyword arguments read
    from `kw` (all must be present):
      * 'selection': JSON object (contains "{"), a region "chr:start-end",
        or a comma-separated list; falsy for no selection;
      * 'fields': comma-separated field names, falsy for all fields;
      * 'output': output path, or None to write text to stdout;
      * 'format': format of the input files;
      * 'description': when true, only a header describing each file is
        written instead of its content.
    `kw` is also forwarded to `_get_chrmeta`.

    Returns 0.  Raises Usage when no input file is given.
    """
    if len(args) < 1:
        raise Usage("No input file provided")

    selection = None
    if kw['selection']:
        if kw['selection'].count("{"):
            jsonargs = json.loads(kw['selection'])
            for k, v in jsonargs.iteritems():
                if isinstance(v, basestring):
                    jsonargs[k] = str(v)
            selection = dict((str(k), v) for k, v in jsonargs.iteritems())
        elif kw['selection'].count(":"):
            chrom, coord = kw['selection'].split(':')
            start, end = coord.split('-')
            selection = {'chr': chrom,
                         'start': (int(start), int(end)),
                         'end': (int(start), int(end))}
        else:
            selection = str(kw['selection']).split(",")

    fields = None
    outformat = None
    if kw['fields']:
        fields = str(kw['fields']).split(",")
    if kw['output'] is None:
        output = sys.stdout
        outformat = "txt"
    else:
        output = open(kw['output'], 'w')
    chrmeta = _get_chrmeta(**kw)

    # One comment line per item; the line breaks are restored here.
    description = ("# *****************************************\n"
                   "# File '%s' (%s):\n"
                   "# Infos: %s\n"
                   "# Chromosomes: %s\n"
                   "# Fields: %s\n"
                   "# *****************************************\n")

    for infile in args:
        intrack = track.track(infile, format=kw['format'], chrmeta=chrmeta)
        if kw['description']:
            if intrack.info:
                fileinfo = ",".join(["%s=%s" % (k, v)
                                     for k, v in intrack.info.iteritems()])
            else:
                fileinfo = 'None'
            chromlist = ",".join(sorted(intrack.chrmeta.keys())) or "None"
            # Kept in its own name so the requested `fields` list is not
            # overwritten by a display string.
            fieldlist = ",".join(intrack.fields)
            output.write(description % (os.path.basename(infile),
                                        intrack.format, fileinfo,
                                        chromlist, fieldlist))
            # Close the input here too: `continue` used to skip the close.
            intrack.close()
            continue
        with track.track(output, format=outformat, fields=fields) as _tout:
            _tout.write(intrack.read(selection=selection, fields=fields))
        intrack.close()

    try:
        output.close()
    except IOError:
        pass  # if stdout
    return 0
def test_intersect(self):
    """Run the intersect plugin and compare its first output file to the expected rows."""
    self.intersect(**self.kw)
    result_path = self.intersect.output_files[0][0]
    with track(result_path) as t:
        content = list(t.read(fields=self.fields))
        self.assertListEqual(content, [('chr1', 10, 15, 17.0),
                                       ('chr1', 24, 35, 107.0)])
def test_bed(self):
    """Open an extension-less copy of the bed file with an explicit 'bed' format."""
    # The copy is named 'test', so the file name carries no extension.
    copy_path = os.path.join(path,'test')
    shutil.copy(self.bed, copy_path)
    t = track(copy_path, format='bed', fields=self.fields)
    # Pull one item to make sure the stream can actually be read.
    t.read().next()
    self.assertIsInstance(t, BedTrack)
    self.assertEqual(t.format,'bed')
    self.assertListEqual(t.fields, self.fields)
def __call__(self, **kw):
    """Write a track of annotated genes, exons or transcripts of an assembly.

    Keyword arguments read from `kw`:
      * 'assembly': assembly identifier passed to `genrep.Assembly`;
      * 'format': extension of the output file (required);
      * 'feature_type': 'genes', 'exons' or 'transcripts' (required);
      * 'ids_list': optional path of a file with one identifier per line;
        when given, only those identifiers are written, and identifiers
        missing from the mapping get a placeholder 'NA' row.

    The output is registered as a 'fulltrack' file and the value of
    `self.display_time()` is returned.

    Raises ValueError for an unsupported 'feature_type'.
    """
    assembly = genrep.Assembly(kw.get('assembly'))
    out_format = kw['format']
    feature_type = kw['feature_type']
    if feature_type == 'genes':
        mapping = assembly.get_gene_mapping()
        get_info = self.genes_annot
    elif feature_type == 'exons':
        mapping = assembly.get_exon_mapping()
        get_info = self.exons_annot
    elif feature_type == 'transcripts':
        mapping = assembly.get_transcript_mapping()
        get_info = self.trans_annot
    else:
        # Fail early with a clear message instead of an unbound-local
        # error once the mapping is first used.
        raise ValueError("Unsupported feature_type: %r" % (feature_type,))

    def _annotate(ids_path):
        """Yield one annotation row per line of the identifiers file."""
        with open(ids_path) as ids_file:
            for line in ids_file:
                ident = line.strip()
                found = mapping.get(ident)
                if found:
                    yield get_info(ident, found)
                else:
                    yield ('NA','0','0',ident,0.0,'0')

    ids_list = kw.get('ids_list')
    fields = ['chr','start','end','name','score','strand']
    if ids_list:
        assert os.path.exists(str(ids_list)), "File not found: '%s'" % ids_list
        fulltrack = FeatureStream(_annotate(ids_list), fields=fields)
        fname = os.path.splitext(os.path.basename(ids_list))[0]
    else:
        fulltrack = FeatureStream((get_info(g, mapping[g]) for g in mapping),
                                  fields=fields)
        fname = feature_type
    output = self.temporary_path(fname=fname+'.'+out_format)
    out = track(output, chrmeta=assembly)
    out.write(fulltrack)
    # Close the output track before registering the file; it was left open.
    out.close()
    self.new_file(output, 'fulltrack')
    return self.display_time()
def __call__(self, **kw):
    """Compute statistics of the 'sample' track, globally or per chromosome.

    The 'output' keyword picks the output type among the module-level
    `output_list` (its first entry when the value is not listed).  The
    'txt' type writes the statistics to a text file registered as 'stats';
    any other type collects them in a dictionary handed to
    `self._plot_pdf` and registers the result as 'pdf'.
    """
    sample = track(kw['sample'],chrmeta="guess")
    by_chrom = kw.get('by_chrom',False)
    if isinstance(by_chrom, basestring):
        by_chrom = (by_chrom.lower() in ['1', 'true', 't','on'])
    outf = kw.get('output')
    if outf not in output_list:
        outf = output_list[0]
    as_text = (outf == 'txt')
    output = self.temporary_path(fname=sample.name+'_stats.'+outf)
    out = open(output,"w") if as_text else {}
    # A single `None` selection stands for the whole track.
    chromlist = sample.chrmeta.keys() if by_chrom else [None]

    for chrom in chromlist:
        if not as_text:
            out[chrom] = {}
            stats(sample,out=out[chrom],selection=chrom)
            continue
        if chrom:
            out.write("Chromosome %s\n--------------------\n"%chrom)
        stats(sample,out=out,selection=chrom)
        if chrom:
            out.write("\n--------------------\n")

    if as_text:
        out.close()
        self.new_file(output, 'stats')
    else:
        self._plot_pdf(output,out,sample.name)
        self.new_file(output, 'pdf')
    return self.display_time()
def test_subtract(self):
    """Run the subtract plugin and compare its first output file to the expected row."""
    self.subtract(**self.kw)
    result_path = self.subtract.output_files[0][0]
    with track(result_path) as t:
        content = list(t.read(fields=self.fields))
        self.assertListEqual(content, [('chr1', 21, 24, 17.0)])
def to_bed(filename,assembly):
    """Convert a junctions text file into a bed file with named junctions.

    The input is read as a text track with fields chr, start, end, strand
    and score; chromosome names are translated with `map_chromosomes`, and
    each row gets a name 'junction<N>' where N counts from 0.  The output
    path is `filename` with a trailing 'junc' replaced by 'bed' (or with
    'bed' appended when `filename` does not end in 'junc').
    """
    t = track(filename,fields=['chr','start','end','strand','score'],
              chrmeta=assembly,format='txt')
    # Translate chr names
    s = t.read()
    s1 = map_chromosomes(s, t.assembly.chromosomes)
    # Prepare output bed file.  str.rstrip('junc') removes any trailing run
    # of the characters j/u/n/c rather than the suffix, so the suffix is
    # removed explicitly.
    suffix = 'junc'
    if filename.endswith(suffix):
        stem = filename[:-len(suffix)]
    else:
        stem = filename
    out = track(stem+'bed', fields=['chr','start','end','name','score','strand'])
    out.make_header({'name':filename,'description':filename})
    mode = 'append'  # keep the header that was just written
    # Add junction names
    c = itertools.count()
    s2 = duplicate(s1,'chr','name')
    s3 = apply(s2,'name',lambda x: 'junction'+str(c.next()))
    # Write
    out.write(s3,mode=mode)
    out.close()
    # The input stream is fully consumed by now; release the input track.
    t.close()
def get_type(self,filename):
    """Return whether it is a track with 'intervals' or a 'density'.

    The decision is based on the lower-cased format of the track opened
    from `filename`.  Returns None for any other format.
    """
    with track(filename) as t:
        fmt = t.format.lower()
    if fmt in ('bed','sam','bam'):
        return 'intervals'
    # The format is lower-cased, so the list must hold 'bigwig'; the former
    # 'bigWig' entry could never match.
    if fmt in ('bedgraph','wig','bigwig','sga'):
        return 'density'
    return None
def test_overlap(self):
    """Run the plugin with a custom features file and expect three output rows."""
    params = {
        'input_type':'Signal',
        'filter':path+'peaks.bedGraph',
        'features':path+'features.bed',
        'feature_type':3,
        'assembly':'mm9',
        'format':'bed',
    }
    self.plugin(**params)
    with track(self.plugin.output_files[0][0]) as t:
        content = list(t.read())
        self.assertEqual(len(content),3)
def test_subtract(self):
    """Run the subtract plugin and compare its first output file to the expected row."""
    self.subtract(**self.kw)
    result_path = self.subtract.output_files[0][0]
    with track(result_path) as t:
        content = list(t.read(fields=self.fields))
        self.assertListEqual(content, [("chr1", 21, 24, 17.0)])
def test_quantify_table_text(self):
    """Quantify two signals over a custom features file and expect nine text rows."""
    params = {
        'input_type':'Signal',
        'SigMulti':{'signals':[path+'KO50.bedGraph', path+'WT50.bedGraph']},
        'features':path+'features.bed',
        'feature_type':3,
        'assembly':'mm9',
        'format':'txt',
    }
    self.plugin(**params)
    out_fields = ["chr","start","end","name","score0","score1"]
    with track(self.plugin.output_files[0][0], fields=out_fields) as t:
        content = list(t.read())
        self.assertEqual(len(content),9)
def plot_footprint_profile(ex, bedlist, signals, chrnames, groups, logfile):
    """Plot, per group, the average signal profile around each motif.

    For every group id in `bedlist`, the motif bed file is read chromosome
    by chromosome; regions are grouped by (motif name, motif length) taken
    from the 'name:sequence' value in the fourth column, and the signals of
    that group are accumulated per bin with `score_by_feature`.  Each motif
    is then drawn with `lineplot` and its matrix is dumped to a text file.

    Returns a dict mapping each group id to
    {'pdf': <plot file name>, 'mat': [(motif name, matrix file name), ...]}.
    """
    files = dict((gid, {'pdf': "", 'mat': []}) for gid in bedlist.keys())
    logfile.write("Plotting footprints:\n")
    logfile.flush()
    flank = _plot_flank[1]
    for gid, motifbed in bedlist.iteritems():
        group_signals = signals[gid]
        snames = [sig.name for sig in group_signals]
        tmotif = track(motifbed, format='bed')
        data = {}
        numregs = {}
        for chrom in chrnames:
            # Group the (start, end) pairs of this chromosome by motif.
            fread = {}
            for row in tmotif.read(chrom):
                name_parts = row[3].split(":")
                key = (name_parts[0], len(name_parts[1]))
                fread.setdefault(key, []).append(row[1:3])
            for motif, regs in fread.iteritems():
                if motif not in data:
                    data[motif] = zeros(shape=(motif[1] + 2 * flank,
                                               len(group_signals)))
                    numregs[motif] = 0
                numregs[motif] += len(regs)
                segments = segment_features(
                    FeatureStream(regs, fields=['start', 'end']),
                    nbins=motif[1], upstream=_plot_flank,
                    downstream=_plot_flank)
                tFeat = sorted_stream(segments)
                scored = score_by_feature(
                    [s.read(chrom) for s in group_signals], tFeat)
                for t in scored:
                    data[motif][t[2]] += t[3:]

        files[gid]['pdf'] = unique_filename_in()
        remaining = len(data)
        first_plot = True
        for motif, dat in data.iteritems():
            remaining -= 1
            mname, nbins = motif
            # In-place division: turn the sums into per-region averages.
            dat /= float(numregs[motif])
            X = range(-flank, flank + nbins)
            # Label the motif positions 1..nbins as strings.
            for k in range(nbins):
                X[k + flank] = str(k + 1)
            # Possible extension: a heatmap sorted by intensity.
            columns = [dat[:, n] for n in range(dat.shape[-1])]
            lineplot(X, columns, mfrow=[4, 2], output=files[gid]['pdf'],
                     new=first_plot, last=(remaining == 0),
                     legend=snames, main=mname)
            first_plot = False
            _datf = unique_filename_in()
            with open(_datf, "w") as dff:
                dff.write("\t".join([""] + [str(x) for x in X]) + "\n")
                for n, sn in enumerate(snames):
                    dff.write("\t".join([sn] + [str(x) for x in dat[:, n]]) + "\n")
            files[gid]['mat'].append((mname, _datf))
    return files
def test_complement(self):
    """Run the complement plugin and compare the 'chr1' rows of its first output file."""
    self.complement(**self.kw)
    result_path = self.complement.output_files[0][0]
    with track(result_path) as t:
        content = list(t.read('chr1', fields=self.fields))
        self.assertListEqual(content, [('chr1', 0, 8, 0.0),
                                       ('chr1', 19, 21, 0.0),
                                       ('chr1', 39, 197195432, 0.0)])
def _combine(func, output, **kw):
    """Combine several tracks chromosome by chromosome with `func`.

    The tracks come from kw['TrackMulti']['tracks'] (a single path or a
    list).  The output format (kw 'format', 'sql' by default) is appended
    to `output` to form the output path, which is returned.
    """
    guessed_meta = _get_chrmeta(**kw)
    out_path = output + (kw.get('format') or 'sql')
    sources = kw['TrackMulti']['tracks']
    if not isinstance(sources, list):
        sources = [sources]
    opened = [track(src, chrmeta=guessed_meta) for src in sources]
    # The first track decides which chromosomes are processed.
    chrmeta = opened[0].chrmeta
    tout = track(out_path, chrmeta=chrmeta, info={'datatype': 'qualitative'})
    for chrom in chrmeta:
        res = combine([src.read(chrom) for src in opened], fn=func)
        tout.fields = res.fields
        tout.write(res, chrom=chrom, clip=True)
    tout.close()
    return out_path
def test_skip_header(self):
    """Check how many rows are read for the different values of `header`."""
    def _count(t):
        """Number of items produced by reading track `t`."""
        return len(list(t.read()))

    default_rows = _count(track(self.bed))                # header not given
    none_rows = _count(track(self.bed, header=None))      # same as default
    true_rows = _count(track(self.bed, header=True))      # skips the first line
    five_rows = _count(track(self.bed, header=5))         # skips 5 lines
    # Skips lines starting with 'track'.
    prefix_rows = _count(track(self.bed, header='track'))
    # Skips lines starting with 'track' or 'chr'.
    chr_rows = _count(track(self.bed, header=['track', 'chr']))
    # Skips lines starting with 'track' or 'chrII'.
    t = track(self.bed, header=['track', 'chrII' ])
    chrII_rows = _count(t)

    self.assertEqual(default_rows, none_rows)
    self.assertEqual(default_rows, true_rows)
    self.assertEqual(default_rows - 4, five_rows)
    self.assertEqual(default_rows, prefix_rows)
    self.assertEqual(chr_rows, 0)
    self.assertEqual(default_rows - 11, chrII_rows)
    t.close()
def test_bed(self):
    """Open an extension-less copy of the bed file with an explicit 'bed' format."""
    # The copy is named 'test', so the file name carries no extension.
    copy_path = os.path.join(path, 'test')
    shutil.copy(self.bed, copy_path)
    t = track(copy_path, format='bed', fields=self.fields)
    # Pull one item to make sure the stream can actually be read.
    t.read().next()
    self.assertIsInstance(t, BedTrack)
    self.assertEqual(t.format, 'bed')
    self.assertListEqual(t.fields, self.fields)
def test_union(self):
    """Run the union plugin and compare its first output file to the expected rows."""
    self.union(**self.kw)
    result_path = self.union.output_files[0][0]
    with track(result_path) as t:
        content = list(t.read(fields=self.fields))
        self.assertListEqual(content, [
            ('chr1', 8, 10, 12.0),
            ('chr1', 10, 15, 17.0),
            ('chr1', 15, 19, 12.0),
            ('chr1', 21, 24, 17.0),
            ('chr1', 24, 35, 107.0),
            ('chr1', 35, 39, 90.0),
        ])
def merge_junc_files(trackList, assembly):
    """Merge several junction files into the single text track 'all.junc'.

    For each chromosome of `assembly`, the matching rows of every file in
    `trackList` are concatenated; rows sharing chr, start and end are
    grouped and their scores summed.  The result is appended to 'all.junc'
    in the current working directory.
    """
    junc_fields = ['chr', 'start', 'end', 'strand', 'score']
    out = track('all.junc', format='txt', fields=junc_fields)
    from bbcflib.genrep import Assembly
    a = Assembly(assembly)
    for c in a.chromosomes:
        # The selection key is built from the three items of the
        # chromosome identifier, as '<c[0]>_<c[1]>.<c[2]>'.
        chrom_key = str(c[0]) + '_' + c[1] + '.' + str(c[2])
        tl = [track(t, fields=junc_fields, format='txt').read(chrom_key)
              for t in trackList]
        merged = concatenate(tl, group_by=['chr', 'start', 'end'],
                             aggregate={'score': lambda x: sum(x)})
        out.write(merged, mode='append')
    # Close the output track once every chromosome is written; it was
    # previously left open.
    out.close()
def test_ratios(self):
    """Run the ratios plugin and read its first output file completely."""
    params = {
        'numerator': path + 'KO50.bedGraph',
        'denominator': path + 'WT50.bedGraph',
        'format': 'bedGraph',
    }
    self.plugin(**params)
    with track(self.plugin.output_files[0][0]) as t:
        # No assertion on the rows: the test only checks that the output
        # can be read through.
        content = list(t.read())
def __call__(self, **kw):
    """Filter tracks by chromosome, score range and feature length.

    Keyword arguments read from `kw`:
      * 'TrackMulti' -> 'tracks': path or list of paths of input tracks;
      * 'chrom': comma-separated chromosome names;
      * 'minscore' / 'maxscore': score bounds (a missing bound is replaced
        by -sys.maxint / sys.maxint when the other one is given);
      * 'minlength' / 'maxlength': length bounds (0 / sys.maxint when only
        one of them is given).

    Each input yields a '<name>_filtered.<format>' output.  A single
    output is registered as 'output'; several are packed into a tar
    archive registered as 'archive'.

    Raises ValueError when a lower bound exceeds its upper bound.
    """
    tracks = kw['TrackMulti']['tracks']
    if not isinstance(tracks, list):
        tracks = [tracks]
    minscore = kw.get('minscore')
    maxscore = kw.get('maxscore')
    minlength = kw.get('minlength')
    maxlength = kw.get('maxlength')
    selection = [{'chr': c} for c in kw.get('chrom', '').split(',')]

    if minscore or maxscore:
        # Convert before comparing: the bounds may arrive as strings, and
        # comparing strings (or a string with an int) does not order them
        # numerically.
        low = float(minscore) if minscore else float(-sys.maxint)
        high = float(maxscore) if maxscore else float(sys.maxint)
        if low > high:
            raise ValueError("Empty range: %f:%f" % (low, high))
        for s in selection:
            s['score'] = (low, high)
    if minlength or maxlength:
        minlength = int(minlength or 0)
        maxlength = int(maxlength or sys.maxint)
        if minlength > maxlength:
            raise ValueError("Empty range: %i:%i" % (minlength, maxlength))
        for s in selection:
            s['length'] = (minlength, maxlength)

    outtracks = []
    for tin in [track(t) for t in tracks]:
        outname = self.temporary_path(tin.name + "_filtered." + tin.format)
        tout = track(outname)
        outtracks.append(outname)
        tout.write(tin.read(selection=selection))
        tout.close()
        # The input stream is consumed; release the input track as well.
        tin.close()

    if len(outtracks) > 1:
        tar_name = self.temporary_path('Filtered_tracks.tgz')
        tar = tarfile.open(tar_name, "w:gz")
        for f in outtracks:
            tar.add(f)
        tar.close()
        self.new_file(tar_name, 'archive')
    else:
        self.new_file(outtracks[0], 'output')
    return self.display_time()
def test_wig(self):
    """Convert the bed file to wig, then reopen the wig file explicitly."""
    wig = os.path.join(path, 'test.wig')

    # Conversion must hand back a WigTrack that can be read.
    converted = convert(self.bed, wig)
    self.assertIsInstance(converted, WigTrack)
    converted.read().next()
    converted.close()

    # Reopening the written file must expose the four wig fields.
    reopened = track(wig, format="wig", chrmeta={}, info=None)
    reopened.read().next()
    self.assertListEqual(reopened.fields, ['chr', 'start', 'end', 'score'])
def to_bed(filename, assembly):
    """Convert a junctions text file into a bed file with named junctions.

    The input is read as a text track with fields chr, start, end, strand
    and score; chromosome names are translated with `map_chromosomes`, and
    each row gets a name 'junction<N>' where N counts from 0.  The output
    path is `filename` with a trailing 'junc' replaced by 'bed' (or with
    'bed' appended when `filename` does not end in 'junc').
    """
    t = track(filename, fields=['chr', 'start', 'end', 'strand', 'score'],
              chrmeta=assembly, format='txt')
    # Translate chr names
    s = t.read()
    s1 = map_chromosomes(s, t.assembly.chromosomes)
    # Prepare output bed file.  str.rstrip('junc') removes any trailing run
    # of the characters j/u/n/c rather than the suffix, so the suffix is
    # removed explicitly.
    suffix = 'junc'
    if filename.endswith(suffix):
        stem = filename[:-len(suffix)]
    else:
        stem = filename
    out = track(stem + 'bed',
                fields=['chr', 'start', 'end', 'name', 'score', 'strand'])
    out.make_header({'name': filename, 'description': filename})
    mode = 'append'  # keep the header that was just written
    # Add junction names
    c = itertools.count()
    s2 = duplicate(s1, 'chr', 'name')
    s3 = apply(s2, 'name', lambda x: 'junction' + str(c.next()))
    # Write
    out.write(s3, mode=mode)
    out.close()
    # The input stream is fully consumed by now; release the input track.
    t.close()