def createLibrary(ex, assembly_or_fasta, params, url=GlobalHtsUrl, via='local'):
    """
    Main call to create the library
    """
    if len(params['primary'])<2:
        print('Some parameters are missing, cannot create the library')
        print('primary='+params['primary']+" ; "+'secondary='+params['secondary'])
        return [None,None,None,None]
    if not isinstance(assembly_or_fasta,genrep.Assembly):
        assembly_or_fasta = genrep.Assembly( ex=ex, fasta=assembly_or_fasta )
    chrnames = assembly_or_fasta.chrnames
    chrom_map = dict((v['ac'],k) for k,v in assembly_or_fasta.chrmeta.iteritems())
    allfiles = assembly_or_fasta.fasta_by_chrom  #assembly_or_fasta.untar_genome_fasta()
    libfiles = dict((c, getRestEnzymeOccAndSeq.nonblocking( ex, f,
                                                            params['primary'], params['secondary'],
                                                            params['length'], params['type'], via=via ))
                    for c, f in allfiles.iteritems())
    resfile = unique_filename_in()
    os.mkdir(resfile)
    bedfiles = {}
    for chrom, future in libfiles.iteritems():
        libfiles[chrom] = future.wait()
        if not os.path.getsize(libfiles[chrom][1])>0:
            time.sleep(60)
            touch(ex,libfiles[chrom][1])
        bedfiles[chrom] = parse_fragFile(libfiles[chrom][1],chrom_map)
    rescov = coverageInRepeats(ex, bedfiles, params['species'], outdir=resfile, via=via)
    bedchrom = [os.path.join(resfile,chrom+".bed") for chrom in chrnames]
    cat(bedchrom,out=resfile+".bed")
    gzipfile(ex,[resfile+".bed"]+bedchrom)
#    resfile_sql = resfile+".sql"
#    track.convert((resfile,'bed'),(resfile_sql,'sql'),assembly=params['species'])
    enz_list = []
    infos_lib = { 'assembly_name':  params['species'],
                  'enzyme1_id':     getEnzymeSeqId(params['primary'], True, enz_list, url),
                  'enzyme2_id':     getEnzymeSeqId(params['secondary'], True, enz_list, url),
                  'segment_length': params['length'],
                  'type':           params['type'],
                  'filename':       resfile }
    return [ libfiles, bedfiles, resfile, infos_lib ]
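# Usage sketch (illustrative only): one possible way to call createLibrary
# inside a bein execution. The MiniLIMS name, assembly identifier, enzyme
# names and segment parameters below are hypothetical placeholders, not
# values taken from this module.
def _example_createLibrary():
    from bein import execution, MiniLIMS
    from bbcflib import genrep
    params = {'primary':   "HindIII",   # primary restriction enzyme (placeholder)
              'secondary': "DpnII",     # secondary restriction enzyme (placeholder)
              'length':    "30",        # segment length (placeholder)
              'type':      "typeI",     # library type (placeholder)
              'species':   "mm9"}       # assembly used for repeat coverage (placeholder)
    M = MiniLIMS("c4seq_minilims")      # placeholder MiniLIMS
    assembly = genrep.Assembly("mm9")   # placeholder assembly
    with execution(M) as ex:
        libfiles, bedfiles, resfile, infos_lib = createLibrary(ex, assembly, params, via='local')
    return infos_lib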
def count_reads(self, bamfiles, gtf):
    self.write_log("* Counting reads")

    # Count reads on genes, transcripts with "rnacounter"
    ncond = len(self.conditions)
    tablenames = [None]*ncond
    futures = [None]*ncond
    max_rlen = 0
    counter_options = ["--nh"]
    for bam in bamfiles:
        sam = pysam.Samfile(bam,'rb')
        max_rlen = max(max_rlen, sam.next().rlen)
    counter_options += ["--exon_cutoff", str(max_rlen)]
    bwt_args = self.job.options.get('map_args',{}).get('bwt_args',[])
    #if not "--local" in bwt_args:
    #    counter_options += ["--nh"]
    if hasattr(self.assembly,"fasta_origin") or self.assembly.intype==2:
        counter_options += ["--type","transcripts", "--method","raw"]
    else:
        counter_options += ["--type","genes,transcripts", "--method","raw,nnls"]
    if self.stranded:
        counter_options += ["--stranded"]
    for i,c in enumerate(self.conditions):
        tablenames[i] = unique_filename_in()
        futures[i] = rnacounter.nonblocking(self.ex, bamfiles[i], gtf, stdout=tablenames[i],
                                            via=self.via, options=counter_options)

    # Put samples together
    for i,c in enumerate(self.conditions):
        try:
            futures[i].wait()
        except Exception as err:
            self.write_debug("Counting failed: %s." % str(err))
            raise err
        if futures[i] is None:
            self.write_debug("Counting failed.")
            raise ValueError("Counting failed.")
    if len(tablenames) > 1:
        joined = unique_filename_in()
        rnacounter_join.nonblocking(self.ex, tablenames, stdout=joined, via=self.via).wait()
    else:
        joined = tablenames[0]

    # Split genes and transcripts into separate files
    genes_filename = unique_filename_in()
    trans_filename = unique_filename_in()
    genes_file = open(genes_filename,"wb")
    trans_file = open(trans_filename,"wb")
    if self.stranded:
        genes_anti_filename = unique_filename_in()
        trans_anti_filename = unique_filename_in()
        genes_anti_file = open(genes_anti_filename,"wb")
        trans_anti_file = open(trans_anti_filename,"wb")
    with open(joined) as jfile:
        header = jfile.readline()
        hconds = ["counts."+c for c in self.conditions] + ["rpkm."+c for c in self.conditions]
        hinfo = header.strip().split('\t')[2*ncond+1:]
        header = '\t'.join(["ID"] + hconds + hinfo)+'\n'
        genes_file.write(header)
        trans_file.write(header)
        type_idx = header.split('\t').index("Type")
        if self.stranded:
            genes_anti_file.write(header)
            trans_anti_file.write(header)
            sense_idx = header.split('\t').index("Sense")
            for line in jfile:
                L = line.split('\t')
                ftype = L[type_idx].lower()
                sense = L[sense_idx].lower()
                if ftype == 'gene':
                    if sense == 'antisense':
                        genes_anti_file.write(line)
                    else:
                        genes_file.write(line)
                elif ftype == 'transcript':
                    if sense == 'antisense':
                        trans_anti_file.write(line)
                    else:
                        trans_file.write(line)
        else:
            for line in jfile:
                L = line.split('\t')
                ftype = L[type_idx].lower()
                if ftype == 'gene':
                    genes_file.write(line)
                elif ftype == 'transcript':
                    trans_file.write(line)
    genes_file.close()
    trans_file.close()
    if self.stranded:
        genes_anti_file.close()
        trans_anti_file.close()

    # Keep intermediate tables
    for i,c in enumerate(self.conditions):
        #shutil.copy(tablenames[i], "../counts%d.txt"%i)
        descr = set_file_descr(self.conditions[i]+'_'+tablenames[i]+'.gz',
                               type='txt', step='pileup', view='admin')
        gzipfile(self.ex, tablenames[i])
        self.ex.add(tablenames[i]+'.gz', description=descr)

    if self.stranded:
        count_files = {'genes':genes_filename, 'transcripts':trans_filename,
                       'genes_anti':genes_anti_filename, 'transcripts_anti':trans_anti_filename}
    else:
        count_files = {'genes':genes_filename, 'transcripts':trans_filename}
    return count_files
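# Illustration only: how the joined "rnacounter" header is rebuilt above.
# The column names in this toy header are assumptions about the rnacounter
# output layout (an ID column, then one counts and one RPKM column per
# condition, then annotation columns); they are not taken from this module.
def _example_rebuild_header():
    conditions = ["KO", "WT"]                       # hypothetical condition names
    ncond = len(conditions)
    header = "Gene\tKO\tWT\tKO\tWT\tStart\tEnd\tGeneName\tType\tSense\n"
    hconds = ["counts."+c for c in conditions] + ["rpkm."+c for c in conditions]
    hinfo = header.strip().split('\t')[2*ncond+1:]  # keep only the annotation columns
    new_header = '\t'.join(["ID"] + hconds + hinfo) + '\n'
    # -> "ID  counts.KO  counts.WT  rpkm.KO  rpkm.WT  Start  End  GeneName  Type  Sense"
    return new_header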
def chipseq_workflow( ex, job_or_dict, assembly, script_path='', logfile=sys.stdout, via='lsf' ):
    """Runs a chipseq workflow over bam files obtained by mapseq. Will optionally run ``macs`` and 'run_deconv'.

    :param ex: a 'bein' execution environment to run jobs in,

    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with keys 'groups', 'files' and 'options' if applicable,

    :param assembly: a genrep.Assembly object,

    :param script_path: only needed if 'run_deconv' is in the job options, must point to the location of the R scripts.

    Default ``macs`` parameters (overridden by ``job_or_dict['options']['macs_args']``) are set as follows:

    * ``'-bw'``: 200 ('bandwidth')

    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*, if available, as *(min(30,5*(T+1)),50*(T+1))*.

    Returns a tuple of a dictionary with keys *group_id* from the job groups, *macs* and *deconv* if applicable and values file description dictionaries, and a dictionary of *group_ids* to *names* used in file descriptions.
    """
    options = {}
    if logfile is None: logfile = sys.stdout
    if isinstance(job_or_dict,frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict,dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if not('name' in groups[gid]):
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files',{})
    else:
        raise TypeError("job_or_dict must be a frontend.Job object or a dictionary with key 'groups'.")
    merge_strands = int(options.get('merge_strands',-1))
    suffixes = ["fwd","rev"]
    peak_deconvolution = options.get('peak_deconvolution',False)
    if isinstance(peak_deconvolution,basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1','true','t']
    run_meme = options.get('run_meme',False)
    if isinstance(run_meme,basestring):
        run_meme = run_meme.lower() in ['1','true','t']
    macs_args = options.get('macs_args',["--bw","200"])
    b2w_args = options.get('b2w_args',[])
    if not(isinstance(mapseq_files,dict)):
        raise TypeError("Mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
    for gid,mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not(isinstance(mapped,dict)):
            raise TypeError("Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped:
            mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name+"_"+str(k)
            if not 'stats' in mapped[k]:
                futures[k] = mapseq.bamstats.nonblocking( ex, mapped[k]["bam"], via=via )
            if mapped[k].get('poisson_threshold',-1)>0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns)>0:
            p_thresh[group_name] = sum(ptruns)/len(ptruns)
        for k in futures.keys():
            mapped[k]['stats'] = futures[k].wait()
        if len(mapped)>1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid,group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid,group_name))
        read_length.append(mapped.values()[0]['stats']['read_length'])
        genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls)<1:
        controls = [None]
        names['controls'] = [(0,None)]
    logfile.write("Starting MACS.\n");logfile.flush()
    processed = {'macs': add_macs_results( ex, read_length, genome_size,
                                           tests, ctrlbam=controls, name=names,
                                           poisson_threshold=p_thresh,
                                           macs_args=macs_args, via=via )}
    logfile.write("Done MACS.\n");logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
    ## select only peaks with p-val <= 1e-0.6 = .25 => score = -10log10(p) >= 6
    _select = {'score':(6,sys.maxint)}
    _fields = ['chr','start','end','name','score']
    for i,name in enumerate(names['tests']):
        if len(names['controls']) < 2:
            ctrl = (name,names['controls'][0])
            macsbed = track(processed['macs'][ctrl]+"_summits.bed",
                            chrmeta=chrlist, fields=_fields).read(selection=_select)
        else:
            macsbed = concatenate([apply(track(processed['macs'][(name,x)]+"_summits.bed",
                                               chrmeta=chrlist, fields=_fields).read(selection=_select),
                                         'name', lambda __n,_n=xn: "%s:%i" %(__n,_n))
                                   for xn,x in enumerate(names['controls'])])
        ##############################
        macs_neighb = neighborhood( macsbed, before_start=150, after_end=150 )
        peak_list[name] = unique_filename_in()+".sql"
        macs_final = track( peak_list[name], chrmeta=chrlist,
                            info={'datatype':'qualitative'},
                            fields=['start','end','name','score'] )
        macs_final.write(fusion(macs_neighb),clip=True)
        macs_final.close()
        ##############################

    merged_wig = {}
    options['read_extension'] = int(options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1:
        options['read_extension'] = read_length[0]
    make_wigs = merge_strands >= 0 or options['read_extension']>100
    if options['read_extension'] > 100:
        options['read_extension'] = 50
    for gid,mapped in mapseq_files.iteritems():
#        if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        for m in mapped.values():
            if make_wigs or not('wig' in m) or len(m['wig'])<2:
                output = mapseq.parallel_density_sql( ex, m["bam"], assembly.chrmeta,
                                                      nreads=m["stats"]["total"],
                                                      merge=-1,
                                                      read_extension=options['read_extension'],
                                                      convert=False,
                                                      b2w_args=b2w_args, via=via )
                wig.append(dict((s,output+s+'.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict((s,merge_sql(ex, [x[s] for x in wig], via=via))
                                          for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]

    if peak_deconvolution:
        ##############################
        def _filter_deconv( stream, pval ):
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream(
                ((x[0],)+((x[2]+x[1])/2-150,(x[2]+x[1])/2+150)+x[3:]
                 for x in stream
                 if "FERR=" in x[3] and float(ferr.search(x[3]).groups()[0]) <= pval),
                fields=stream.fields )
        ##############################
        processed['deconv'] = {}
        for name in names['tests']:
            logfile.write(name[1]+" deconvolution.\n");logfile.flush()
            if len(names['controls']) < 2:
                ctrl = (name,names['controls'][0])
                macsbed = processed['macs'][ctrl]+"_peaks.bed"
            else:
                macsbed = intersect_many_bed( ex, [processed['macs'][(name,x)]+"_peaks.bed"
                                                   for x in names['controls']], via=via )
            deconv = run_deconv( ex, merged_wig[name[1]], macsbed, assembly.chrmeta,
                                 options['read_extension'], script_path, via=via )
            peak_list[name] = unique_filename_in()+".bed"
            trbed = track(deconv['peaks']).read()
            with track(peak_list[name], chrmeta=chrlist, fields=trbed.fields) as bedfile:
                bedfile.write(fusion(_filter_deconv(trbed,0.65)))
            ex.add(deconv['peaks'],
                   description=set_file_descr(name[1]+'_peaks.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            ex.add(deconv['profile'],
                   description=set_file_descr(name[1]+'_deconv.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            bigwig = unique_filename_in()
            try:
                convert(deconv['profile'],(bigwig,"bigWig"))
                ex.add(bigwig,
                       description=set_file_descr(name[1]+'_deconv.bw', type='bigWig',
                                                  ucsc='1', step='deconvolution',
                                                  groupId=name[0]))
            except OSError as e:
                logfile.write(str(e));logfile.flush()
            ex.add(deconv['pdf'],
                   description=set_file_descr(name[1]+'_deconv.pdf', type='pdf',
                                              step='deconvolution', groupId=name[0]))
            processed['deconv'][name] = deconv

    ##############################
    def _join_macs( stream, xlsl, _f ):
        def _macs_row(_s):
            for _p in _s:
                for _n in _p[3].split("|"):
                    if len(xlsl) == 1:
                        nb = int(_n.split(";")[0][13:]) if _n[:3] == "ID=" else int(_n[10:])
                        yield _p+xlsl[0][nb-1][1:]
                    else:
                        nb = _n.split(";")[0][13:] if _n[:3] == "ID=" else _n[10:]
                        nb = nb.split(":")
                        yield _p+xlsl[int(nb[1])][int(nb[0])-1][1:]
        return FeatureStream( _macs_row(stream), fields=_f )
    ##############################
    peakfile_list = []
    for name, plist in peak_list.iteritems():
        ptrack = track(plist,chrmeta=chrlist,fields=["chr","start","end","name","score"])
        peakfile = unique_filename_in()
        xlsh, xlsl = parse_MACS_xls([processed['macs'][(name,_c)]+"_peaks.xls" for _c in names['controls']])
        try:
            ###### if assembly doesn't have annotations, we skip the "getNearestFeature" but still go through "_join_macs"
            assembly.gene_track()
            _fields = ['chr','start','end','name','score','gene','location_type','distance']+["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height',
                                               'gene(s)','location_type','distance']+_fields[8:]))
            for chrom in assembly.chrnames:
                _feat = assembly.gene_track(chrom)
                peakout.write(_join_macs(getNearestFeature(ptrack.read(selection=chrom),_feat),
                                         xlsl, _fields), mode='append')
        except ValueError:
            _fields = ['chr','start','end','name','score']+["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height']+_fields[5:]))
            for chrom in assembly.chrnames:
                peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl, _fields), mode='append')
        peakout.close()
        gzipfile(ex,peakfile)
        peakfile_list.append(track(peakfile+".gz", format='txt', fields=_fields))
        ex.add(peakfile+".gz",
               description=set_file_descr(name[1]+'_annotated_peaks.txt.gz', type='text',
                                          step='annotation', groupId=name[0]))
    stracks = [track(wig,info={'name':name+"_"+st})
               for name,wigdict in merged_wig.iteritems() for st,wig in wigdict.iteritems()]
    tablefile = unique_filename_in()
    with open(tablefile,"w") as _tf:
        _pnames = ["MACS_%s_vs_%s" %(_s[1],_c[1]) if _c[1] else "MACS_%s" %_s[1]
                   for _s in names['tests'] for _c in names['controls']]
        _tf.write("\t".join(['#chromosome','start','end']+_pnames+[s.name for s in stracks])+"\n")
    #### need to do something about peak origin (split names, write to separate columns?)
    for chrom in assembly.chrnames:
        pk_lst = [apply(pt.read(chrom,fields=['chr','start','end','name']),
                        'name', lambda __n,_n=npt: "%s:%i" %(__n,_n))
                  for npt,pt in enumerate(peakfile_list)]
        features = fusion(concatenate(pk_lst, fields=['chr','start','end','name'],
                                      remove_duplicates=True, group_by=['chr','start','end']))
        sread = [sig.read(chrom) for sig in stracks]
        quantifs = score_by_feature(sread, features, method='sum')
        nidx = quantifs.fields.index('name')
        _ns = len(tests)
        _nc = len(controls)
        with open(tablefile,"a") as _tf:
            for row in quantifs:
                pcols = ['']*_ns*_nc
                _rnsplit = row[nidx].split(":")
                _n1 = _rnsplit[0]
                _k = 0
                while ( _k < len(_rnsplit)-1-int(_nc>1) ):
                    if _nc > 1:
                        _k += 2
                        _n2 = _rnsplit[_k-1]
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])*_nc+int(_n2)] = _n1
                    else:
                        _k += 1
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])] = _n1
                    _n1 = "|".join(_n[1:])
                _tf.write("\t".join(str(tt) for tt in row[:nidx]+tuple(pcols)+row[nidx+1:])+"\n")
    gzipfile(ex,tablefile)
    ex.add(tablefile+".gz",
           description=set_file_descr('Combined_peak_quantifications.txt.gz', type='text',
                                      step='summary'))

    if run_meme:
        from bbcflib.motif import parallel_meme
        logfile.write("Starting MEME.\n");logfile.flush()
        processed['meme'] = parallel_meme( ex, assembly, peak_list.values(), name=peak_list.keys(),
                                           chip=True, meme_args=['-meme-nmotifs','4','-meme-mod','zoops'],
                                           via=via )
    return processed
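# Worked example of the enrichment bounds mentioned in the docstring above:
# given a per-group Poisson threshold T, the MACS '-m' bounds become
# (min(30,5*(T+1)), 50*(T+1)). The threshold value used here is illustrative.
def _example_macs_enrichment_bounds(T=3):
    m_lower = min(30, 5*(T+1))            # 20 for T=3
    m_upper = 50*(T+1)                    # 200 for T=3
    return "%i,%i" % (m_lower, m_upper)   # e.g. passed to MACS as "-m 20,200"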
def c4seq_workflow( ex, job, primers_dict, assembly,
                    c4_url=None, script_path='', logfile=sys.stdout, via='lsf' ):
    '''
    Main
    * open the 4C-seq minilims and create execution
    * 0. get/create the library
    * 1. if necessary, calculate the density file from the bam file (mapseq.parallel_density_sql)
    * 2. calculate the count per fragment for each density file with gfminer:score_by_feature
    '''
    mapseq_files = job.files
    ### outputs
    processed = {'lib': {}, 'density': {}, '4cseq': {}}
    processed['4cseq'] = {'density_files': {},
                          'countsPerFrag': {},
                          'countsPerFrag_grp': {},
                          'norm': {},
                          'norm_grp': {},
                          'profileCorrection': {},
                          'profileCorrection_grp': {},
                          'smooth_grp': {},
                          'domainogram_grp': {},
                          'bricks2frags': {}}
                          # was 'smoothFrag': {}, 'domainogram': {}
    regToExclude = {}
    new_libs = []
    ### options
    run_domainogram = {}
    before_profile_correction = {}
    if not job.options.get('viewpoints_chrs',False):
        out_chromosomes = ','.join([ch for ch in assembly.chrnames])
    else:
        out_chromosomes = ','.join([primers_dict.get(group['name'],{}).get('baitcoord').split(':')[0]
                                    for gid,group in job.groups.iteritems()])
    print "out_chromosomes=" + out_chromosomes + "\n"

    sizeExt = job.options.get('norm_reg',1000000)
    print "region considered for normalisation: mid viewpoint +/-" + str(sizeExt) + " bps"

    ### do it
    for gid, group in job.groups.iteritems():
        run_domainogram[gid] = group.get('run_domainogram',False)
        if isinstance(run_domainogram[gid],basestring):
            run_domainogram[gid] = (run_domainogram[gid].lower() in ['1','true','on','t'])
        before_profile_correction[gid] = group.get('before_profile_correction',False)
        if isinstance(before_profile_correction[gid],basestring):
            before_profile_correction[gid] = (before_profile_correction[gid].lower() in ['1','true','on','t'])
        processed['lib'][gid] = get_libForGrp(ex, group, assembly, new_libs, gid, c4_url, via=via)
        #reffile='/archive/epfl/bbcf/data/DubouleDaan/library_Nla_30bps/library_Nla_30bps_segmentInfos.bed'
        processed['4cseq']['density_files'][gid] = {}
        regToExclude[gid] = primers_dict.get(group['name'],{}).get('regToExclude',"").replace('\r','')
        # if no regToExclude defined, set it as mid_baitCoord +/-5kb
        if len(regToExclude[gid]) == 0:
            baitcoord_mid = int(0.5 * (int(primers_dict.get(group['name'],{}).get('baitcoord').split(':')[1].split('-')[0])
                                       + int(primers_dict.get(group['name'],{}).get('baitcoord').split(':')[1].split('-')[1])))
            regToExclude[gid] = primers_dict.get(group['name'],{}).get('baitcoord').split(':')[0] + ':' + str(baitcoord_mid-5000) + '-' + str(baitcoord_mid+5000)
        #print(';'.join([k+"="+v for k,v in primers_dict.get(group['name'],{}).iteritems()]))
        print(primers_dict.get(group['name'],{}))
        print "regToExclude["+str(gid)+"]="+regToExclude[gid]
        for rid,run in group['runs'].iteritems():
            libname = mapseq_files[gid][rid]['libname']
            if job.options.get('merge_strands') != 0 or not('wig' in mapseq_files[gid][rid]):
                density_file = parallel_density_sql( ex, mapseq_files[gid][rid]['bam'],
                                                     assembly.chrmeta,
                                                     nreads=mapseq_files[gid][rid]['stats']["total"],
                                                     merge=0,
                                                     read_extension=mapseq_files[gid][rid]['stats']['read_length'],
                                                     convert=False, via=via )
                density_file += "merged.sql"
                ex.add( density_file,
                        description=set_file_descr("density_file_"+libname+".sql",
                                                   groupId=gid, step="density", type="sql",
                                                   view='admin', gdv="1") )
            else:
                density_file = mapseq_files[gid][rid]['wig']['merged']
            #density_files.append(density_file)
            processed['4cseq']['density_files'][gid][rid] = density_file

    # back to grp level!
    # not anymore:
    # processed['density'][gid] = merge_sql(ex, density_files, via=via)

    processed['4cseq']['countsPerFrag'] = density_to_countsPerFrag( ex, processed, job.groups, assembly,
                                                                    regToExclude, script_path, via )
    ## access per gid+rid

    futures_norm = {}
    countsPerFrags_bedGraph = {}
    futures_merged_raw = {}
    for gid, group in job.groups.iteritems():
        futures_norm[gid] = {}
        countsPerFrags_bedGraph[gid] = {}
        processed['4cseq']['norm'][gid] = {}
        for rid,run in group['runs'].iteritems():
            normfile = unique_filename_in()
            touch(ex, normfile)
            resfile = unique_filename_in()+".bedGraph"
            resfiles = processed['4cseq']['countsPerFrag'][gid][rid]  # _all.sql
            convert(resfiles[3], resfile)
            countsPerFrags_bedGraph[gid][rid] = resfile
            print "call normFrags: infiles="+resfile+", normfile="+normfile+", baitCoord="+primers_dict[group['name']]['baitcoord']+", sizeExt="+str(sizeExt)+", name="+group['name']+"rep_"+str(rid)+", regToExclude="+regToExclude[gid]+"\n"
            futures_norm[gid][rid] = normFrags.nonblocking( ex, resfile, normfile,
                                                            baitCoord=primers_dict[group['name']]['baitcoord'],
                                                            sizeExt=sizeExt,
                                                            name=group['name']+"rep_"+str(rid),
                                                            regToExclude=regToExclude[gid],
                                                            script_path=script_path, via=via )
            processed['4cseq']['norm'][gid][rid] = normfile

        if len(group) > 1:
            ## merge replicates before normalisation.
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name']+"_raw_mergedRep"
            print "gid="+group['name']
            print "call mergeRep for replicates before normalisation: infiles="+",".join([res_rid for rid,res_rid in countsPerFrags_bedGraph[gid].iteritems()])+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_merged_raw[gid] = mergeRep.nonblocking( ex,
                                                            ",".join([res_rid for rid,res_rid in countsPerFrags_bedGraph[gid].iteritems()]),
                                                            mergefile, regToExclude[gid],
                                                            name=titleName,
                                                            script_path=script_path, via=via, memory=8 )
            processed['4cseq']['countsPerFrag_grp'][gid] = mergefile
        else:
            futures_merged_raw[gid] = None
            processed['4cseq']['countsPerFrag_grp'][gid] = countsPerFrags_bedGraph[gid][0]  # if no replicates, then the file we want is the 1st one

    print "***** profile correction / sample + merge normalised data"
    futures_merged = {}   # per gid
    futures_profcor = {}  # per gid, per rid
    for gid, group in job.groups.iteritems():
        ## run profile correction per run then merge them
        futures_profcor[gid] = {}
        processed['4cseq']['profileCorrection'][gid] = {}
        for rid, run in group['runs'].iteritems():
            # wait for normalisation of all replicates to be finished
            futures_norm[gid][rid].wait()   ## normalised files, per grp, per rep
            normfile = processed['4cseq']['norm'][gid][rid]
            file1 = unique_filename_in()    # track file
            touch(ex, file1)
            file2 = unique_filename_in()    # report file
            touch(ex, file2)
            file3 = unique_filename_in()    # table file
            touch(ex, file3)
            print "call profileCorrection: normfile="+normfile+", baitCoord="+primers_dict[group['name']]['baitcoord']+", name="+group['name']+", file1="+file1+", file2="+file2+", file3="+file3+"\n"
            futures_profcor[gid][rid] = profileCorrection.nonblocking( ex, normfile,
                                                                       primers_dict[group['name']]['baitcoord'],
                                                                       group['name'], file1, file2, file3,
                                                                       script_path, via=via )
            processed['4cseq']['profileCorrection'][gid][rid] = [file1, file2, file3]

        ## merge replicates before profile correction. Needs all normalisation for the given grp
        ## to be finished, this is why it comes after the rid loop.
        if len(group) > 1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name']+"_norm_mergedRep"
            print "gid="+group['name']
            print "call mergeRep: infiles="+",".join([res_rid for rid,res_rid in processed['4cseq']['norm'][gid].iteritems()])+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_merged[gid] = mergeRep.nonblocking( ex,
                                                        ",".join([res_rid for rid,res_rid in processed['4cseq']['norm'][gid].iteritems()]),
                                                        mergefile, regToExclude[gid],
                                                        name=titleName,
                                                        script_path=script_path, via=via, memory=8 )
            processed['4cseq']['norm_grp'][gid] = mergefile
        else:
            futures_merged[gid] = None
            processed['4cseq']['norm_grp'][gid] = processed['4cseq']['norm'][gid][0]  ## if no replicates, then the file we want is the 1st one

    print "***** merge profile corrected data"
    futures_profcor_merged = {}  # per gid
    for gid, group in job.groups.iteritems():
        processed['4cseq']['profileCorrection_grp'][gid] = {}
        for rid, run in group['runs'].iteritems():
            futures_profcor[gid][rid].wait()   ## wait for ProfileCorrection to be finished
        ## merge replicates after profile correction
        if len(group) > 1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name']+"_ProfCor_mergedRep"
            pcfiles = [processed['4cseq']['profileCorrection'][gid][rid][0]
                       for rid,res_rid in processed['4cseq']['profileCorrection'][gid].iteritems()]
            print "call mergeRep (for PC tables): infiles="+",".join(pcfiles)+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_profcor_merged[gid] = mergeRep.nonblocking( ex, ",".join(pcfiles),
                                                                mergefile, regToExclude[gid],
                                                                name=titleName,
                                                                script_path=script_path, via=via, memory=8 )
            processed['4cseq']['profileCorrection_grp'][gid] = mergefile
        else:
            futures_profcor_merged[gid] = None
            processed['4cseq']['profileCorrection_grp'][gid] = processed['4cseq']['profileCorrection'][gid][0]  ## if no replicates, then the file we want is the 1st one

    print "***** smooth data"
    futures_smoothed = {}
    for gid, group in job.groups.iteritems():
        file1 = unique_filename_in()
        touch(ex, file1)
        file2 = unique_filename_in()
        touch(ex, file2)
        file3 = unique_filename_in()
        touch(ex, file3)
        nFragsPerWin = group['window_size']
        if futures_merged_raw[gid] is not None:
            futures_merged_raw[gid].wait()   ## wait for merging of raw_grp to be completed
        futures_smoothed[gid] = ( smoothFragFile.nonblocking( ex, processed['4cseq']['countsPerFrag_grp'][gid],
                                                              nFragsPerWin, group['name'],
                                                              file1, regToExclude[gid],
                                                              script_path=script_path, via=via, memory=6 ), )
        if futures_merged[gid] is not None:
            futures_merged[gid].wait()       ## wait for merging of norm_grp to be completed
        futures_smoothed[gid] += ( smoothFragFile.nonblocking( ex, processed['4cseq']['norm_grp'][gid],
                                                               nFragsPerWin, group['name']+"_norm",
                                                               file2, regToExclude[gid],
                                                               script_path=script_path, via=via, memory=6 ), )
        if futures_profcor_merged[gid] is not None:
            futures_profcor_merged[gid].wait()  # wait for the merging of profile corrected data to be done
        futures_smoothed[gid] += ( smoothFragFile.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid],
                                                               nFragsPerWin, group['name']+"_fromProfileCorrected",
                                                               file3, regToExclude[gid],
                                                               script_path=script_path, via=via, memory=6 ), )
        processed['4cseq']['smooth_grp'][gid] = [file1, file2, file3]  # [smoothed file before norm, smoothed file before PC, smoothed file after PC]

    print "***** Domainograms"
    futures_domainograms = {}
    for gid, group in job.groups.iteritems():
        grName = job.groups[gid]['name']
        if run_domainogram[gid]:
            regCoord = regToExclude[gid] or primers_dict[grName]['baitcoord']
            if before_profile_correction[gid]:
                futures_domainograms[gid] = runDomainogram.nonblocking( ex, processed['4cseq']['norm_grp'][gid],
                                                                        grName, regCoord=regCoord, skip=1,
                                                                        script_path=script_path, via=via, memory=15 )
            else:
                futures_domainograms[gid] = runDomainogram.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid],
                                                                        grName, regCoord=regCoord.split(':')[0], skip=1,
                                                                        script_path=script_path, via=via, memory=15 )

    ## prepare tar files for domainogram results (if any)
    ## and create "BRICKS to frags" files
    print "***** BRICKS to Frags"
    futures_BRICKS2Frags = {}
    for gid, f in futures_domainograms.iteritems():
        if run_domainogram[gid]:  # if domainogram has been run
            resFiles = []
            logFile = f.wait()
            start = False
            tarname = job.groups[gid]['name']+"_domainogram.tar.gz"
            res_tar = tarfile.open(tarname, "w:gz")
            futures_BRICKS2Frags[gid] = []
            processed['4cseq']['bricks2frags'][gid] = []
            if logFile is None: continue
            with open(logFile) as f:
                for s in f:
                    s = s.strip()
                    if '####resfiles####' in s:
                        start = True
                    elif start and "RData" not in s:
                        resFiles.append(s)
                        res_tar.add(s)
                    if start and "foundBRICKS" in s:
                        bricks2fragsfile = unique_filename_in()+".bedGraph"
                        touch(ex, bricks2fragsfile)
                        futures_BRICKS2Frags[gid] += [ BRICKSToFrag.nonblocking(ex, s,
                                                                                processed['4cseq']['norm_grp'][gid],
                                                                                bricks2fragsfile,
                                                                                script_path=script_path, via=via, memory=4 ) ]
                        processed['4cseq']['bricks2frags'][gid] += [ bricks2fragsfile ]
            res_tar.close()
            processed['4cseq']['domainogram_grp'][gid] = resFiles + [tarname]

    ############### prepare tables for global results
    print "***** combine results into tables"
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        for rid,run in group['runs'].iteritems():
            allNames += [ group['name']+"_rep"+str(rid)+"_norm", group['name']+"_rep"+str(rid)+"_fit" ]
            allFiles += [ processed['4cseq']['profileCorrection'][gid][rid][2] ]
            allRegToExclude += [ regToExclude[gid] ]
    tablePC = unique_filename_in()+".txt"
    print("***will call makeTable with:")
    print(",".join(allFiles))
    print("resfile="+tablePC)
    print(",".join(allNames))
    touch(ex, tablePC)
    #regToExclude[gid]
    futures_tables = (makeTable.nonblocking(ex, ",".join(allFiles), tablePC, ",".join(allNames), idCols="4,5",
                                            all_regToExclude=','.join(allRegToExclude),
                                            script_path=script_path, via=via, memory=8 ), )

    # wait for all smoothing to be done
    for gid, fg in futures_smoothed.iteritems():
        for f in fg: f.wait()

    ## make Table raw/smoothed_raw
    print("** make Table raw/smoothed_raw")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        if futures_merged_raw[gid] is not None:
            futures_merged_raw[gid].wait()
        allNames += [ group['name']+"_raw", group['name']+"_rawSmoothed" ]
        allFiles += [ processed['4cseq']['countsPerFrag_grp'][gid], processed['4cseq']['smooth_grp'][gid][0] ]
        allRegToExclude += [ 'NA', regToExclude[gid] ]

    tableSmoothedRaw_grp = unique_filename_in()+".txt"
    touch(ex, tableSmoothedRaw_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothedRaw_grp, ",".join(allNames), idCols="4",
                                             out_chromosomes=out_chromosomes,
                                             all_regToExclude=','.join(allRegToExclude),
                                             script_path=script_path, via=via, memory=8 ), )

    ## make Table norm/smoothed_norm before PC
    print("** make Table norm/smoothed_norm before PC")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        allNames += [ group['name']+"_norm", group['name']+"_smoothed" ]
        allFiles += [ processed['4cseq']['norm_grp'][gid], processed['4cseq']['smooth_grp'][gid][1] ]
        allRegToExclude += [ regToExclude[gid], regToExclude[gid] ]

    tableSmoothed_grp = unique_filename_in()+".txt"
    touch(ex, tableSmoothed_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothed_grp, ",".join(allNames), idCols="4",
                                             out_chromosomes=out_chromosomes,
                                             all_regToExclude=','.join(allRegToExclude),
                                             script_path=script_path, via=via, memory=8 ), )

    ## make Table norm/smoothed_norm after PC
    print("** make Table norm/smoothed_norm after PC")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        allNames += [ group['name']+"_normPC", group['name']+"_smoothedPC" ]
        allFiles += [ processed['4cseq']['profileCorrection_grp'][gid], processed['4cseq']['smooth_grp'][gid][2] ]
        allRegToExclude += [ regToExclude[gid], regToExclude[gid] ]

    tableSmoothedPC_grp = unique_filename_in()+".txt"
    touch(ex, tableSmoothedPC_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothedPC_grp, ",".join(allNames), idCols="4",
                                             out_chromosomes=out_chromosomes,
                                             all_regToExclude=','.join(allRegToExclude),
                                             script_path=script_path, via=via, memory=8 ), )

    ## combine BRICKS2Frags files
    allNames = []
    allFiles = []
    for gid, fg in futures_BRICKS2Frags.iteritems():
        for f in fg: f.wait()
        allNames += [ job.groups[gid]['name']+"_BRICKSpval" ]
        cat_bricks2frags = unique_filename_in()+".txt"
        print ','.join(processed['4cseq']['bricks2frags'][gid])
        cat_bricks2frags = cat(processed['4cseq']['bricks2frags'][gid], out=cat_bricks2frags)
        allFiles += [ cat_bricks2frags ]

    for gid, fg in futures_smoothed.iteritems():
        for f in fg: f.wait()

    tableBRICKS2Frags = unique_filename_in()+".txt"
    touch(ex, tableBRICKS2Frags)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableBRICKS2Frags, ",".join(allNames), idCols="4",
                                             out_chromosomes=out_chromosomes, defVal="NA",
                                             script_path=script_path, via=via, memory=8 ), )

    for f in futures_tables: f.wait()

    ################ Add everything to minilims below!
    step = "density"
    for gid in processed['4cseq']['density_files'].keys():
        for rid, sql in processed['4cseq']['density_files'][gid].iteritems():
            fname = "density_file_"+job.groups[gid]['name']+"_merged_rep"+str(rid)
            ex.add( sql, description=set_file_descr( fname+".sql",
                                                     groupId=gid, step=step, type="sql", gdv="1" ) )
            wig = unique_filename_in()+".bw"
            convert( sql, wig )
            ex.add( wig, description=set_file_descr( fname+".bw",
                                                     groupId=gid, step=step, type="bigWig", ucsc="1") )

    step = "counts_per_frag"  # was _norm_counts_per_frags
    # before normalisation process, per replicate
    for gid in processed['4cseq']['countsPerFrag'].keys():
        for rid, resfiles in processed['4cseq']['countsPerFrag'][gid].iteritems():
            fname = "meanScorePerFeature_"+job.groups[gid]['name']+"_rep"+str(rid)
            ex.add( resfiles[1], description=set_file_descr( fname+".sql",
                                                             groupId=gid, step=step, type="sql",
                                                             view="admin", gdv='1'))
            #gzipfile(ex,resfiles[0])
            #ex.add( resfiles[0]+".gz", description=set_file_descr( fname+".bed.gz",
            #                                                       groupId=gid,step=step,type="bed",view="admin" ))
            fname = "segToFrag_"+job.groups[gid]['name']+"_rep"+str(rid)
            ex.add( resfiles[3], description=set_file_descr( fname+"_all.sql",
                                                             groupId=gid, step=step, type="sql",
                                                             comment="all informative frags - null included" ))
            trsql = track(resfiles[3])
            bwig = unique_filename_in()+".bw"
            trwig = track(bwig, chrmeta=trsql.chrmeta)
            trwig.write(trsql.read(fields=['chr','start','end','score'],
                                   selection={'score':(0.01,sys.maxint)}))
            trwig.close()
            ex.add( bwig, set_file_descr(fname+".bw", groupId=gid, step=step, type="bigWig", ucsc='1'))
        ## add segToFrags before normalisation
        if futures_merged_raw[gid] is not None:
            futures_merged_raw[gid].wait()
        trbedgraph = track(removeNA(processed['4cseq']['countsPerFrag_grp'][gid]), format='bedgraph')
        bwig = unique_filename_in()+".bw"
        trwig = track(bwig, chrmeta=assembly.chrmeta)
        trwig.write(trbedgraph.read(fields=['chr','start','end','score'],
                                    selection={'score':(0.01,sys.maxint)}))
        trwig.close()
        fname = "segToFrag_"+job.groups[gid]['name']
        ex.add( bwig, description=set_file_descr( fname+".bw",
                                                  groupId=gid, step=step, type="bigWig",
                                                  comment="segToFrag file before normalisation" ))

    step = "norm_counts_per_frags"
    # after new normalisation process, combined replicates
    for gid, resfile in processed['4cseq']['norm_grp'].iteritems():
        fname = "normalised_scorePerFeature_"+job.groups[gid]['name']
        gzipfile(ex, resfile)
        ex.add( resfile+".gz", description=set_file_descr( fname+".bedGraph.gz",
                                                           groupId=gid, step=step,
                                                           type="bedGraph", ucsc='1'))
    # norm files, per replicate (might be removed)
    for gid, dict_gid in processed['4cseq']['norm'].iteritems():
        for rid, resfile in dict_gid.iteritems():
            fname = "normalised_scorePerFeature_"+job.groups[gid]['name']+"_rep"+str(rid)
            gzipfile(ex, resfile)
            ex.add(resfile+".gz",
                   description=set_file_descr(fname+".bedGraph.gz", groupId=gid, step=step,
                                              type="bedGraph", ucsc='1', gdv='1'))

    step = "profile_correction"
    # Profile corrected data, combined replicates
    for gid, profileCorrectedFile in processed['4cseq']['profileCorrection_grp'].iteritems():
        fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected"
        gzipfile(ex, profileCorrectedFile)
        ex.add( profileCorrectedFile+".gz",
                description=set_file_descr(fname+".bedGraph.gz", groupId=gid, step=step,
                                           type="bedGraph", ucsc='1', gdv='1'))
    # Profile corrected, per replicate (might be removed)
    for gid, dict_gid in processed['4cseq']['profileCorrection'].iteritems():
        for rid, resfiles in dict_gid.iteritems():
            #profileCorrectedFile = resfiles[0]
            reportProfileCorrection = resfiles[1]
            fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected_rep"+str(rid)
            #gzipfile(ex,profileCorrectedFile)
            #ex.add( profileCorrectedFile+".gz",
            #        description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
            ex.add( reportProfileCorrection, description=set_file_descr(fname+".pdf",
                                                                        groupId=gid, step=step, type="pdf"))

    step = "smoothing"
    for gid, resfiles in processed['4cseq']['smooth_grp'].iteritems():
        rawSmoothFile = resfiles[0]
        smoothFile = resfiles[1]
        afterProfileCorrection = resfiles[2]
        nFrags = str(job.groups[gid]['window_size'])
        ## smoothed file before normalisation
        fname = "segToFrag_"+job.groups[gid]['name']+"_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz"
        gzipfile(ex, rawSmoothFile)
        ex.add(rawSmoothFile+".gz",
               description=set_file_descr(fname, groupId=gid, step=step, type="bedGraph", ucsc='1', gdv='1'))
        ## smoothed file after normalisation, before profile correction
        fname = "segToFrag_"+job.groups[gid]['name']+"_norm_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz"
        gzipfile(ex, smoothFile)
        ex.add(smoothFile+".gz",
               description=set_file_descr(fname, groupId=gid, step=step, type="bedGraph", ucsc='1', gdv='1'))
        ## smoothed file after normalisation, after profile correction
        fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz"
        gzipfile(ex, afterProfileCorrection)
        ex.add(afterProfileCorrection+".gz",
               description=set_file_descr(fname, groupId=gid, step=step, type="bedGraph", ucsc='1', gdv='1'))

    step = "domainograms"
    for gid, resfiles in processed['4cseq']['domainogram_grp'].iteritems():
        tarFile = resfiles.pop()
        fname = job.groups[gid]['name']+"_domainogram.tar.gz"
        ex.add(tarFile, description=set_file_descr(fname, groupId=gid, step=step, type="tgz"))
        for s in resfiles:
            if s[-8:] == "bedGraph":
                gzipfile(ex, s)
                s += ".gz"
                ex.add( s, description=set_file_descr( s, groupId=gid, step=step,
                                                       type="bedGraph", ucsc="1", gdv="1"))

    step = "combined_results"
    gzipfile(ex, tableSmoothedRaw_grp)
    ex.add(tableSmoothedRaw_grp+".gz",
           description=set_file_descr("table_segToFrags_smoothed_combined_replicates.txt.gz", step=step, type="txt"))
    gzipfile(ex, tableSmoothed_grp)
    ex.add(tableSmoothed_grp+".gz",
           description=set_file_descr("table_normalised_smoothed_combined_replicates.txt.gz", step=step, type="txt"))
    gzipfile(ex, tableSmoothedPC_grp)
    ex.add(tableSmoothedPC_grp+".gz",
           description=set_file_descr("table_profileCorrected_smoothed_combined_replicates.txt.gz", step=step, type="txt"))
    gzipfile(ex, tablePC)
    ex.add(tablePC+".gz",
           description=set_file_descr("table_normalised_fit_per_replicates.txt.gz", step=step, type="txt"))
    gzipfile(ex, tableBRICKS2Frags)
    ex.add(tableBRICKS2Frags+".gz",
           description=set_file_descr("table_frags_in_BRICKS_combined_replicates.txt.gz", step=step, type="txt"))

    return processed
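# Worked example of the default exclusion region computed near the top of
# c4seq_workflow: when the primers table gives no 'regToExclude', it is set to
# the bait midpoint +/- 5 kb. The coordinates below are illustrative placeholders.
def _example_default_regToExclude(baitcoord="chr2:74000000-74005000"):
    chrom, coords = baitcoord.split(':')
    start, end = coords.split('-')
    mid = int(0.5*(int(start)+int(end)))
    return chrom + ':' + str(mid-5000) + '-' + str(mid+5000)   # -> "chr2:73997500-74007500"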
= runDomainogram.nonblocking( ex, processed['4cseq']['norm_grp'][gid], grName, regCoord=regCoord, skip=1, script_path=script_path, via=via, memory=15) else: futures_domainograms[gid] = runDomainogram.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid], grName, regCoord=regCoord.split(':')[0], skip=1, script_path=script_path, via=via, memory=15) ## prepare tar files for domainogram results (if any) ## and create "BRICKS to frags" files print "***** BRICKS to Frags" futures_BRICKS2Frags = {} for gid, f in futures_domainograms.iteritems(): if run_domainogram[gid]: # if domainogram has been run resFiles = [] logFile = f.wait() start = False tarname = job.groups[gid]['name'] + "_domainogram.tar.gz" res_tar = tarfile.open(tarname, "w:gz") futures_BRICKS2Frags[gid] = [] processed['4cseq']['bricks2frags'][gid] = [] if logFile is None: continue with open(logFile) as f: for s in f: s = s.strip() if '####resfiles####' in s: start = True elif start and "RData" not in s: resFiles.append(s) res_tar.add(s) if start and "foundBRICKS" in s: bricks2fragsfile = unique_filename_in() + ".bedGraph" touch(ex, bricks2fragsfile) futures_BRICKS2Frags[gid] += [ BRICKSToFrag.nonblocking( ex, s, processed['4cseq']['norm_grp'][gid], bricks2fragsfile, script_path=script_path, via=via, memory=4) ] processed['4cseq']['bricks2frags'][gid] += [ bricks2fragsfile ] res_tar.close() processed['4cseq']['domainogram_grp'][gid] = resFiles + [tarname] ############### prepare tables for global results print "***** combine results into tables " allNames = [] allFiles = [] allRegToExclude = [] for gid, group in job.groups.iteritems(): for rid, run in group['runs'].iteritems(): allNames += [ group['name'] + "_rep" + str(rid) + "_norm", group['name'] + "_rep" + str(rid) + "_fit" ] allFiles += [processed['4cseq']['profileCorrection'][gid][rid][2]] allRegToExclude += [regToExclude[gid]] tablePC = unique_filename_in() + ".txt" print("***will call makeTable with:") print(",".join(allFiles)) print("resfile=" + tablePC) print(",".join(allNames)) touch(ex, tablePC) #regToExclude[gid] futures_tables = (makeTable.nonblocking( ex, ",".join(allFiles), tablePC, ",".join(allNames), idCols="4,5", all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8), ) # wait for all smoothing to be done for gid, fg in futures_smoothed.iteritems(): for f in fg: f.wait() ## make Table raw/smoothed_raw print("** make Table raw/smoothed_raw") allNames = [] allFiles = [] allRegToExclude = [] for gid, group in job.groups.iteritems(): futures_merged_raw[gid].wait() allNames += [group['name'] + "_raw", group['name'] + "_rawSmoothed"] allFiles += [ processed['4cseq']['countsPerFrag_grp'][gid], processed['4cseq']['smooth_grp'][gid][0] ] allRegToExclude += ['NA', regToExclude[gid]] tableSmoothedRaw_grp = unique_filename_in() + ".txt" touch(ex, tableSmoothedRaw_grp) futures_tables += (makeTable.nonblocking( ex, ",".join(allFiles), tableSmoothedRaw_grp, ",".join(allNames), idCols="4", out_chromosomes=out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8), ) ## make Table norm/smoothed_norm before PC print("** make Table norm/smoothed_norm befor PC") allNames = [] allFiles = [] allRegToExclude = [] for gid, group in job.groups.iteritems(): allNames += [group['name'] + "_norm", group['name'] + "_smoothed"] allFiles += [ processed['4cseq']['norm_grp'][gid], processed['4cseq']['smooth_grp'][gid][1] ] allRegToExclude += [regToExclude[gid], regToExclude[gid]] tableSmoothed_grp = 
unique_filename_in() + ".txt" touch(ex, tableSmoothed_grp) futures_tables += (makeTable.nonblocking( ex, ",".join(allFiles), tableSmoothed_grp, ",".join(allNames), idCols="4", out_chromosomes=out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8), ) ## make Table norm/smoothed_norm after PC print("** make Table norm/smoothed_norm after PC") allNames = [] allFiles = [] allRegToExclude = [] for gid, group in job.groups.iteritems(): allNames += [group['name'] + "_normPC", group['name'] + "_smoothedPC"] allFiles += [ processed['4cseq']['profileCorrection_grp'][gid], processed['4cseq']['smooth_grp'][gid][2] ] allRegToExclude += [regToExclude[gid], regToExclude[gid]] tableSmoothedPC_grp = unique_filename_in() + ".txt" touch(ex, tableSmoothedPC_grp) futures_tables += (makeTable.nonblocking( ex, ",".join(allFiles), tableSmoothedPC_grp, ",".join(allNames), idCols="4", out_chromosomes=out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8), ) ## combine BRICKS2Frags files allNames = [] allFiles = [] for gid, fg in futures_BRICKS2Frags.iteritems(): for f in fg: f.wait() allNames += [job.groups[gid]['name'] + "_BRICKSpval"] cat_bricks2frags = unique_filename_in() + ".txt" print ','.join(processed['4cseq']['bricks2frags'][gid]) cat_bricks2frags = cat(processed['4cseq']['bricks2frags'][gid], out=cat_bricks2frags) allFiles += [cat_bricks2frags] for gid, fg in futures_smoothed.iteritems(): for f in fg: f.wait() tableBRICKS2Frags = unique_filename_in() + ".txt" touch(ex, tableBRICKS2Frags) futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableBRICKS2Frags, ",".join(allNames), idCols="4", out_chromosomes=out_chromosomes, defVal="NA", script_path=script_path, via=via, memory=8), ) for f in futures_tables: f.wait() ################ Add everything to minilims below! 
step = "density" for gid in processed['4cseq']['density_files'].keys(): for rid, sql in processed['4cseq']['density_files'][gid].iteritems(): fname = "density_file_" + job.groups[gid][ 'name'] + "_merged_rep" + str(rid) ex.add(sql, description=set_file_descr(fname + ".sql", groupId=gid, step=step, type="sql", gdv="1")) wig = unique_filename_in() + ".bw" convert(sql, wig) ex.add(wig, description=set_file_descr(fname + ".bw", groupId=gid, step=step, type="bigWig", ucsc="1")) step = "counts_per_frag" #was _norm_counts_per_frags # before normalisation process, per replicate for gid in processed['4cseq']['countsPerFrag'].keys(): for rid, resfiles in processed['4cseq']['countsPerFrag'][ gid].iteritems(): fname = "meanScorePerFeature_" + job.groups[gid][ 'name'] + "_rep" + str(rid) ex.add(resfiles[1], description=set_file_descr(fname + ".sql", groupId=gid, step=step, type="sql", view="admin", gdv='1')) #gzipfile(ex,resfiles[0]) #ex.add( resfiles[0]+".gz", description=set_file_descr( fname+".bed.gz", # groupId=gid,step=step,type="bed",view="admin" )) fname = "segToFrag_" + job.groups[gid]['name'] + "_rep" + str(rid) ex.add(resfiles[3], description=set_file_descr( fname + "_all.sql", groupId=gid, step=step, type="sql", comment="all informative frags - null included")) trsql = track(resfiles[3]) bwig = unique_filename_in() + ".bw" trwig = track(bwig, chrmeta=trsql.chrmeta) trwig.write( trsql.read(fields=['chr', 'start', 'end', 'score'], selection={'score': (0.01, sys.maxint)})) trwig.close() ex.add( bwig, set_file_descr(fname + ".bw", groupId=gid, step=step, type="bigWig", ucsc='1')) ## add segToFrags before normalisation futures_merged_raw[gid].wait() trbedgraph = track(removeNA( processed['4cseq']['countsPerFrag_grp'][gid]), format='bedgraph') bwig = unique_filename_in() + ".bw" trwig = track(bwig, chrmeta=assembly.chrmeta) trwig.write( trbedgraph.read(fields=['chr', 'start', 'end', 'score'], selection={'score': (0.01, sys.maxint)})) trwig.close() fname = "segToFrag_" + job.groups[gid]['name'] ex.add(bwig, description=set_file_descr( fname + ".bw", groupId=gid, step=step, type="bigWig", comment="segToFrag file before normalisation")) step = "norm_counts_per_frags" # after new normalisation process, combined replicates for gid, resfile in processed['4cseq']['norm_grp'].iteritems(): fname = "normalised_scorePerFeature_" + job.groups[gid]['name'] gzipfile(ex, resfile) ex.add(resfile + ".gz", description=set_file_descr(fname + ".bedGraph.gz", groupId=gid, step=step, type="bedGraph", ucsc='1')) # norm files, per replicates (might be removed) for gid, dict_gid in processed['4cseq']['norm'].iteritems(): for rid, resfile in dict_gid.iteritems(): fname = "normalised_scorePerFeature_" + job.groups[gid][ 'name'] + "_rep" + str(rid) gzipfile(ex, resfile) ex.add(resfile + ".gz", description=set_file_descr(fname + ".bedGraph.gz", groupId=gid, step=step, type="bedGraph", ucsc='1', gdv='1')) step = "profile_correction" # Profile corrected data, combined replicates for gid, profileCorrectedFile in processed['4cseq'][ 'profileCorrection_grp'].iteritems(): fname = "segToFrag_" + job.groups[gid]['name'] + "_profileCorrected" gzipfile(ex, profileCorrectedFile) ex.add(profileCorrectedFile + ".gz", description=set_file_descr(fname + ".bedGraph.gz", groupId=gid, step=step, type="bedGraph", ucsc='1', gdv='1')) # Profile corrected, per replicate (might be removed) for gid, dict_gid in processed['4cseq']['profileCorrection'].iteritems(): for rid, resfiles in dict_gid.iteritems(): # profileCorrectedFile = resfiles[0] 
reportProfileCorrection = resfiles[1] fname = "segToFrag_" + job.groups[gid][ 'name'] + "_profileCorrected_rep" + str(rid) # gzipfile(ex,profileCorrectedFile) # ex.add( profileCorrectedFile+".gz", # description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) ex.add(reportProfileCorrection, description=set_file_descr(fname + ".pdf", groupId=gid, step=step, type="pdf")) step = "smoothing" for gid, resfiles in processed['4cseq']['smooth_grp'].iteritems(): rawSmoothFile = resfiles[0] smoothFile = resfiles[1] afterProfileCorrection = resfiles[2] nFrags = str(job.groups[gid]['window_size']) ## smoothed file before normalisation fname = "segToFrag_" + job.groups[gid][ 'name'] + "_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz" gzipfile(ex, rawSmoothFile) ex.add(rawSmoothFile + ".gz", description=set_file_descr(fname, groupId=gid, step=step, type="bedGraph", ucsc='1', gdv='1')) ## smoothed file after normalisation, before Profile correction fname = "segToFrag_" + job.groups[gid][ 'name'] + "_norm_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz" gzipfile(ex, smoothFile) ex.add(smoothFile + ".gz", description=set_file_descr(fname, groupId=gid, step=step, type="bedGraph", ucsc='1', gdv='1')) ## smoothed file after normalisation, after Profile correction fname = "segToFrag_" + job.groups[gid][ 'name'] + "_profileCorrected_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz" gzipfile(ex, afterProfileCorrection) ex.add(afterProfileCorrection + ".gz", description=set_file_descr(fname, groupId=gid, step=step, type="bedGraph", ucsc='1', gdv='1')) step = "domainograms" for gid, resfiles in processed['4cseq']['domainogram_grp'].iteritems(): tarFile = resfiles.pop() fname = job.groups[gid]['name'] + "_domainogram.tar.gz" ex.add(tarFile, description=set_file_descr(fname, groupId=gid, step=step, type="tgz")) for s in resfiles: if s[-8:] == "bedGraph": gzipfile(ex, s) s += ".gz" ex.add(s, description=set_file_descr(s, groupId=gid, step=step, type="bedGraph", ucsc="1", gdv="1")) step = "combined_results" gzipfile(ex, tableSmoothedRaw_grp) ex.add(tableSmoothedRaw_grp + ".gz", description=set_file_descr( "table_segToFrags_smoothed_combined_replicates.txt.gz", step=step, type="txt")) gzipfile(ex, tableSmoothed_grp) ex.add(tableSmoothed_grp + ".gz", description=set_file_descr( "table_normalised_smoothed_combined_replicates.txt.gz", step=step, type="txt")) gzipfile(ex, tableSmoothedPC_grp) ex.add(tableSmoothedPC_grp + ".gz", description=set_file_descr( "table_profileCorrected_smoothed_combined_replicates.txt.gz", step=step, type="txt")) gzipfile(ex, tablePC) ex.add(tablePC + ".gz", description=set_file_descr( "table_normalised_fit_per_replicates.txt.gz", step=step, type="txt")) gzipfile(ex, tableBRICKS2Frags) ex.add(tableBRICKS2Frags + ".gz", description=set_file_descr( "table_frags_in_BRICKS_combined_replicates.txt.gz", step=step, type="txt")) return processed
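# --- Illustrative sketch (not part of the pipeline) -----------------------------------
# Scheduling pattern used throughout c4seq_workflow above: each external step is
# submitted with <step>.nonblocking(...) so that all groups and replicates run
# concurrently, and .wait() is only called at the point where the next step actually
# needs the result. 'touch_file' is a hypothetical stand-in for normFrags/mergeRep/
# profileCorrection/smoothFragFile/makeTable, written against bein's
# {"arguments", "return_value"} program convention.
from bein import program

@program
def touch_file(filename):
    return {"arguments": ["touch", filename], "return_value": filename}

def _toy_parallel_step(ex, filenames, via='local'):
    # submit everything first ...
    futures = dict((f, touch_file.nonblocking(ex, f, via=via)) for f in filenames)
    # ... then block only where the results are needed
    return dict((f, fut.wait()) for f, fut in futures.iteritems())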
def parallel_meme( ex, assembly, regions, name=None, chip=False, meme_args=None, via='lsf' ): """Fetches sequences, then calls ``meme`` on them and finally saves the results in the repository. """ if meme_args is None: meme_args = [] if not(isinstance(regions,list)): regions = [regions] if not(isinstance(name,list)): name = [name or '_'] futures = {} fasta_files = {} background = assembly.statistics(unique_filename_in(),frequency=True) # genomeRef = assembly.untar_genome_fasta() for i,n in enumerate(name): (fasta, size) = assembly.fasta_from_regions( regions[i], ex=ex ) tmpfile = unique_filename_in() outdir = unique_filename_in() if chip: futures[n] = (outdir, memechip.nonblocking( ex, fasta, outdir, background, args=meme_args, via=via, stderr=tmpfile, memory=6 )) else: futures[n] = (outdir, meme.nonblocking( ex, fasta, outdir, background, maxsize=(size*3)/2, args=meme_args, via=via, stderr=tmpfile, memory=6 )) fasta_files[n] = fasta all_res = {} for n,f in futures.iteritems(): f[1].wait() meme_out = f[0] archive = unique_filename_in() tgz = tarfile.open(archive, "w:gz") tgz.add( meme_out, arcname=n[1]+"_meme", exclude=lambda x: os.path.basename(x) in [fasta_files[n],background] ) tgz.close() ex.add( archive, description=set_file_descr(n[1]+"_meme.tgz", step='meme', type='tar', groupId=n[0]) ) gzipfile(ex,fasta_files[n],args=["-f"]) ex.add( fasta_files[n]+".gz", description=set_file_descr(n[1]+"_sites.fa.gz", step='meme', type='fasta', groupId=n[0]) ) if not(chip) and os.path.exists(os.path.join(meme_out, "meme.xml")): meme_res = parse_meme_xml( ex, os.path.join(meme_out, "meme.xml"), assembly.chrmeta ) if os.path.exists(os.path.join(meme_out, "meme.html")): ex.add( os.path.join(meme_out, "meme.html"), description=set_file_descr(n[1]+"_meme.html", step='meme', type='html', groupId=n[0]) ) ex.add( meme_res['sql'], description=set_file_descr(n[1]+"_meme_sites.sql", step='meme', type='sql', groupId=n[0]) ) for i,motif in enumerate(meme_res['matrices'].keys()): ex.add( meme_res['matrices'][motif], description=set_file_descr(n[1]+"_meme_"+motif+".txt", step='meme', type='txt', groupId=n[0]) ) ex.add( os.path.join(meme_out, "logo"+str(i+1)+".png"), description=set_file_descr(n[1]+"_meme_"+motif+".png", step='meme', type='png', groupId=n[0]) ) all_res[n] = meme_res return all_res
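# Usage sketch (illustrative; all values below are placeholders): the n[0]/n[1] indexing
# above expects each entry of 'name' to pair a group id with a label, matching the
# corresponding entry of 'regions' (the chipseq pipeline above passes
# peak_list.keys()/peak_list.values() in exactly that shape), e.g.
#
#   peak_list = {(1, 'MACS_sample1'): 'sample1_peaks.bed'}
#   motifs = parallel_meme(ex, assembly, peak_list.values(), name=peak_list.keys(),
#                          chip=False, meme_args=['-meme-nmotifs', '2'], via='local')
#
# With chip=False, the returned dict maps each name to the parsed meme.xml results:
# an 'sql' track of motif sites plus one matrix file per discovered motif.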