Example #1
def createLibrary(ex, assembly_or_fasta, params, url=GlobalHtsUrl, via='local'):
    """
    Main call to create the library
    """
    if len(params['primary'])<2:
        print('Some parameters are missing, cannot create the library')
        print('primary='+params['primary']+" ; "+'secondary='+params['secondary'])
        return [None,None,None,None]

    if not isinstance(assembly_or_fasta,genrep.Assembly):
        assembly_or_fasta = genrep.Assembly( ex=ex, fasta=assembly_or_fasta )
    chrnames = assembly_or_fasta.chrnames
    chrom_map = dict((v['ac'],k) for k,v in assembly_or_fasta.chrmeta.iteritems())
    allfiles = assembly_or_fasta.fasta_by_chrom  #assembly_or_fasta.untar_genome_fasta()

    libfiles = dict((c, getRestEnzymeOccAndSeq.nonblocking( ex, f,
                                                            params['primary'], params['secondary'],
                                                            params['length'],  params['type'],
                                                            via=via ))
                    for c, f in allfiles.iteritems())
    resfile = unique_filename_in()
    os.mkdir(resfile)
    bedfiles = {}
    for chrom, future in libfiles.iteritems():
        libfiles[chrom] = future.wait()
        if not os.path.getsize(libfiles[chrom][1])>0:
            time.sleep(60)
            touch(ex,libfiles[chrom][1])
        bedfiles[chrom] = parse_fragFile(libfiles[chrom][1],chrom_map)
    rescov = coverageInRepeats(ex, bedfiles, params['species'], outdir=resfile, via=via)
    bedchrom = [os.path.join(resfile,chrom+".bed") for chrom in chrnames]
    cat(bedchrom,out=resfile+".bed")
    gzipfile(ex,[resfile+".bed"]+bedchrom)
#    resfile_sql = resfile+".sql"
#    track.convert((resfile,'bed'),(resfile_sql,'sql'),assembly=params['species'])
    enz_list = []
    infos_lib = { 'assembly_name':  params['species'],
                  'enzyme1_id':     getEnzymeSeqId(params['primary'], True, enz_list, url),
                  'enzyme2_id':     getEnzymeSeqId(params['secondary'], True, enz_list, url),
                  'segment_length': params['length'],
                  'type':           params['type'],
                  'filename':       resfile }
    return [ libfiles, bedfiles, resfile, infos_lib ]
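A minimal usage sketch for the function above. All values are hypothetical: `ex` is assumed to be a bein execution, and the enzyme sites, segment length, library type and assembly name are placeholders for illustration only.

params = {'primary':   'GATC',    # recognition site of the first restriction enzyme (placeholder)
          'secondary': 'CATG',    # recognition site of the second restriction enzyme (placeholder)
          'length':    '30',      # segment length forwarded to getRestEnzymeOccAndSeq (placeholder)
          'type':      'typeI',   # library type (placeholder)
          'species':   'mm9'}     # assembly name used for repeat coverage (placeholder)
libfiles, bedfiles, resfile, infos_lib = createLibrary(ex, 'mm9', params, via='local')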
Example #2
    def count_reads(self, bamfiles, gtf):
        self.write_log("* Counting reads")

        # Count reads on genes, transcripts with "rnacounter"
        ncond = len(self.conditions)
        tablenames = [None]*ncond
        futures = [None]*ncond
        max_rlen = 0
        counter_options = ["--nh"]
        for bam in bamfiles:
            sam = pysam.Samfile(bam,'rb')
            max_rlen = max(max_rlen, sam.next().rlen)
        counter_options += ["--exon_cutoff", str(max_rlen)]
        bwt_args = self.job.options.get('map_args',{}).get('bwt_args',[])
#        if not "--local" in bwt_args:
#            counter_options += ["--nh"]
        if hasattr(self.assembly,"fasta_origin") or self.assembly.intype==2:
            counter_options += ["--type","transcripts", "--method","raw"]
        else:
            counter_options += ["--type","genes,transcripts", "--method","raw,nnls"]
        if self.stranded:
            counter_options += ["--stranded"]
        for i,c in enumerate(self.conditions):
            tablenames[i] = unique_filename_in()
            futures[i] = rnacounter.nonblocking(self.ex, bamfiles[i], gtf, stdout=tablenames[i], via=self.via,
                               options=counter_options)

        # Put samples together
        for i,c in enumerate(self.conditions):
            try:
                futures[i].wait()
            except Exception as err:
                self.write_debug("Counting failed: %s." % str(err))
                raise err
            if futures[i] is None:
                self.write_debug("Counting failed.")
                raise ValueError("Counting failed.")
        if len(tablenames) > 1:
            joined = unique_filename_in()
            rnacounter_join.nonblocking(self.ex, tablenames, stdout=joined, via=self.via).wait()
        else:
            joined = tablenames[0]

        # Split genes and transcripts into separate files
        genes_filename = unique_filename_in()
        trans_filename = unique_filename_in()
        genes_file = open(genes_filename,"wb")
        trans_file = open(trans_filename,"wb")
        if self.stranded:
            genes_anti_filename = unique_filename_in()
            trans_anti_filename = unique_filename_in()
            genes_anti_file = open(genes_anti_filename,"wb")
            trans_anti_file = open(trans_anti_filename,"wb")
        with open(joined) as jfile:
            header = jfile.readline()
            hconds = ["counts."+c for c in self.conditions] + ["rpkm."+c for c in self.conditions]
            hinfo = header.strip().split('\t')[2*ncond+1:]
            header = '\t'.join(["ID"] + hconds + hinfo)+'\n'
            genes_file.write(header)
            trans_file.write(header)
            type_idx = header.split('\t').index("Type")
            if self.stranded:
                genes_anti_file.write(header)
                trans_anti_file.write(header)
                sense_idx = header.split('\t').index("Sense")
                for line in jfile:
                    L = line.split('\t')
                    ftype = L[type_idx].lower()
                    sense = L[sense_idx].lower()
                    if ftype == 'gene':
                        if sense == 'antisense':
                            genes_anti_file.write(line)
                        else:
                            genes_file.write(line)
                    elif ftype == 'transcript':
                        if sense == 'antisense':
                            trans_anti_file.write(line)
                        else:
                            trans_file.write(line)
            else:
                for line in jfile:
                    L = line.split('\t')
                    ftype = L[type_idx].lower()
                    if ftype == 'gene':
                        genes_file.write(line)
                    elif ftype == 'transcript':
                        trans_file.write(line)
        genes_file.close()
        trans_file.close()

        # Keep intermediate tables
        for i,c in enumerate(self.conditions):
            #shutil.copy(tablenames[i], "../counts%d.txt"%i)
            descr = set_file_descr(self.conditions[i]+'_'+tablenames[i]+'.gz', type='txt', step='pileup', view='admin')
            gzipfile(self.ex, tablenames[i])
            self.ex.add(tablenames[i]+'.gz', description=descr)

        if self.stranded:
            count_files = {'genes':genes_filename, 'transcripts':trans_filename,
                           'genes_anti':genes_anti_filename, 'transcripts_anti':trans_anti_filename}
        else:
            count_files = {'genes':genes_filename, 'transcripts':trans_filename}
        return count_files
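A hedged sketch of how count_reads might be called and its output consumed, assuming `pipeline` is an instance of the class this method belongs to and that `bamfiles` and `gtf` were produced by the preceding mapping step (all names are placeholders):

count_files = pipeline.count_reads(bamfiles, gtf)
with open(count_files['genes']) as genes:
    header = genes.readline().rstrip('\n').split('\t')   # ID, counts.<cond>..., rpkm.<cond>..., annotation columns
    ncond = len(pipeline.conditions)
    for line in genes:
        fields = line.rstrip('\n').split('\t')
        gene_id, counts = fields[0], fields[1:1+ncond]    # raw counts per condition for this gene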
Example #3
    def count_reads(self, bamfiles, gtf):
        self.write_log("* Counting reads")

        # Count reads on genes, transcripts with "rnacounter"
        ncond = len(self.conditions)
        tablenames = [None] * ncond
        futures = [None] * ncond
        max_rlen = 0
        counter_options = ["--nh"]
        for bam in bamfiles:
            sam = pysam.Samfile(bam, 'rb')
            max_rlen = max(max_rlen, sam.next().rlen)
        counter_options += ["--exon_cutoff", str(max_rlen)]
        bwt_args = self.job.options.get('map_args', {}).get('bwt_args', [])
        #        if not "--local" in bwt_args:
        #            counter_options += ["--nh"]
        if hasattr(self.assembly, "fasta_origin") or self.assembly.intype == 2:
            counter_options += ["--type", "transcripts", "--method", "raw"]
        else:
            counter_options += [
                "--type", "genes,transcripts", "--method", "raw,nnls"
            ]
        if self.stranded:
            counter_options += ["--stranded"]
        for i, c in enumerate(self.conditions):
            tablenames[i] = unique_filename_in()
            futures[i] = rnacounter.nonblocking(self.ex,
                                                bamfiles[i],
                                                gtf,
                                                stdout=tablenames[i],
                                                via=self.via,
                                                options=counter_options)

        # Put samples together
        for i, c in enumerate(self.conditions):
            try:
                futures[i].wait()
            except Exception as err:
                self.write_debug("Counting failed: %s." % str(err))
                raise err
            if futures[i] is None:
                self.write_debug("Counting failed.")
                raise ValueError("Counting failed.")
        if len(tablenames) > 1:
            joined = unique_filename_in()
            rnacounter_join.nonblocking(self.ex,
                                        tablenames,
                                        stdout=joined,
                                        via=self.via).wait()
        else:
            joined = tablenames[0]

        # Split genes and transcripts into separate files
        genes_filename = unique_filename_in()
        trans_filename = unique_filename_in()
        genes_file = open(genes_filename, "wb")
        trans_file = open(trans_filename, "wb")
        if self.stranded:
            genes_anti_filename = unique_filename_in()
            trans_anti_filename = unique_filename_in()
            genes_anti_file = open(genes_anti_filename, "wb")
            trans_anti_file = open(trans_anti_filename, "wb")
        with open(joined) as jfile:
            header = jfile.readline()
            hconds = ["counts." + c for c in self.conditions
                      ] + ["rpkm." + c for c in self.conditions]
            hinfo = header.strip().split('\t')[2 * ncond + 1:]
            header = '\t'.join(["ID"] + hconds + hinfo) + '\n'
            genes_file.write(header)
            trans_file.write(header)
            type_idx = header.split('\t').index("Type")
            if self.stranded:
                genes_anti_file.write(header)
                trans_anti_file.write(header)
                sense_idx = header.split('\t').index("Sense")
                for line in jfile:
                    L = line.split('\t')
                    ftype = L[type_idx].lower()
                    sense = L[sense_idx].lower()
                    if ftype == 'gene':
                        if sense == 'antisense':
                            genes_anti_file.write(line)
                        else:
                            genes_file.write(line)
                    elif ftype == 'transcript':
                        if sense == 'antisense':
                            trans_anti_file.write(line)
                        else:
                            trans_file.write(line)
            else:
                for line in jfile:
                    L = line.split('\t')
                    ftype = L[type_idx].lower()
                    if ftype == 'gene':
                        genes_file.write(line)
                    elif ftype == 'transcript':
                        trans_file.write(line)
        genes_file.close()
        trans_file.close()

        # Keep intermediate tables
        for i, c in enumerate(self.conditions):
            #shutil.copy(tablenames[i], "../counts%d.txt"%i)
            descr = set_file_descr(self.conditions[i] + '_' + tablenames[i] +
                                   '.gz',
                                   type='txt',
                                   step='pileup',
                                   view='admin')
            gzipfile(self.ex, tablenames[i])
            self.ex.add(tablenames[i] + '.gz', description=descr)

        if self.stranded:
            count_files = {
                'genes': genes_filename,
                'transcripts': trans_filename,
                'genes_anti': genes_anti_filename,
                'transcripts_anti': trans_anti_filename
            }
        else:
            count_files = {
                'genes': genes_filename,
                'transcripts': trans_filename
            }
        return count_files
Example #4
def chipseq_workflow( ex, job_or_dict, assembly, script_path='', logfile=sys.stdout, via='lsf' ):
    """Runs a chipseq workflow over bam files obtained by mapseq. Will optionally run ``macs`` and 'run_deconv'.

    :param ex: a 'bein' execution environment to run jobs in,

    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with key 'groups', 'files' and 'options' if applicable,

    :param assembly: a genrep.Assembly object,

    :param script_path: only needed if 'run_deconv' is in the job options, must point to the location of the R scripts.

    Default ``macs`` parameters (overridden by ``job_or_dict['options']['macs_args']``) are set as follows:

    * ``'-bw'``: 200 ('bandwidth')

    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*, if available, as *(min(30,5*(T+1)),50*(T+1))*.

    Returns a tuple of (1) a dictionary with keys *group_id* from the job groups, plus *macs* and *deconv* if applicable, whose values are file description dictionaries, and (2) a dictionary mapping *group_ids* to *names* used in file descriptions.
"""
    options = {}
    if logfile is None: logfile = sys.stdout
    if isinstance(job_or_dict,frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict,dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if not('name' in groups[gid]):
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files',{})
    else:
        raise TypeError("job_or_dict must be a frontend. Job object or a dictionary with key 'groups'.")
    merge_strands = int(options.get('merge_strands',-1))
    suffixes = ["fwd","rev"]
    peak_deconvolution = options.get('peak_deconvolution',False)
    if isinstance(peak_deconvolution,basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1','true','t']
    run_meme = options.get('run_meme',False)
    if isinstance(run_meme,basestring):
        run_meme = run_meme.lower() in ['1','true','t']
    macs_args = options.get('macs_args',["--bw","200"])
    b2w_args = options.get('b2w_args',[])
    if not(isinstance(mapseq_files,dict)):
        raise TypeError("Mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
    for gid,mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not(isinstance(mapped,dict)):
            raise TypeError("Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped:
            mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name+"_"+str(k)
            if not 'stats' in mapped[k]:
                futures[k] = mapseq.bamstats.nonblocking( ex, mapped[k]["bam"], via=via )
            if mapped[k].get('poisson_threshold',-1)>0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns)>0:
            p_thresh[group_name] = sum(ptruns)/len(ptruns)
        for k in futures.keys():
            mapped[k]['stats'] = futures[k].wait()
        if len(mapped)>1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid,group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid,group_name))
            read_length.append(mapped.values()[0]['stats']['read_length'])
    genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls)<1:
        controls = [None]
        names['controls'] = [(0,None)]
    logfile.write("Starting MACS.\n");logfile.flush()
    processed = {'macs': add_macs_results( ex, read_length, genome_size,
                                           tests, ctrlbam=controls, name=names,
                                           poisson_threshold=p_thresh,
                                           macs_args=macs_args, via=via ) }
    logfile.write("Done MACS.\n");logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
## select only peaks with p-val <= 1e-0.6 = .25 => score = -10log10(p) >= 6
    _select = {'score':(6,sys.maxint)}
    _fields = ['chr','start','end','name','score']
    for i,name in enumerate(names['tests']):
        if len(names['controls']) < 2:
            ctrl = (name,names['controls'][0])
            macsbed = track(processed['macs'][ctrl]+"_summits.bed",
                            chrmeta=chrlist, fields=_fields).read(selection=_select)
        else:
            macsbed = concatenate([apply(track(processed['macs'][(name,x)]+"_summits.bed",
                                         chrmeta=chrlist, fields=_fields).read(selection=_select),
                                         'name', lambda __n,_n=xn: "%s:%i" %(__n,_n))
                                   for xn,x in enumerate(names['controls'])])
        ##############################
        macs_neighb = neighborhood( macsbed, before_start=150, after_end=150 )
        peak_list[name] = unique_filename_in()+".sql"
        macs_final = track( peak_list[name], chrmeta=chrlist,
                            info={'datatype':'qualitative'},
                            fields=['start','end','name','score'] )
        macs_final.write(fusion(macs_neighb),clip=True)
        macs_final.close()
        ##############################

    merged_wig = {}
    options['read_extension'] = int(options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1: options['read_extension'] = read_length[0]
    make_wigs = merge_strands >= 0 or options['read_extension']>100
    if options['read_extension'] > 100: options['read_extension'] = 50
    for gid,mapped in mapseq_files.iteritems():
#            if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        for m in mapped.values():
            if make_wigs or not('wig' in m) or len(m['wig'])<2:
                output = mapseq.parallel_density_sql( ex, m["bam"], assembly.chrmeta,
                                                      nreads=m["stats"]["total"],
                                                      merge=-1, read_extension=options['read_extension'],
                                                      convert=False,
                                                      b2w_args=b2w_args, via=via )
                wig.append(dict((s,output+s+'.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict((s,merge_sql(ex, [x[s] for x in wig], via=via))
                                          for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]

    if peak_deconvolution:
        ##############################
        def _filter_deconv( stream, pval ):
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream( ((x[0],)+((x[2]+x[1])/2-150,(x[2]+x[1])/2+150)+x[3:] 
                                   for x in stream 
                                   if "FERR=" in x[3] and float(ferr.search(x[3]).groups()[0]) <= pval), 
                                  fields=stream.fields )
        ##############################
        processed['deconv'] = {}
        for name in names['tests']:
            logfile.write(name[1]+" deconvolution.\n");logfile.flush()
            if len(names['controls']) < 2:
                ctrl = (name,names['controls'][0])
                macsbed = processed['macs'][ctrl]+"_peaks.bed"
            else:
                macsbed = intersect_many_bed( ex, [processed['macs'][(name,x)]+"_peaks.bed"
                                                   for x in names['controls']], via=via )
            deconv = run_deconv( ex, merged_wig[name[1]], macsbed, assembly.chrmeta,
                                 options['read_extension'], script_path, via=via )
            peak_list[name] = unique_filename_in()+".bed"
            trbed = track(deconv['peaks']).read()
            with track(peak_list[name], chrmeta=chrlist, fields=trbed.fields) as bedfile:
                bedfile.write(fusion(_filter_deconv(trbed,0.65)))
            ex.add(deconv['peaks'],
                   description=set_file_descr(name[1]+'_peaks.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            ex.add(deconv['profile'],
                   description=set_file_descr(name[1]+'_deconv.sql', type='sql',
                                              step='deconvolution',  groupId=name[0]))
            bigwig = unique_filename_in()
            try:
                convert(deconv['profile'],(bigwig,"bigWig"))
                ex.add(bigwig,
                       description=set_file_descr(name[1]+'_deconv.bw', type='bigWig',
                                                  ucsc='1', step='deconvolution',
                                                  groupId=name[0]))
            except OSError as e:
                logfile.write(str(e));logfile.flush()
            ex.add(deconv['pdf'],
                   description=set_file_descr(name[1]+'_deconv.pdf', type='pdf',
                                              step='deconvolution', groupId=name[0]))
            processed['deconv'][name] = deconv

    ##############################
    def _join_macs( stream, xlsl, _f ):
        def _macs_row(_s):
            for _p in _s:
                for _n in _p[3].split("|"):
                    if len(xlsl) == 1:
                        nb = int(_n.split(";")[0][13:]) if _n[:3] == "ID=" else int(_n[10:])
                        yield _p+xlsl[0][nb-1][1:]
                    else:
                        nb = _n.split(";")[0][13:] if _n[:3] == "ID=" else _n[10:]
                        nb = nb.split(":")
                        yield _p+xlsl[int(nb[1])][int(nb[0])-1][1:]
        return FeatureStream( _macs_row(stream), fields=_f )
    ##############################
    peakfile_list = []
    for name, plist in peak_list.iteritems():
        ptrack = track(plist,chrmeta=chrlist,fields=["chr","start","end","name","score"])
        peakfile = unique_filename_in()
        xlsh, xlsl = parse_MACS_xls([processed['macs'][(name,_c)]+"_peaks.xls" for _c in names['controls']])
        try:
###### if assembly doesn't have annotations, we skip the "getNearestFeature" but still go through "_join_macs"
            assembly.gene_track()
            _fields = ['chr','start','end','name','score','gene','location_type','distance']\
                +["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height','gene(s)','location_type','distance']+_fields[8:]))
            for chrom in assembly.chrnames:
                _feat = assembly.gene_track(chrom)
                peakout.write(_join_macs(getNearestFeature(ptrack.read(selection=chrom),_feat),
                                         xlsl, _fields), mode='append')
        except ValueError:
            _fields = ['chr','start','end','name','score']+["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height']+_fields[8:]))
            for chrom in assembly.chrnames:
                peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl, _fields), mode='append')
        peakout.close()
        gzipfile(ex,peakfile)
        peakfile_list.append(track(peakfile+".gz", format='txt', fields=_fields))
        ex.add(peakfile+".gz",
               description=set_file_descr(name[1]+'_annotated_peaks.txt.gz',type='text',
                                          step='annotation',groupId=name[0]))
    stracks = [track(wig,info={'name':name+"_"+st}) 
               for name,wigdict in merged_wig.iteritems() for st,wig in wigdict.iteritems()]
    tablefile = unique_filename_in()
    with open(tablefile,"w") as _tf:
        _pnames = ["MACS_%s_vs_%s" %(_s[1],_c[1]) if _c[1] else "MACS_%s" %_s[1]
                   for _s in names['tests'] for _c in names['controls']]
        _tf.write("\t".join(['#chromosome','start','end',]+_pnames+[s.name for s in stracks])+"\n")
#### need to do something about peak origin (split names, write to separate columns?)
    for chrom in assembly.chrnames:
        pk_lst = [apply(pt.read(chrom,fields=['chr','start','end','name']),
                        'name', lambda __n,_n=npt: "%s:%i" %(__n,_n))
                  for npt,pt in enumerate(peakfile_list)]
        features = fusion(concatenate(pk_lst, fields=['chr','start','end','name'], 
                                      remove_duplicates=True, group_by=['chr','start','end']))
        sread = [sig.read(chrom) for sig in stracks]
        quantifs = score_by_feature(sread, features, method='sum')
        nidx = quantifs.fields.index('name')
        _ns = len(tests)
        _nc = len(controls)
        with open(tablefile,"a") as _tf:
            for row in quantifs:
                pcols = ['']*_ns*_nc
                _rnsplit = row[nidx].split(":")
                _n1 = _rnsplit[0]
                _k = 0
                while ( _k < len(_rnsplit)-1-int(_nc>1) ):
                    if _nc > 1:
                        _k += 2
                        _n2 = _rnsplit[_k-1]
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])*_nc+int(_n2)] = _n1
                    else:
                        _k += 1
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])] = _n1
                    _n1 = "|".join(_n[1:])
                _tf.write("\t".join(str(tt) for tt in row[:nidx]+tuple(pcols)+row[nidx+1:])+"\n")
    gzipfile(ex,tablefile)
    ex.add(tablefile+".gz",
           description=set_file_descr('Combined_peak_quantifications.txt.gz',type='text',
                                      step='summary'))

    if run_meme:
        from bbcflib.motif import parallel_meme
        logfile.write("Starting MEME.\n");logfile.flush()
        processed['meme'] = parallel_meme( ex, assembly,
                                           peak_list.values(), name=peak_list.keys(),
                                           chip=True, meme_args=['-meme-nmotifs','4','-meme-mod','zoops'],
                                           via=via )
    return processed
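A hypothetical invocation of the workflow above; `ex`, `job` and `assembly` are assumed to come from the usual bein/mapseq setup, and the R script path is a placeholder (only needed when peak deconvolution is requested):

processed = chipseq_workflow(ex, job, assembly,
                             script_path='/path/to/chipseq/Rscripts',
                             via='local')
macs_results = processed['macs']             # one entry per (test, control) name pair, as used above
deconv       = processed.get('deconv', {})   # present only when 'peak_deconvolution' was set in the job options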
Example #5
def c4seq_workflow( ex, job, primers_dict, assembly,
                    c4_url=None, script_path='', logfile=sys.stdout, via='lsf' ):
    '''
    Main 4C-seq workflow:
    * open the 4C-seq minilims and create the execution
    * 0. get/create the library
    * 1. if necessary, compute the density file from the bam file (mapseq.parallel_density_sql)
    * 2. compute the counts per fragment for each density file (with gfminer's score_by_feature)
    '''

    mapseq_files = job.files
### outputs
    processed = {'lib': {}, 'density': {}, '4cseq': {}}
    processed['4cseq'] = {'density_files' : {},
                          'countsPerFrag' : {},
                          'countsPerFrag_grp' : {},
                          'norm' : {},
                          'norm_grp' : {},
                          'profileCorrection': {},
                          'profileCorrection_grp' : {},
                          'smooth_grp' : {},
                          'domainogram_grp' : {},
                          'bricks2frags' : {}}
                            # was 'smoothFrag': {}, 'domainogram': {}}
    regToExclude = {}
    new_libs=[]
### options
    run_domainogram = {}
    before_profile_correction = {}
    if not job.options.get('viewpoints_chrs',False):
        out_chromosomes = ','.join([ch for ch in assembly.chrnames])
    else:
        out_chromosomes = ','.join([primers_dict.get(group['name'],{}).get('baitcoord').split(':')[0] for gid,group in job.groups.iteritems()])
    print "out_chromosomes=" + out_chromosomes + "\n"

    sizeExt = job.options.get('norm_reg',1000000)
    print "region considered for normalisation: mid viewpoint +/-" + str(sizeExt) + 'bps'

### do it
    for gid, group in job.groups.iteritems():
        run_domainogram[gid] = group.get('run_domainogram',False)
        if isinstance(run_domainogram[gid],basestring):
            run_domainogram[gid] = (run_domainogram[gid].lower() in ['1','true','on','t'])
        before_profile_correction[gid] = group.get('before_profile_correction',False)
        if isinstance(before_profile_correction[gid],basestring):
            before_profile_correction[gid] = (before_profile_correction[gid].lower() in ['1','true','on','t'])
        processed['lib'][gid] = get_libForGrp(ex, group, assembly,
                                              new_libs, gid, c4_url, via=via)
#reffile='/archive/epfl/bbcf/data/DubouleDaan/library_Nla_30bps/library_Nla_30bps_segmentInfos.bed'
        processed['4cseq']['density_files'][gid] = {}
        regToExclude[gid] = primers_dict.get(group['name'],{}).get('regToExclude',"").replace('\r','')

        # if no regToExclude defined, set it as mid_baitCoord +/-5kb
        if len(regToExclude[gid])==0 :
            baitcoord_mid = int(0.5 * (int(primers_dict.get(group['name'],{}).get('baitcoord').split(':')[1].split('-')[0]) + int(primers_dict.get(group['name'],{}).get('baitcoord').split(':')[1].split('-')[1]) ))
            regToExclude[gid] = primers_dict.get(group['name'],{}).get('baitcoord').split(':')[0] + ':' + str(baitcoord_mid-5000) + '-' + str(baitcoord_mid+5000)

        #print(';'.join([k+"="+v for k,v in primers_dict.get(group['name'],{}).iteritems()]))
        print(primers_dict.get(group['name'],{}))
        print "regToExclude["+str(gid)+"]="+regToExclude[gid]
        for rid,run in group['runs'].iteritems():
            libname = mapseq_files[gid][rid]['libname']
            if job.options.get('merge_strands') != 0 or not('wig' in mapseq_files[gid][rid]):
                density_file=parallel_density_sql( ex, mapseq_files[gid][rid]['bam'],
                                                   assembly.chrmeta,
                                                   nreads=mapseq_files[gid][rid]['stats']["total"],
                                                   merge=0,
                                                   read_extension=mapseq_files[gid][rid]['stats']['read_length'],
                                                   convert=False,
                                                   via=via )
                density_file += "merged.sql"
                ex.add( density_file,
                        description=set_file_descr("density_file_"+libname+".sql",
                                                   groupId=gid,step="density",type="sql",view='admin',gdv="1") )
            else:
                density_file = mapseq_files[gid][rid]['wig']['merged']
            #density_files.append(density_file)
            processed['4cseq']['density_files'][gid][rid]=density_file

        # back to grp level!
        # not anymore:
        # processed['density'][gid] = merge_sql(ex, density_files, via=via)

    processed['4cseq']['countsPerFrag'] = density_to_countsPerFrag( ex, processed, job.groups, assembly, regToExclude, script_path, via )
    ## access per gid+rid

    futures_norm = {}
    countsPerFrags_bedGraph = {}
    futures_merged_raw = {}
    for gid, group in job.groups.iteritems():
        futures_norm[gid] = {}
        countsPerFrags_bedGraph[gid] = {}
        processed['4cseq']['norm'][gid] = {}
        for rid,run in group['runs'].iteritems():
            normfile = unique_filename_in()
            touch(ex, normfile)
            resfile = unique_filename_in()+".bedGraph"
            resfiles = processed['4cseq']['countsPerFrag'][gid][rid] # _all.sql
            convert(resfiles[3],resfile)
            countsPerFrags_bedGraph[gid][rid] = resfile

            print "call normFrags: infiles="+resfile+", normfile="+normfile+"baitCoord="+primers_dict[group['name']]['baitcoord']+", sizeExt=sizeExt, name="+ group['name']+"rep_"+str(rid) + "regToExclude="+regToExclude[gid]+"\n"
            futures_norm[gid][rid] = normFrags.nonblocking( ex, resfile, normfile, baitCoord=primers_dict[group['name']]['baitcoord'], sizeExt=sizeExt, name=group['name']+"rep_"+str(rid) ,regToExclude=regToExclude[gid], script_path=script_path, via=via )
            processed['4cseq']['norm'][gid][rid] = normfile

        if len(group) > 1:
            ## merge replicates before normalisation.
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName=group['name']+"_raw_mergedRep"
            print "gid="+group['name']
            print "call mergeRep for replicates before normalisation: infiles="+",".join([res_rid for rid,res_rid in countsPerFrags_bedGraph[gid].iteritems()])+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_merged_raw[gid] = mergeRep.nonblocking( ex, ",".join([res_rid for rid,res_rid in countsPerFrags_bedGraph[gid].iteritems()]), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via , memory= 8)
            processed['4cseq']['countsPerFrag_grp'][gid] = mergefile
        else:
            futures_merged_raw[gid] = None
            processed['4cseq']['countsPerFrag_grp'][gid] = countsPerFrags_bedGraph[gid][0] #if no replicates, then the file we want is the 1st one

    print "***** profile correction / sample + merge normalised data"
    futures_merged = {} # per gid
    futures_profcor = {} # per gid, per rid
    for gid, group in job.groups.iteritems():
        ## run profile correction per run then merge them
        futures_profcor[gid] = {}
        processed['4cseq']['profileCorrection'][gid] = {}
        for rid, run in group['runs'].iteritems():
            # wait for normalisation of all replicates to be finished
            futures_norm[gid][rid].wait() ## normalised files, per grp, per rep
            normfile = processed['4cseq']['norm'][gid][rid]
            file1 = unique_filename_in() #track file
            touch(ex,file1)
            file2 = unique_filename_in() #report file
            touch(ex,file2)
            file3 = unique_filename_in() #table file
            touch(ex, file3)
            print "call profileCorrection: normfile="+normfile+", baitCoord="+primers_dict[group['name']]['baitcoord']+", name="+group['name']+", file1="+file1+", file2="+file2+", file3= "+file3+"\n"
            futures_profcor[gid][rid] = profileCorrection.nonblocking( ex, normfile,
                                        primers_dict[group['name']]['baitcoord'],
                                        group['name'], file1, file2, file3, script_path,
                                        via=via )
            processed['4cseq']['profileCorrection'][gid][rid] = [file1, file2, file3]

        ## merge replicates before profile correction. Needs all normalisation for the given grp to be finished, this is why it comes after the rid loop.
        if len(group)>1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName=group['name']+"_norm_mergedRep"
            print "gid="+group['name']
            print "call mergeRep: infiles="+",".join([res_rid for rid,res_rid in processed['4cseq']['norm'][gid].iteritems()])+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_merged[gid] = mergeRep.nonblocking( ex, ",".join([res_rid for rid,res_rid in processed['4cseq']['norm'][gid].iteritems()]), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via , memory= 8)
            processed['4cseq']['norm_grp'][gid] = mergefile
        else:
            futures_merged[gid] = None
            processed['4cseq']['norm_grp'][gid] = processed['4cseq']['norm'][gid][0] ##if no replicates, then the file we want is the 1st one

    print "***** merge profile corrected data"
    futures_profcor_merged = {} # per gid
    for gid, group in job.groups.iteritems():
        processed['4cseq']['profileCorrection_grp'][gid] = {}
        for rid, run in group['runs'].iteritems():
            futures_profcor[gid][rid].wait()   ## wait for ProfileCorrection to be finished

        ## merge replicates after profile correction
        if len(group)>1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName=group['name']+"_ProfCor_mergedRep"
            pcfiles = [ processed['4cseq']['profileCorrection'][gid][rid][0] for rid,res_rid in processed['4cseq']['profileCorrection'][gid].iteritems()]
            print "call mergeRep (for PC tables): infiles="+",".join(pcfiles)+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_profcor_merged[gid] = mergeRep.nonblocking( ex, ",".join(pcfiles), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via , memory= 8)
            processed['4cseq']['profileCorrection_grp'][gid] = mergefile
        else:
            futures_profcor_merged[gid] = None
            processed['4cseq']['profileCorrection_grp'][gid] = processed['4cseq']['profileCorrection'][gid][0] ##if no replicates, then the file we want is the 1st one


    print "***** smooth data"
    futures_smoothed = {}
    for gid, group in job.groups.iteritems():
        file1 = unique_filename_in()
        touch(ex,file1)
        file2 = unique_filename_in()
        touch(ex, file2)
        file3 = unique_filename_in()
        touch(ex, file3)
        nFragsPerWin = group['window_size']
        futures_merged_raw[gid].wait() ## wait for merging of raw_grp to be completed
        futures_smoothed[gid] = ( smoothFragFile.nonblocking( ex, processed['4cseq']['countsPerFrag_grp'][gid], nFragsPerWin, group['name'],
                                                    file1, regToExclude[gid], script_path=script_path, via=via, memory=6 ), )
        futures_merged[gid].wait() ## wait for merging of norm_grp to be completed
        futures_smoothed[gid] += ( smoothFragFile.nonblocking( ex, processed['4cseq']['norm_grp'][gid], nFragsPerWin, group['name']+"_norm",
                                                    file2, regToExclude[gid], script_path=script_path, via=via, memory=6 ), )
        futures_profcor_merged[gid].wait() # wait for the merging of profile corrected data to be done
        futures_smoothed[gid] += ( smoothFragFile.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid], nFragsPerWin, group['name']+"_fromProfileCorrected",
                                                    file3, regToExclude[gid], script_path=script_path, via=via, memory=6 ), )
        processed['4cseq']['smooth_grp'][gid] = [file1,file2,file3] #[smoothed_file_before_Norm, smoothed file before PC, smoothed file after PC]

    print "***** Domainograms"
    futures_domainograms = {}
    for gid, group in job.groups.iteritems():
        grName = job.groups[gid]['name']
        if run_domainogram[gid]:
            regCoord = regToExclude[gid] or primers_dict[grName]['baitcoord']
            if before_profile_correction[gid]:
                futures_domainograms[gid] = runDomainogram.nonblocking( ex, processed['4cseq']['norm_grp'][gid],
                                                                        grName, regCoord=regCoord, skip=1,
                                                                        script_path=script_path, via=via, memory=15 )
            else:
                futures_domainograms[gid] = runDomainogram.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid],
                                                                            grName, regCoord=regCoord.split(':')[0], skip=1,
                                                                            script_path=script_path, via=via, memory=15 )

    ## prepare tar files for domainogram results (if any)
    ## and create "BRICKS to frags" files
    print "***** BRICKS to Frags"
    futures_BRICKS2Frags = {}
    for gid, f in futures_domainograms.iteritems():
        if run_domainogram[gid]: # if domainogram has been run
            resFiles = []
            logFile = f.wait()
            start = False
            tarname = job.groups[gid]['name']+"_domainogram.tar.gz"
            res_tar = tarfile.open(tarname, "w:gz")
            futures_BRICKS2Frags[gid] = []
            processed['4cseq']['bricks2frags'][gid] = []
            if logFile is None: continue
            with open(logFile) as f:
                for s in f:
                    s = s.strip()
                    if '####resfiles####' in s:
                        start = True
                    elif start and "RData" not in s:
                        resFiles.append(s)
                        res_tar.add(s)
                    if start and "foundBRICKS" in s:
                        bricks2fragsfile = unique_filename_in()+".bedGraph"
                        touch(ex, bricks2fragsfile)
                        futures_BRICKS2Frags[gid] += [ BRICKSToFrag.nonblocking(ex, s, processed['4cseq']['norm_grp'][gid], bricks2fragsfile, script_path=script_path, via=via, memory=4 ) ]
                        processed['4cseq']['bricks2frags'][gid] += [ bricks2fragsfile ]
            res_tar.close()
            processed['4cseq']['domainogram_grp'][gid] = resFiles + [tarname]




############### prepare tables for global results
    print "***** combine results into tables "
    allNames=[]
    allFiles=[]
    allRegToExclude=[]
    for gid, group in job.groups.iteritems():
        for rid,run in group['runs'].iteritems():
            allNames += [ group['name']+"_rep"+str(rid)+"_norm", group['name']+"_rep"+str(rid)+"_fit" ]
            allFiles += [ processed['4cseq']['profileCorrection'][gid][rid][2] ]
            allRegToExclude += [ regToExclude[gid] ]
    tablePC=unique_filename_in()+".txt"
    print("***will call makeTable with:")
    print(",".join(allFiles))
    print("resfile="+tablePC)
    print(",".join(allNames))
    touch(ex,tablePC)

    #regToExclude[gid]

    futures_tables = (makeTable.nonblocking(ex, ",".join(allFiles), tablePC, ",".join(allNames), idCols="4,5", all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), )

    # wait for all smoothing to be done
    for gid, fg in futures_smoothed.iteritems():
        for f in fg: f.wait()

    ## make Table raw/smoothed_raw
    print("** make Table raw/smoothed_raw")
    allNames=[]
    allFiles=[]
    allRegToExclude=[]
    for gid, group in job.groups.iteritems():
        futures_merged_raw[gid].wait()
        allNames += [ group['name']+"_raw", group['name']+"_rawSmoothed" ]
        allFiles += [ processed['4cseq']['countsPerFrag_grp'][gid], processed['4cseq']['smooth_grp'][gid][0] ]
        allRegToExclude += [ 'NA', regToExclude[gid] ]

    tableSmoothedRaw_grp=unique_filename_in()+".txt"
    touch(ex,tableSmoothedRaw_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothedRaw_grp, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), )

    ## make Table norm/smoothed_norm before PC
    print("** make Table norm/smoothed_norm befor PC")
    allNames=[]
    allFiles=[]
    allRegToExclude=[]
    for gid, group in job.groups.iteritems():
        allNames += [ group['name']+"_norm", group['name']+"_smoothed" ]
        allFiles += [ processed['4cseq']['norm_grp'][gid], processed['4cseq']['smooth_grp'][gid][1] ]
        allRegToExclude += [ regToExclude[gid], regToExclude[gid] ]

    tableSmoothed_grp=unique_filename_in()+".txt"
    touch(ex,tableSmoothed_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothed_grp, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), )

    ## make Table norm/smoothed_norm after PC
    print("** make Table norm/smoothed_norm after PC")
    allNames=[]
    allFiles=[]
    allRegToExclude=[]
    for gid, group in job.groups.iteritems():
        allNames += [ group['name']+"_normPC", group['name']+"_smoothedPC" ]
        allFiles += [ processed['4cseq']['profileCorrection_grp'][gid], processed['4cseq']['smooth_grp'][gid][2] ]
        allRegToExclude += [ regToExclude[gid], regToExclude[gid] ]

    tableSmoothedPC_grp=unique_filename_in()+".txt"
    touch(ex,tableSmoothedPC_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothedPC_grp, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), )

    ## combine BRICKS2Frags files
    allNames=[]
    allFiles=[]
    for gid, fg in futures_BRICKS2Frags.iteritems():
        for f in fg: f.wait()
        allNames += [ job.groups[gid]['name']+"_BRICKSpval" ]
        cat_bricks2frags = unique_filename_in()+".txt"
        print ','.join(processed['4cseq']['bricks2frags'][gid])
        cat_bricks2frags = cat(processed['4cseq']['bricks2frags'][gid],out=cat_bricks2frags)
        allFiles += [ cat_bricks2frags ]

    for gid, fg in futures_smoothed.iteritems():
        for f in fg: f.wait()

    tableBRICKS2Frags = unique_filename_in()+".txt"
    touch(ex,tableBRICKS2Frags)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableBRICKS2Frags, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, defVal="NA", script_path=script_path, via=via, memory=8 ), )


    for f in futures_tables: f.wait()


################ Add everything to minilims below!
    step = "density"
    for gid in processed['4cseq']['density_files'].keys():
        for rid, sql in processed['4cseq']['density_files'][gid].iteritems():
            fname = "density_file_"+job.groups[gid]['name']+"_merged_rep"+str(rid)
            ex.add( sql, description=set_file_descr( fname+".sql",
                                                 groupId=gid,step=step,type="sql",gdv="1" ) )
            wig = unique_filename_in()+".bw"
            convert( sql, wig )
            ex.add( wig, description=set_file_descr( fname+".bw",
                                                 groupId=gid,step=step,type="bigWig",ucsc="1") )
    step = "counts_per_frag" #was _norm_counts_per_frags # before normalisation process, per replicate
    for gid in processed['4cseq']['countsPerFrag'].keys():
        for rid, resfiles in processed['4cseq']['countsPerFrag'][gid].iteritems():
            fname = "meanScorePerFeature_"+job.groups[gid]['name']+"_rep"+str(rid)
            ex.add( resfiles[1], description=set_file_descr( fname+".sql",
                                                             groupId=gid,step=step,type="sql",view="admin",gdv='1'))
            #gzipfile(ex,resfiles[0])
            #ex.add( resfiles[0]+".gz", description=set_file_descr( fname+".bed.gz",
            #                                                       groupId=gid,step=step,type="bed",view="admin" ))
            fname = "segToFrag_"+job.groups[gid]['name']+"_rep"+str(rid)
            ex.add( resfiles[3], description=set_file_descr( fname+"_all.sql",
                                                             groupId=gid,step=step,type="sql",
                                                             comment="all informative frags - null included" ))
            trsql = track(resfiles[3])
            bwig = unique_filename_in()+".bw"
            trwig = track(bwig,chrmeta=trsql.chrmeta)
            trwig.write(trsql.read(fields=['chr','start','end','score'],
                                   selection={'score':(0.01,sys.maxint)}))
            trwig.close()
            ex.add( bwig, set_file_descr(fname+".bw",groupId=gid,step=step,type="bigWig",ucsc='1'))
        ## add segToFrags before normalisation
        futures_merged_raw[gid].wait()
        trbedgraph = track(removeNA(processed['4cseq']['countsPerFrag_grp'][gid]),format='bedgraph')
        bwig = unique_filename_in()+".bw"
        trwig = track(bwig,chrmeta=assembly.chrmeta)
        trwig.write(trbedgraph.read(fields=['chr','start','end','score'],
                               selection={'score':(0.01,sys.maxint)}))
        trwig.close()
        fname = "segToFrag_"+job.groups[gid]['name']
        ex.add( bwig, description=set_file_descr( fname+".bw",
                                                             groupId=gid,step=step,type="bigWig",
                                                             comment="segToFrag file before normalisation" ))

    step = "norm_counts_per_frags"  # after new normalisation process, combined replicates
    for gid, resfile in processed['4cseq']['norm_grp'].iteritems():
        fname = "normalised_scorePerFeature_"+job.groups[gid]['name']
        gzipfile(ex,resfile)
        ex.add( resfile+".gz", description=set_file_descr( fname+".bedGraph.gz", groupId=gid,step=step, type="bedGraph",ucsc='1'))
    # norm files, per replicates (might be removed)
    for gid, dict_gid in processed['4cseq']['norm'].iteritems():
        for rid, resfile in dict_gid.iteritems():
            fname = "normalised_scorePerFeature_"+job.groups[gid]['name']+"_rep"+str(rid)
            gzipfile(ex,resfile)
            ex.add(resfile+".gz",
                    description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
    step = "profile_correction" # Profile corrected data, combined replicates
    for gid, profileCorrectedFile in processed['4cseq']['profileCorrection_grp'].iteritems():
        fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected"
        gzipfile(ex,profileCorrectedFile)
        ex.add( profileCorrectedFile+".gz",
                description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
    # Profile corrected, per replicate (might be removed)
    for gid, dict_gid in processed['4cseq']['profileCorrection'].iteritems():
        for rid, resfiles in dict_gid.iteritems():
    #        profileCorrectedFile = resfiles[0]
            reportProfileCorrection = resfiles[1]
            fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected_rep"+str(rid)
    #        gzipfile(ex,profileCorrectedFile)
     #       ex.add( profileCorrectedFile+".gz",
      #              description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
            ex.add( reportProfileCorrection, description=set_file_descr(fname+".pdf",
                                                                    groupId=gid,step=step,type="pdf"))
    step = "smoothing"
    for gid, resfiles in processed['4cseq']['smooth_grp'].iteritems():
        rawSmoothFile = resfiles[0]
        smoothFile = resfiles[1]
        afterProfileCorrection = resfiles[2]
        nFrags = str(job.groups[gid]['window_size'])
        ## smoothed file before normalisation
        fname = "segToFrag_"+job.groups[gid]['name']+"_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz"
        gzipfile(ex,rawSmoothFile)
        ex.add(rawSmoothFile+".gz",
               description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
        ## smoothed file after normalisation, before Profile correction
        fname = "segToFrag_"+job.groups[gid]['name']+"_norm_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz"
        gzipfile(ex,smoothFile)
        ex.add(smoothFile+".gz",
               description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
        ## smoothed file after normalisation, after Profile correction
        fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz"
        gzipfile(ex,afterProfileCorrection)
        ex.add(afterProfileCorrection+".gz",
               description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))

    step = "domainograms"
    for gid, resfiles in processed['4cseq']['domainogram_grp'].iteritems():
        tarFile = resfiles.pop()
        fname = job.groups[gid]['name']+"_domainogram.tar.gz"
        ex.add(tarFile, description=set_file_descr(fname,
                                                   groupId=gid,step=step,type="tgz"))
        for s in resfiles:
            if s[-8:] == "bedGraph":
                gzipfile(ex,s)
                s += ".gz"
                ex.add( s, description=set_file_descr( s, groupId=gid,step=step,type="bedGraph",ucsc="1",gdv="1"))

    step = "combined_results"
    gzipfile(ex,tableSmoothedRaw_grp)
    ex.add(tableSmoothedRaw_grp+".gz", description=set_file_descr("table_segToFrags_smoothed_combined_replicates.txt.gz",step=step,type="txt"))

    gzipfile(ex,tableSmoothed_grp)
    ex.add(tableSmoothed_grp+".gz", description=set_file_descr("table_normalised_smoothed_combined_replicates.txt.gz",step=step,type="txt"))

    gzipfile(ex,tableSmoothedPC_grp)
    ex.add(tableSmoothedPC_grp+".gz", description=set_file_descr("table_profileCorrected_smoothed_combined_replicates.txt.gz",step=step,type="txt"))

    gzipfile(ex,tablePC)
    ex.add(tablePC+".gz", description=set_file_descr("table_normalised_fit_per_replicates.txt.gz",step=step,type="txt"))

    gzipfile(ex,tableBRICKS2Frags)
    ex.add(tableBRICKS2Frags+".gz", description=set_file_descr("table_frags_in_BRICKS_combined_replicates.txt.gz",step=step,type="txt"))

    return processed
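A hypothetical call sketch for the workflow above; all inputs (`ex`, `job`, `primers_dict`, `assembly`) are assumed to be prepared by the 4C-seq frontend, and the script path is a placeholder:

processed = c4seq_workflow(ex, job, primers_dict, assembly,
                           script_path='/path/to/4cseq/scripts', via='local')
norm_per_group     = processed['4cseq']['norm_grp']      # merged normalised bedGraph per group id
smoothed_per_group = processed['4cseq']['smooth_grp']    # [raw, normalised, profile-corrected] smoothed files per group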
Example #6
def chipseq_workflow(ex,
                     job_or_dict,
                     assembly,
                     script_path='',
                     logfile=sys.stdout,
                     via='lsf'):
    """Runs a chipseq workflow over bam files obtained by mapseq. Will optionally run ``macs`` and 'run_deconv'.

    :param ex: a 'bein' execution environment to run jobs in,

    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with key 'groups', 'files' and 'options' if applicable,

    :param assembly: a genrep.Assembly object,

    :param script_path: only needed if 'run_deconv' is in the job options, must point to the location of the R scripts.

    Default ``macs`` parameters (overridden by ``job_or_dict['options']['macs_args']``) are set as follows:

    * ``'-bw'``: 200 ('bandwidth')

    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*, if available, as *(min(30,5*(T+1)),50*(T+1))*.

    Returns a tuple of (1) a dictionary with keys *group_id* from the job groups, plus *macs* and *deconv* if applicable, whose values are file description dictionaries, and (2) a dictionary mapping *group_ids* to *names* used in file descriptions.
"""
    options = {}
    if logfile is None: logfile = sys.stdout
    if isinstance(job_or_dict, frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict, dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if not ('name' in groups[gid]):
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files', {})
    else:
        raise TypeError(
            "job_or_dict must be a frontend. Job object or a dictionary with key 'groups'."
        )
    merge_strands = int(options.get('merge_strands', -1))
    suffixes = ["fwd", "rev"]
    peak_deconvolution = options.get('peak_deconvolution', False)
    if isinstance(peak_deconvolution, basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1', 'true', 't']
    run_meme = options.get('run_meme', False)
    if isinstance(run_meme, basestring):
        run_meme = run_meme.lower() in ['1', 'true', 't']
    macs_args = options.get('macs_args', ["--bw", "200"])
    b2w_args = options.get('b2w_args', [])
    if not (isinstance(mapseq_files, dict)):
        raise TypeError("Mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
    for gid, mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not (isinstance(mapped, dict)):
            raise TypeError(
                "Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'."
            )
        if 'bam' in mapped:
            mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name + "_" + str(k)
            if not 'stats' in mapped[k]:
                futures[k] = mapseq.bamstats.nonblocking(ex,
                                                         mapped[k]["bam"],
                                                         via=via)
            if mapped[k].get('poisson_threshold', -1) > 0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns) > 0:
            p_thresh[group_name] = sum(ptruns) / len(ptruns)
        for k, f in futures.iteritems():
            mapped[k]['stats'] = f.wait()
        if len(mapped) > 1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid, group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid, group_name))
            read_length.append(mapped.values()[0]['stats']['read_length'])
    genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls) < 1:
        controls = [None]
        names['controls'] = [(0, None)]
    logfile.write("Starting MACS.\n")
    logfile.flush()
    processed = {
        'macs':
        add_macs_results(ex,
                         read_length,
                         genome_size,
                         tests,
                         ctrlbam=controls,
                         name=names,
                         poisson_threshold=p_thresh,
                         macs_args=macs_args,
                         via=via)
    }
    logfile.write("Done MACS.\n")
    logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
    ## select only peaks with p-value <= 10^-0.6 (~0.25), i.e. score = -10*log10(p) >= 6
    _select = {'score': (6, sys.maxint)}
    _fields = ['chr', 'start', 'end', 'name', 'score']
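    # Illustration only (not used below): MACS scores are -10*log10(p-value), so
    # the cutoff of 6 in _select corresponds to p <= 10**(-0.6), about 0.25.
    _score_to_pval = lambda s: 10 ** (-s / 10.0)  # _score_to_pval(6) ~= 0.25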
    for i, name in enumerate(names['tests']):
        if len(names['controls']) < 2:
            ctrl = (name, names['controls'][0])
            macsbed = track(processed['macs'][ctrl] + "_summits.bed",
                            chrmeta=chrlist,
                            fields=_fields).read(selection=_select)
        else:
            macsbed = concatenate([
                apply(track(processed['macs'][(name, x)] + "_summits.bed",
                            chrmeta=chrlist,
                            fields=_fields).read(selection=_select),
                      'name',
                      lambda __n, _n=xn: "%s:%i" % (__n, _n))
                for xn, x in enumerate(names['controls'])
            ])
        ##############################
        macs_neighb = neighborhood(macsbed, before_start=150, after_end=150)
        peak_list[name] = unique_filename_in() + ".sql"
        macs_final = track(peak_list[name],
                           chrmeta=chrlist,
                           info={'datatype': 'qualitative'},
                           fields=['start', 'end', 'name', 'score'])
        macs_final.write(fusion(macs_neighb), clip=True)
        macs_final.close()
        ##############################

    merged_wig = {}
    options['read_extension'] = int(
        options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1:
        options['read_extension'] = read_length[0]
    make_wigs = merge_strands >= 0 or options['read_extension'] > 100
    if options['read_extension'] > 100: options['read_extension'] = 50
    for gid, mapped in mapseq_files.iteritems():
        #            if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        for m in mapped.values():
            if make_wigs or not ('wig' in m) or len(m['wig']) < 2:
                output = mapseq.parallel_density_sql(
                    ex,
                    m["bam"],
                    assembly.chrmeta,
                    nreads=m["stats"]["total"],
                    merge=-1,
                    read_extension=options['read_extension'],
                    convert=False,
                    b2w_args=b2w_args,
                    via=via)
                wig.append(dict((s, output + s + '.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict(
                (s, merge_sql(ex, [x[s] for x in wig], via=via))
                for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]

    if peak_deconvolution:
        ##############################
        def _filter_deconv(stream, pval):
            """Keep deconvolved peaks with FERR <= pval, recentered to +/- 150bp around their midpoint."""
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream(
                ((x[0], ) + ((x[2] + x[1]) / 2 - 150,
                             (x[2] + x[1]) / 2 + 150) + x[3:]
                 for x in stream if "FERR=" in x[3]
                 and float(ferr.search(x[3]).groups()[0]) <= pval),
                fields=stream.fields)

        ##############################
        processed['deconv'] = {}
        for name in names['tests']:
            logfile.write(name[1] + " deconvolution.\n")
            logfile.flush()
            if len(names['controls']) < 2:
                ctrl = (name, names['controls'][0])
                macsbed = processed['macs'][ctrl] + "_peaks.bed"
            else:
                macsbed = intersect_many_bed(ex, [
                    processed['macs'][(name, x)] + "_peaks.bed"
                    for x in names['controls']
                ],
                                             via=via)
            deconv = run_deconv(ex,
                                merged_wig[name[1]],
                                macsbed,
                                assembly.chrmeta,
                                options['read_extension'],
                                script_path,
                                via=via)
            peak_list[name] = unique_filename_in() + ".bed"
            trbed = track(deconv['peaks']).read()
            with track(peak_list[name], chrmeta=chrlist,
                       fields=trbed.fields) as bedfile:
                bedfile.write(fusion(_filter_deconv(trbed, 0.65)))
            ex.add(deconv['peaks'],
                   description=set_file_descr(name[1] + '_peaks.sql',
                                              type='sql',
                                              step='deconvolution',
                                              groupId=name[0]))
            ex.add(deconv['profile'],
                   description=set_file_descr(name[1] + '_deconv.sql',
                                              type='sql',
                                              step='deconvolution',
                                              groupId=name[0]))
            bigwig = unique_filename_in()
            try:
                convert(deconv['profile'], (bigwig, "bigWig"))
                ex.add(bigwig,
                       description=set_file_descr(name[1] + '_deconv.bw',
                                                  type='bigWig',
                                                  ucsc='1',
                                                  step='deconvolution',
                                                  groupId=name[0]))
            except OSError as e:
                logfile.write(str(e))
                logfile.flush()
            ex.add(deconv['pdf'],
                   description=set_file_descr(name[1] + '_deconv.pdf',
                                              type='pdf',
                                              step='deconvolution',
                                              groupId=name[0]))
            processed['deconv'][name] = deconv

    ##############################
    def _join_macs(stream, xlsl, _f):
        """Append the matching row(s) of the parsed MACS xls table(s) to each peak,
        using the peak number parsed from names like 'MACS_peak_12' or 'ID=MACS_peak_12;...'."""
        def _macs_row(_s):
            for _p in _s:
                for _n in _p[3].split("|"):
                    if len(xlsl) == 1:
                        nb = int(
                            _n.split(";")[0][13:]) if _n[:3] == "ID=" else int(
                                _n[10:])
                        yield _p + xlsl[0][nb - 1][1:]
                    else:
                        nb = _n.split(
                            ";")[0][13:] if _n[:3] == "ID=" else _n[10:]
                        nb = nb.split(":")
                        yield _p + xlsl[int(nb[1])][int(nb[0]) - 1][1:]

        return FeatureStream(_macs_row(stream), fields=_f)

    ##############################
    peakfile_list = []
    for name, plist in peak_list.iteritems():
        ptrack = track(plist,
                       chrmeta=chrlist,
                       fields=["chr", "start", "end", "name", "score"])
        peakfile = unique_filename_in()
        xlsh, xlsl = parse_MACS_xls([
            processed['macs'][(name, _c)] + "_peaks.xls"
            for _c in names['controls']
        ])
        try:
            ###### if assembly doesn't have annotations, we skip the "getNearestFeature" but still go through "_join_macs"
            assembly.gene_track()
            _fields = ['chr','start','end','name','score','gene','location_type','distance']\
                +["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile,
                            format='txt',
                            chrmeta=chrlist,
                            fields=_fields)
            peakout.make_header("#" + "\t".join([
                'chromosome', 'start', 'end', 'info', 'peak_height', 'gene(s)',
                'location_type', 'distance'
            ] + _fields[8:]))
            for chrom in assembly.chrnames:
                _feat = assembly.gene_track(chrom)
                peakout.write(_join_macs(
                    getNearestFeature(ptrack.read(selection=chrom), _feat),
                    xlsl, _fields),
                              mode='append')
        except ValueError:
            _fields = ['chr', 'start', 'end', 'name', 'score'
                       ] + ["MACS_%s" % h for h in xlsh[1:5]] + xlsh[5:]
            peakout = track(peakfile,
                            format='txt',
                            chrmeta=chrlist,
                            fields=_fields)
            peakout.make_header("#" + "\t".join(
                ['chromosome', 'start', 'end', 'info', 'peak_height'] +
                _fields[8:]))
            for chrom in assembly.chrnames:
                peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl,
                                         _fields),
                              mode='append')
        peakout.close()
        gzipfile(ex, peakfile)
        peakfile_list.append(
            track(peakfile + ".gz", format='txt', fields=_fields))
        ex.add(peakfile + ".gz",
               description=set_file_descr(name[1] + '_annotated_peaks.txt.gz',
                                          type='text',
                                          step='annotation',
                                          groupId=name[0]))
    stracks = [
        track(wig, info={'name': name + "_" + st})
        for name, wigdict in merged_wig.iteritems()
        for st, wig in wigdict.iteritems()
    ]
    tablefile = unique_filename_in()
    with open(tablefile, "w") as _tf:
        _pnames = [
            "MACS_%s_vs_%s" % (_s[1], _c[1]) if _c[1] else "MACS_%s" % _s[1]
            for _s in names['tests'] for _c in names['controls']
        ]
        _tf.write("\t".join([
            '#chromosome',
            'start',
            'end',
        ] + _pnames + [s.name for s in stracks]) + "\n")
#### need to do something about peak origin (split names, write to separate columns?)
    for chrom in assembly.chrnames:
        pk_lst = [
            apply(pt.read(chrom, fields=['chr', 'start', 'end', 'name']),
                  'name',
                  lambda __n, _n=npt: "%s:%i" % (__n, _n))
            for npt, pt in enumerate(peakfile_list)
        ]
        features = fusion(
            concatenate(pk_lst,
                        fields=['chr', 'start', 'end', 'name'],
                        remove_duplicates=True,
                        group_by=['chr', 'start', 'end']))
        sread = [sig.read(chrom) for sig in stracks]
        quantifs = score_by_feature(sread, features, method='sum')
        nidx = quantifs.fields.index('name')
        _ns = len(tests)
        _nc = len(controls)
        with open(tablefile, "a") as _tf:
            for row in quantifs:
                pcols = [''] * _ns * _nc
                _rnsplit = row[nidx].split(":")
                _n1 = _rnsplit[0]
                _k = 0
                while (_k < len(_rnsplit) - 1 - int(_nc > 1)):
                    if _nc > 1:
                        _k += 2
                        _n2 = _rnsplit[_k - 1]
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0]) * _nc + int(_n2)] = _n1
                    else:
                        _k += 1
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])] = _n1
                    _n1 = "|".join(_n[1:])
                _tf.write("\t".join(
                    str(tt)
                    for tt in row[:nidx] + tuple(pcols) + row[nidx + 1:]) +
                          "\n")
    gzipfile(ex, tablefile)
    ex.add(tablefile + ".gz",
           description=set_file_descr('Combined_peak_quantifications.txt.gz',
                                      type='text',
                                      step='summary'))

    if run_meme:
        from bbcflib.motif import parallel_meme
        logfile.write("Starting MEME.\n")
        logfile.flush()
        processed['meme'] = parallel_meme(
            ex,
            assembly,
            peak_list.values(),
            name=peak_list.keys(),
            chip=True,
            meme_args=['-meme-nmotifs', '4', '-meme-mod', 'zoops'],
            via=via)
    return processed
Example #7
0
def c4seq_workflow(ex,
                   job,
                   primers_dict,
                   assembly,
                   c4_url=None,
                   script_path='',
                   logfile=sys.stdout,
                   via='lsf'):
    '''
    Main
    * open the 4C-seq minilims and create execution
    * 0. get/create the library
    * 1. if necessary, calculate the density file from the bam file (mapseq.parallel_density_sql)
    * 2. calculate the count per fragment for each density file (gfminer's score_by_feature)
    '''
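    # Illustration of step 2 above (not called anywhere): per-fragment counts come
    # from summing a density track over the restriction-fragment features, i.e. a
    # gfminer score_by_feature call with method='sum'. The import path below is an
    # assumption; the real work is done by density_to_countsPerFrag().
    def _counts_per_fragment_sketch(density_sql, fragments_bed):
        from bbcflib.gfminer.stream import score_by_feature  # assumed import path
        dens = track(density_sql).read(fields=['chr', 'start', 'end', 'score'])
        frags = track(fragments_bed, chrmeta=assembly.chrmeta).read()
        return score_by_feature([dens], frags, method='sum')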

    mapseq_files = job.files
    ### outputs
    processed = {'lib': {}, 'density': {}, '4cseq': {}}
    processed['4cseq'] = {
        'density_files': {},
        'countsPerFrag': {},
        'countsPerFrag_grp': {},
        'norm': {},
        'norm_grp': {},
        'profileCorrection': {},
        'profileCorrection_grp': {},
        'smooth_grp': {},
        'domainogram_grp': {},
        'bricks2frags': {}
    }
    # was 'smoothFrag': {}, 'domainogram': {}}
    regToExclude = {}
    new_libs = []
    ### options
    run_domainogram = {}
    before_profile_correction = {}
    if not job.options.get('viewpoints_chrs', False):
        out_chromosomes = ','.join([ch for ch in assembly.chrnames])
    else:
        out_chromosomes = ','.join([
            primers_dict.get(group['name'], {}).get('baitcoord').split(':')[0]
            for gid, group in job.groups.iteritems()
        ])
    print "out_chromosomes=" + out_chromosomes + "\n"

    sizeExt = job.options.get('norm_reg', 1000000)
    print "region considered for normalisation: mid viewpoint +/-" + str(
        sizeExt) + 'bps'

    ### do it
    for gid, group in job.groups.iteritems():
        run_domainogram[gid] = group.get('run_domainogram', False)
        if isinstance(run_domainogram[gid], basestring):
            run_domainogram[gid] = (run_domainogram[gid].lower()
                                    in ['1', 'true', 'on', 't'])
        before_profile_correction[gid] = group.get('before_profile_correction',
                                                   False)
        if isinstance(before_profile_correction[gid], basestring):
            before_profile_correction[gid] = (
                before_profile_correction[gid].lower()
                in ['1', 'true', 'on', 't'])
        processed['lib'][gid] = get_libForGrp(ex,
                                              group,
                                              assembly,
                                              new_libs,
                                              gid,
                                              c4_url,
                                              via=via)
        #reffile='/archive/epfl/bbcf/data/DubouleDaan/library_Nla_30bps/library_Nla_30bps_segmentInfos.bed'
        processed['4cseq']['density_files'][gid] = {}
        regToExclude[gid] = primers_dict.get(group['name'],
                                             {}).get('regToExclude',
                                                     "").replace('\r', '')

        # if no regToExclude defined, set it as mid_baitCoord +/- 5kb
        if len(regToExclude[gid]) == 0:
            baitcoord = primers_dict.get(group['name'], {}).get('baitcoord')
            bait_chr, bait_range = baitcoord.split(':')
            bait_start, bait_end = bait_range.split('-')
            baitcoord_mid = int(0.5 * (int(bait_start) + int(bait_end)))
            regToExclude[gid] = "%s:%i-%i" % (bait_chr, baitcoord_mid - 5000,
                                              baitcoord_mid + 5000)

        #print(';'.join([k+"="+v for k,v in primers_dict.get(group['name'],{}).iteritems()]))
        print(primers_dict.get(group['name'], {}))
        print "regToExclude[" + str(gid) + "]=" + regToExclude[gid]
        for rid, run in group['runs'].iteritems():
            libname = mapseq_files[gid][rid]['libname']
            if job.options.get('merge_strands') != 0 or not (
                    'wig' in mapseq_files[gid][rid]):
                density_file = parallel_density_sql(
                    ex,
                    mapseq_files[gid][rid]['bam'],
                    assembly.chrmeta,
                    nreads=mapseq_files[gid][rid]['stats']["total"],
                    merge=0,
                    read_extension=mapseq_files[gid][rid]['stats']
                    ['read_length'],
                    convert=False,
                    via=via)
                density_file += "merged.sql"
                ex.add(density_file,
                       description=set_file_descr("density_file_" + libname +
                                                  ".sql",
                                                  groupId=gid,
                                                  step="density",
                                                  type="sql",
                                                  view='admin',
                                                  gdv="1"))
            else:
                density_file = mapseq_files[gid][rid]['wig']['merged']
            #density_files.append(density_file)
            processed['4cseq']['density_files'][gid][rid] = density_file

        # back to grp level!
        # not anymore:
        # processed['density'][gid] = merge_sql(ex, density_files, via=via)

    processed['4cseq']['countsPerFrag'] = density_to_countsPerFrag(
        ex, processed, job.groups, assembly, regToExclude, script_path, via)
    ## access per gid+rid

    futures_norm = {}
    countsPerFrags_bedGraph = {}
    futures_merged_raw = {}
    for gid, group in job.groups.iteritems():
        futures_norm[gid] = {}
        countsPerFrags_bedGraph[gid] = {}
        processed['4cseq']['norm'][gid] = {}
        for rid, run in group['runs'].iteritems():
            normfile = unique_filename_in()
            touch(ex, normfile)
            resfile = unique_filename_in() + ".bedGraph"
            resfiles = processed['4cseq']['countsPerFrag'][gid][
                rid]  # _all.sql
            convert(resfiles[3], resfile)
            countsPerFrags_bedGraph[gid][rid] = resfile

            print "call normFrags: infiles=" + resfile + ", normfile=" + normfile + "baitCoord=" + primers_dict[
                group['name']][
                    'baitcoord'] + ", sizeExt=sizeExt, name=" + group[
                        'name'] + "rep_" + str(
                            rid) + "regToExclude=" + regToExclude[gid] + "\n"
            futures_norm[gid][rid] = normFrags.nonblocking(
                ex,
                resfile,
                normfile,
                baitCoord=primers_dict[group['name']]['baitcoord'],
                sizeExt=sizeExt,
                name=group['name'] + "rep_" + str(rid),
                regToExclude=regToExclude[gid],
                script_path=script_path,
                via=via)
            processed['4cseq']['norm'][gid][rid] = normfile

        if len(group) > 1:
            ## merge replicates before normalisation.
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name'] + "_raw_mergedRep"
            print "gid=" + group['name']
            print "call mergeRep for replicates before normalisation: infiles=" + ",".join(
                [
                    res_rid for rid, res_rid in
                    countsPerFrags_bedGraph[gid].iteritems()
                ]
            ) + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[
                gid] + "\n"
            futures_merged_raw[gid] = mergeRep.nonblocking(
                ex,
                ",".join([
                    res_rid for rid, res_rid in
                    countsPerFrags_bedGraph[gid].iteritems()
                ]),
                mergefile,
                regToExclude[gid],
                name=titleName,
                script_path=script_path,
                via=via,
                memory=8)
            processed['4cseq']['countsPerFrag_grp'][gid] = mergefile
        else:
            futures_merged_raw[gid] = None
            # if no replicates, the file we want is the first (only) one
            processed['4cseq']['countsPerFrag_grp'][gid] = countsPerFrags_bedGraph[gid][0]

    print "***** profile correction / sample + merge normalised data"
    futures_merged = {}  # per gid
    futures_profcor = {}  # per gid, per rid
    for gid, group in job.groups.iteritems():
        ## run profile correction per run then merge them
        futures_profcor[gid] = {}
        processed['4cseq']['profileCorrection'][gid] = {}
        for rid, run in group['runs'].iteritems():
            # wait for normalisation of all replicates to be finished
            futures_norm[gid][rid].wait()  ## normalised files, per grp, per rep
            normfile = processed['4cseq']['norm'][gid][rid]
            file1 = unique_filename_in()  #track file
            touch(ex, file1)
            file2 = unique_filename_in()  #report file
            touch(ex, file2)
            file3 = unique_filename_in()  #table file
            touch(ex, file3)
            print "call profileCorrection: normfile=" + normfile + ", baitCoord=" + primers_dict[
                group['name']]['baitcoord'] + ", name=" + group[
                    'name'] + ", file1=" + file1 + ", file2=" + file2 + ", file3= " + file3 + "\n"
            futures_profcor[gid][rid] = profileCorrection.nonblocking(
                ex,
                normfile,
                primers_dict[group['name']]['baitcoord'],
                group['name'],
                file1,
                file2,
                file3,
                script_path,
                via=via)
            processed['4cseq']['profileCorrection'][gid][rid] = [
                file1, file2, file3
            ]

        ## merge replicates before profile correction. Needs all normalisation for the given grp to be finished, this is why it comes after the rid loop.
        if len(group) > 1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name'] + "_norm_mergedRep"
            print "gid=" + group['name']
            print "call mergeRep: infiles=" + ",".join([
                res_rid for rid, res_rid in processed['4cseq']['norm']
                [gid].iteritems()
            ]) + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[
                gid] + "\n"
            futures_merged[gid] = mergeRep.nonblocking(
                ex,
                ",".join([
                    res_rid for rid, res_rid in processed['4cseq']['norm']
                    [gid].iteritems()
                ]),
                mergefile,
                regToExclude[gid],
                name=titleName,
                script_path=script_path,
                via=via,
                memory=8)
            processed['4cseq']['norm_grp'][gid] = mergefile
        else:
            futures_merged[gid] = None
            # if no replicates, the file we want is the first (only) one
            processed['4cseq']['norm_grp'][gid] = processed['4cseq']['norm'][gid][0]

    print "***** merge profile corrected data"
    futures_profcor_merged = {}  # per gid
    for gid, group in job.groups.iteritems():
        processed['4cseq']['profileCorrection_grp'][gid] = {}
        for rid, run in group['runs'].iteritems():
            futures_profcor[gid][rid].wait()  ## wait for ProfileCorrection to be finished

        ## merge replicates after profile correction
        if len(group) > 1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name'] + "_ProfCor_mergedRep"
            pcfiles = [
                processed['4cseq']['profileCorrection'][gid][rid][0]
                for rid, res_rid in processed['4cseq']['profileCorrection']
                [gid].iteritems()
            ]
            print "call mergeRep (for PC tables): infiles=" + ",".join(
                pcfiles
            ) + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[
                gid] + "\n"
            futures_profcor_merged[gid] = mergeRep.nonblocking(
                ex,
                ",".join(pcfiles),
                mergefile,
                regToExclude[gid],
                name=titleName,
                script_path=script_path,
                via=via,
                memory=8)
            processed['4cseq']['profileCorrection_grp'][gid] = mergefile
        else:
            futures_profcor_merged[gid] = None
            # if no replicates, the file we want is the first (only) one
            processed['4cseq']['profileCorrection_grp'][gid] = processed['4cseq']['profileCorrection'][gid][0]

    print "***** smooth data"
    futures_smoothed = {}
    for gid, group in job.groups.iteritems():
        file1 = unique_filename_in()
        touch(ex, file1)
        file2 = unique_filename_in()
        touch(ex, file2)
        file3 = unique_filename_in()
        touch(ex, file3)
        nFragsPerWin = group['window_size']
        futures_merged_raw[gid].wait()  ## wait for merging of raw_grp to be completed
        futures_smoothed[gid] = (smoothFragFile.nonblocking(
            ex,
            processed['4cseq']['countsPerFrag_grp'][gid],
            nFragsPerWin,
            group['name'],
            file1,
            regToExclude[gid],
            script_path=script_path,
            via=via,
            memory=6), )
        futures_merged[gid].wait()  ## wait for merging of norm_grp to be completed
        futures_smoothed[gid] += (smoothFragFile.nonblocking(
            ex,
            processed['4cseq']['norm_grp'][gid],
            nFragsPerWin,
            group['name'] + "_norm",
            file2,
            regToExclude[gid],
            script_path=script_path,
            via=via,
            memory=6), )
        futures_profcor_merged[gid].wait()  # wait for the merging of profile corrected data to be done
        futures_smoothed[gid] += (smoothFragFile.nonblocking(
            ex,
            processed['4cseq']['profileCorrection_grp'][gid],
            nFragsPerWin,
            group['name'] + "_fromProfileCorrected",
            file3,
            regToExclude[gid],
            script_path=script_path,
            via=via,
            memory=6), )
        processed['4cseq']['smooth_grp'][gid] = [
            file1, file2, file3
        ]  #[smoothed_file_before_Norm, smoothed file before PC, smoothed file after PC]

    print "***** Domainograms"
    futures_domainograms = {}
    for gid, group in job.groups.iteritems():
        grName = job.groups[gid]['name']
        if run_domainogram[gid]:
            regCoord = regToExclude[gid] or primers_dict[grName]['baitcoord']
            if before_profile_correction[gid]:
                futures_domainograms[gid] = runDomainogram.nonblocking(
                    ex,
                    processed['4cseq']['norm_grp'][gid],
                    grName,
                    regCoord=regCoord,
                    skip=1,
                    script_path=script_path,
                    via=via,
                    memory=15)
            else:
                futures_domainograms[gid] = runDomainogram.nonblocking(
                    ex,
                    processed['4cseq']['profileCorrection_grp'][gid],
                    grName,
                    regCoord=regCoord.split(':')[0],
                    skip=1,
                    script_path=script_path,
                    via=via,
                    memory=15)

    ## prepare tar files for domainogram results (if any)
    ## and create "BRICKS to frags" files
    print "***** BRICKS to Frags"
    futures_BRICKS2Frags = {}
    for gid, f in futures_domainograms.iteritems():
        if run_domainogram[gid]:  # if domainogram has been run
            resFiles = []
            logFile = f.wait()
            start = False
            tarname = job.groups[gid]['name'] + "_domainogram.tar.gz"
            res_tar = tarfile.open(tarname, "w:gz")
            futures_BRICKS2Frags[gid] = []
            processed['4cseq']['bricks2frags'][gid] = []
            if logFile is None: continue
            with open(logFile) as f:
                for s in f:
                    s = s.strip()
                    if '####resfiles####' in s:
                        start = True
                    elif start and "RData" not in s:
                        resFiles.append(s)
                        res_tar.add(s)
                    if start and "foundBRICKS" in s:
                        bricks2fragsfile = unique_filename_in() + ".bedGraph"
                        touch(ex, bricks2fragsfile)
                        futures_BRICKS2Frags[gid] += [
                            BRICKSToFrag.nonblocking(
                                ex,
                                s,
                                processed['4cseq']['norm_grp'][gid],
                                bricks2fragsfile,
                                script_path=script_path,
                                via=via,
                                memory=4)
                        ]
                        processed['4cseq']['bricks2frags'][gid] += [
                            bricks2fragsfile
                        ]
            res_tar.close()
            processed['4cseq']['domainogram_grp'][gid] = resFiles + [tarname]

############### prepare tables for global results
    print "***** combine results into tables "
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        for rid, run in group['runs'].iteritems():
            allNames += [
                group['name'] + "_rep" + str(rid) + "_norm",
                group['name'] + "_rep" + str(rid) + "_fit"
            ]
            allFiles += [processed['4cseq']['profileCorrection'][gid][rid][2]]
            allRegToExclude += [regToExclude[gid]]
    tablePC = unique_filename_in() + ".txt"
    print("***will call makeTable with:")
    print(",".join(allFiles))
    print("resfile=" + tablePC)
    print(",".join(allNames))
    touch(ex, tablePC)

    #regToExclude[gid]

    futures_tables = (makeTable.nonblocking(
        ex,
        ",".join(allFiles),
        tablePC,
        ",".join(allNames),
        idCols="4,5",
        all_regToExclude=','.join(allRegToExclude),
        script_path=script_path,
        via=via,
        memory=8), )

    # wait for all smoothing to be done
    for gid, fg in futures_smoothed.iteritems():
        for f in fg:
            f.wait()

    ## make Table raw/smoothed_raw
    print("** make Table raw/smoothed_raw")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        futures_merged_raw[gid].wait()
        allNames += [group['name'] + "_raw", group['name'] + "_rawSmoothed"]
        allFiles += [
            processed['4cseq']['countsPerFrag_grp'][gid],
            processed['4cseq']['smooth_grp'][gid][0]
        ]
        allRegToExclude += ['NA', regToExclude[gid]]

    tableSmoothedRaw_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothedRaw_grp)
    futures_tables += (makeTable.nonblocking(
        ex,
        ",".join(allFiles),
        tableSmoothedRaw_grp,
        ",".join(allNames),
        idCols="4",
        out_chromosomes=out_chromosomes,
        all_regToExclude=','.join(allRegToExclude),
        script_path=script_path,
        via=via,
        memory=8), )

    ## make Table norm/smoothed_norm before PC
    print("** make Table norm/smoothed_norm befor PC")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        allNames += [group['name'] + "_norm", group['name'] + "_smoothed"]
        allFiles += [
            processed['4cseq']['norm_grp'][gid],
            processed['4cseq']['smooth_grp'][gid][1]
        ]
        allRegToExclude += [regToExclude[gid], regToExclude[gid]]

    tableSmoothed_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothed_grp)
    futures_tables += (makeTable.nonblocking(
        ex,
        ",".join(allFiles),
        tableSmoothed_grp,
        ",".join(allNames),
        idCols="4",
        out_chromosomes=out_chromosomes,
        all_regToExclude=','.join(allRegToExclude),
        script_path=script_path,
        via=via,
        memory=8), )

    ## make Table norm/smoothed_norm after PC
    print("** make Table norm/smoothed_norm after PC")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        allNames += [group['name'] + "_normPC", group['name'] + "_smoothedPC"]
        allFiles += [
            processed['4cseq']['profileCorrection_grp'][gid],
            processed['4cseq']['smooth_grp'][gid][2]
        ]
        allRegToExclude += [regToExclude[gid], regToExclude[gid]]

    tableSmoothedPC_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothedPC_grp)
    futures_tables += (makeTable.nonblocking(
        ex,
        ",".join(allFiles),
        tableSmoothedPC_grp,
        ",".join(allNames),
        idCols="4",
        out_chromosomes=out_chromosomes,
        all_regToExclude=','.join(allRegToExclude),
        script_path=script_path,
        via=via,
        memory=8), )

    ## combine BRICKS2Frags files
    allNames = []
    allFiles = []
    for gid, fg in futures_BRICKS2Frags.iteritems():
        for f in fg:
            f.wait()
        allNames += [job.groups[gid]['name'] + "_BRICKSpval"]
        cat_bricks2frags = unique_filename_in() + ".txt"
        print ','.join(processed['4cseq']['bricks2frags'][gid])
        cat_bricks2frags = cat(processed['4cseq']['bricks2frags'][gid],
                               out=cat_bricks2frags)
        allFiles += [cat_bricks2frags]

    for gid, fg in futures_smoothed.iteritems():
        for f in fg:
            f.wait()

    tableBRICKS2Frags = unique_filename_in() + ".txt"
    touch(ex, tableBRICKS2Frags)
    futures_tables += (makeTable.nonblocking(ex,
                                             ",".join(allFiles),
                                             tableBRICKS2Frags,
                                             ",".join(allNames),
                                             idCols="4",
                                             out_chromosomes=out_chromosomes,
                                             defVal="NA",
                                             script_path=script_path,
                                             via=via,
                                             memory=8), )

    for f in futures_tables:
        f.wait()

    ################ Add everything to minilims below!
    step = "density"
    for gid in processed['4cseq']['density_files'].keys():
        for rid, sql in processed['4cseq']['density_files'][gid].iteritems():
            fname = "density_file_" + job.groups[gid][
                'name'] + "_merged_rep" + str(rid)
            ex.add(sql,
                   description=set_file_descr(fname + ".sql",
                                              groupId=gid,
                                              step=step,
                                              type="sql",
                                              gdv="1"))
            wig = unique_filename_in() + ".bw"
            convert(sql, wig)
            ex.add(wig,
                   description=set_file_descr(fname + ".bw",
                                              groupId=gid,
                                              step=step,
                                              type="bigWig",
                                              ucsc="1"))
    step = "counts_per_frag"  #was _norm_counts_per_frags # before normalisation process, per replicate
    for gid in processed['4cseq']['countsPerFrag'].keys():
        for rid, resfiles in processed['4cseq']['countsPerFrag'][
                gid].iteritems():
            fname = "meanScorePerFeature_" + job.groups[gid][
                'name'] + "_rep" + str(rid)
            ex.add(resfiles[1],
                   description=set_file_descr(fname + ".sql",
                                              groupId=gid,
                                              step=step,
                                              type="sql",
                                              view="admin",
                                              gdv='1'))
            #gzipfile(ex,resfiles[0])
            #ex.add( resfiles[0]+".gz", description=set_file_descr( fname+".bed.gz",
            #                                                       groupId=gid,step=step,type="bed",view="admin" ))
            fname = "segToFrag_" + job.groups[gid]['name'] + "_rep" + str(rid)
            ex.add(resfiles[3],
                   description=set_file_descr(
                       fname + "_all.sql",
                       groupId=gid,
                       step=step,
                       type="sql",
                       comment="all informative frags - null included"))
            trsql = track(resfiles[3])
            bwig = unique_filename_in() + ".bw"
            trwig = track(bwig, chrmeta=trsql.chrmeta)
            trwig.write(
                trsql.read(fields=['chr', 'start', 'end', 'score'],
                           selection={'score': (0.01, sys.maxint)}))
            trwig.close()
            ex.add(
                bwig,
                set_file_descr(fname + ".bw",
                               groupId=gid,
                               step=step,
                               type="bigWig",
                               ucsc='1'))
        ## add segToFrags before normalisation
        futures_merged_raw[gid].wait()
        trbedgraph = track(removeNA(
            processed['4cseq']['countsPerFrag_grp'][gid]),
                           format='bedgraph')
        bwig = unique_filename_in() + ".bw"
        trwig = track(bwig, chrmeta=assembly.chrmeta)
        trwig.write(
            trbedgraph.read(fields=['chr', 'start', 'end', 'score'],
                            selection={'score': (0.01, sys.maxint)}))
        trwig.close()
        fname = "segToFrag_" + job.groups[gid]['name']
        ex.add(bwig,
               description=set_file_descr(
                   fname + ".bw",
                   groupId=gid,
                   step=step,
                   type="bigWig",
                   comment="segToFrag file before normalisation"))

    step = "norm_counts_per_frags"  # after new normalisation process, combined replicates
    for gid, resfile in processed['4cseq']['norm_grp'].iteritems():
        fname = "normalised_scorePerFeature_" + job.groups[gid]['name']
        gzipfile(ex, resfile)
        ex.add(resfile + ".gz",
               description=set_file_descr(fname + ".bedGraph.gz",
                                          groupId=gid,
                                          step=step,
                                          type="bedGraph",
                                          ucsc='1'))
    # norm files, per replicates (might be removed)
    for gid, dict_gid in processed['4cseq']['norm'].iteritems():
        for rid, resfile in dict_gid.iteritems():
            fname = "normalised_scorePerFeature_" + job.groups[gid][
                'name'] + "_rep" + str(rid)
            gzipfile(ex, resfile)
            ex.add(resfile + ".gz",
                   description=set_file_descr(fname + ".bedGraph.gz",
                                              groupId=gid,
                                              step=step,
                                              type="bedGraph",
                                              ucsc='1',
                                              gdv='1'))
    step = "profile_correction"  # Profile corrected data, combined replicates
    for gid, profileCorrectedFile in processed['4cseq'][
            'profileCorrection_grp'].iteritems():
        fname = "segToFrag_" + job.groups[gid]['name'] + "_profileCorrected"
        gzipfile(ex, profileCorrectedFile)
        ex.add(profileCorrectedFile + ".gz",
               description=set_file_descr(fname + ".bedGraph.gz",
                                          groupId=gid,
                                          step=step,
                                          type="bedGraph",
                                          ucsc='1',
                                          gdv='1'))
    # Profile corrected, per replicate (might be removed)
    for gid, dict_gid in processed['4cseq']['profileCorrection'].iteritems():
        for rid, resfiles in dict_gid.iteritems():
            #        profileCorrectedFile = resfiles[0]
            reportProfileCorrection = resfiles[1]
            fname = "segToFrag_" + job.groups[gid][
                'name'] + "_profileCorrected_rep" + str(rid)
            #        gzipfile(ex,profileCorrectedFile)
            #       ex.add( profileCorrectedFile+".gz",
            #              description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
            ex.add(reportProfileCorrection,
                   description=set_file_descr(fname + ".pdf",
                                              groupId=gid,
                                              step=step,
                                              type="pdf"))
    step = "smoothing"
    for gid, resfiles in processed['4cseq']['smooth_grp'].iteritems():
        rawSmoothFile = resfiles[0]
        smoothFile = resfiles[1]
        afterProfileCorrection = resfiles[2]
        nFrags = str(job.groups[gid]['window_size'])
        ## smoothed file before normalisation
        fname = "segToFrag_" + job.groups[gid][
            'name'] + "_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, rawSmoothFile)
        ex.add(rawSmoothFile + ".gz",
               description=set_file_descr(fname,
                                          groupId=gid,
                                          step=step,
                                          type="bedGraph",
                                          ucsc='1',
                                          gdv='1'))
        ## smoothed file after normalisation, before Profile correction
        fname = "segToFrag_" + job.groups[gid][
            'name'] + "_norm_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, smoothFile)
        ex.add(smoothFile + ".gz",
               description=set_file_descr(fname,
                                          groupId=gid,
                                          step=step,
                                          type="bedGraph",
                                          ucsc='1',
                                          gdv='1'))
        ## smoothed file after normalisation, after Profile correction
        fname = "segToFrag_" + job.groups[gid][
            'name'] + "_profileCorrected_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, afterProfileCorrection)
        ex.add(afterProfileCorrection + ".gz",
               description=set_file_descr(fname,
                                          groupId=gid,
                                          step=step,
                                          type="bedGraph",
                                          ucsc='1',
                                          gdv='1'))

    step = "domainograms"
    for gid, resfiles in processed['4cseq']['domainogram_grp'].iteritems():
        tarFile = resfiles.pop()
        fname = job.groups[gid]['name'] + "_domainogram.tar.gz"
        ex.add(tarFile,
               description=set_file_descr(fname,
                                          groupId=gid,
                                          step=step,
                                          type="tgz"))
        for s in resfiles:
            if s[-8:] == "bedGraph":
                gzipfile(ex, s)
                s += ".gz"
                ex.add(s,
                       description=set_file_descr(s,
                                                  groupId=gid,
                                                  step=step,
                                                  type="bedGraph",
                                                  ucsc="1",
                                                  gdv="1"))

    step = "combined_results"
    gzipfile(ex, tableSmoothedRaw_grp)
    ex.add(tableSmoothedRaw_grp + ".gz",
           description=set_file_descr(
               "table_segToFrags_smoothed_combined_replicates.txt.gz",
               step=step,
               type="txt"))

    gzipfile(ex, tableSmoothed_grp)
    ex.add(tableSmoothed_grp + ".gz",
           description=set_file_descr(
               "table_normalised_smoothed_combined_replicates.txt.gz",
               step=step,
               type="txt"))

    gzipfile(ex, tableSmoothedPC_grp)
    ex.add(tableSmoothedPC_grp + ".gz",
           description=set_file_descr(
               "table_profileCorrected_smoothed_combined_replicates.txt.gz",
               step=step,
               type="txt"))

    gzipfile(ex, tablePC)
    ex.add(tablePC + ".gz",
           description=set_file_descr(
               "table_normalised_fit_per_replicates.txt.gz",
               step=step,
               type="txt"))

    gzipfile(ex, tableBRICKS2Frags)
    ex.add(tableBRICKS2Frags + ".gz",
           description=set_file_descr(
               "table_frags_in_BRICKS_combined_replicates.txt.gz",
               step=step,
               type="txt"))

    return processed
Example #8
0
def parallel_meme( ex, assembly, regions, name=None, chip=False, meme_args=None, via='lsf' ):
    """Fetches sequences, then calls ``meme`` on them and finally saves the results in the repository.
    
    """
    if meme_args is None: meme_args = []
    if not(isinstance(regions,list)): regions = [regions]
    if not(isinstance(name,list)): name = [name or '_']
    futures = {}
    fasta_files = {}
    background = assembly.statistics(unique_filename_in(),frequency=True)
#    genomeRef = assembly.untar_genome_fasta()
    for i,n in enumerate(name):
        (fasta, size) = assembly.fasta_from_regions( regions[i], ex=ex )
        tmpfile = unique_filename_in()
        outdir = unique_filename_in()
        if chip:
            futures[n] = (outdir, memechip.nonblocking( ex, fasta, outdir, background,
                                                        args=meme_args, via=via, 
                                                        stderr=tmpfile, memory=6 ))
        else:
            futures[n] = (outdir, meme.nonblocking( ex, fasta, outdir, background,
                                                    maxsize=(size*3)/2, args=meme_args,
                                                    via=via, stderr=tmpfile, memory=6 ))
        fasta_files[n] = fasta
    all_res = {}
    for n,f in futures.iteritems():
        f[1].wait()
        meme_out = f[0]
        archive = unique_filename_in()
        tgz = tarfile.open(archive, "w:gz")
        tgz.add( meme_out, arcname=n[1]+"_meme",
                 exclude=lambda x: os.path.basename(x) in [fasta_files[n],background] )
        tgz.close()
        ex.add( archive, description=set_file_descr(n[1]+"_meme.tgz",
                                                    step='meme', type='tar',
                                                    groupId=n[0]) )
        gzipfile(ex,fasta_files[n],args=["-f"])
        ex.add( fasta_files[n]+".gz",
                description=set_file_descr(n[1]+"_sites.fa.gz",
                                           step='meme', type='fasta',
                                           groupId=n[0]) )
        if not(chip) and os.path.exists(os.path.join(meme_out, "meme.xml")):
            meme_res = parse_meme_xml( ex, os.path.join(meme_out, "meme.xml"),
                                       assembly.chrmeta )
            if os.path.exists(os.path.join(meme_out, "meme.html")):
                ex.add( os.path.join(meme_out, "meme.html"),
                        description=set_file_descr(n[1]+"_meme.html",
                                                   step='meme', type='html', 
                                                   groupId=n[0]) )
            ex.add( meme_res['sql'], description=set_file_descr(n[1]+"_meme_sites.sql",
                                                                step='meme', type='sql',
                                                                groupId=n[0]) )
            for i,motif in enumerate(meme_res['matrices'].keys()):
                ex.add( meme_res['matrices'][motif],
                        description=set_file_descr(n[1]+"_meme_"+motif+".txt",
                                                   step='meme', type='txt', 
                                                   groupId=n[0]) )
                ex.add( os.path.join(meme_out, "logo"+str(i+1)+".png"),
                        description=set_file_descr(n[1]+"_meme_"+motif+".png",
                                                   step='meme', type='png', 
                                                   groupId=n[0]) )
            all_res[n] = meme_res
    return all_res