Exemplo n.º 1
0
def parse_fragFile(fragfile,chrom_dict=None):
    """
    Parse fragment file to create segment info bed file and fragment bed file.

    The first line of *fragfile* is treated as a header and skipped; an empty
    file simply produces two empty output files. Lines containing
    'FragIsNotValid' are ignored. Every other line is split on tabs and read
    by column position: 0 = fragment id, 1 = chromosome, 2/3 = fragment
    start/end, 5/6 = start segment, 8/9 = end segment, 10 = index of the
    second restriction site occurrence, last column = status.

    :param fragfile: path to the tab-delimited fragment file.
    :param chrom_dict: optional mapping applied to the chromosome column;
        names missing from the mapping are kept unchanged.
    :return: ``[segInfoBedFile, fragmentBedFile]`` (names of the files written).
    """
    if chrom_dict is None:
        chrom_dict = {}
    segInfoBedFile = unique_filename_in()
    fragmentBedFile = unique_filename_in()
    # All three handles are managed by `with` so they are closed even if a
    # malformed line raises in the middle of the loop.
    with open(segInfoBedFile,'w') as o, open(fragmentBedFile,'w') as obed, open(fragfile,'r') as f:
        next(f, None)  # skip the header line (no error on an empty file)
        for line in f:
            if 'FragIsNotValid' in line: continue
            s = line.strip().split('\t')
            chrom = chrom_dict.get(s[1],s[1])
            # The four trailing '0' fields are placeholders appended to the name.
            fragmentInfo = '|'.join(['',s[0],chrom+':'+str(int(s[2])+1)+'-'+s[3],
                                     'indexOfSecondRestSiteOcc='+s[10],
                                     'status='+s[-1],'length='+str(int(s[3])-int(s[2])),
                                     '0','0','0','0'])
            o.write('\t'.join([chrom,s[5],s[6],'type=startSegment'+fragmentInfo])+'\n')
            o.write('\t'.join([chrom,s[8],s[9],'type=endSegment'+fragmentInfo])+'\n')
            # Fragment bed: the whole fragment, then its start and end segments.
            name = 'frag'+s[0]
            obed.write('\t'.join([chrom,s[2],s[3],name])+'\n')
            obed.write('\t'.join([chrom,s[5],s[6],name])+'_startSeq\n')
            obed.write('\t'.join([chrom,s[8],s[9],name])+'_endSeq\n')
    return([segInfoBedFile,fragmentBedFile])
Exemplo n.º 2
0
def motif_scan( ex, bedlist, assembly, groups, via, logfile ):
    """
    Run `save_motif_profile` for every group of *bedlist* and return ``{gid: result}``.

    Each entry of ``groups[gid]['motif']`` is resolved, in this order, as:
    an existing path, a path relative to the parent of
    ``ex.remote_working_directory``, or a string "<id> <name>" that is
    fetched through ``_gnrp.get_motif_PWM``. Progress is written to *logfile*.
    """
    def _log(text):
        logfile.write(text)
        logfile.flush()

    _log("Scanning motifs\n")
    results = {}
    parent_dir = os.path.split(ex.remote_working_directory)[0]
    for gid,bedfile in bedlist.iteritems():
        _log("\n%i: "%gid)
        group = groups[gid]
        pwm_files = {}
        for mot in group.get('motif',[]):
            if os.path.exists(mot):
                mname = os.path.basename(os.path.splitext(mot)[0])
                pwm_files[mname] = mot
            elif os.path.exists(os.path.join(parent_dir,mot)):
                mname = os.path.basename(os.path.splitext(mot)[0])
                pwm_files[mname] = os.path.join(parent_dir,mot)
            else:
                _gnid, mname = mot.split(' ')
                pwm_files[mname] = _gnrp.get_motif_PWM(int(_gnid), mname,
                                                       output=unique_filename_in())
            _log(mname+", ")
        group_name = group['name']
        _descr = set_file_descr(group_name+'_motifs.bed',
                                type='bed', ucsc='1', step='motifs', groupId=gid)
        _out = unique_filename_in()
        _hd = "track name='%s_motifs'" %group_name
        results[gid] = save_motif_profile( ex, pwm_files, assembly, bedfile,
                                           keep_max_only=True, output=_out,
                                           header=_hd, description=_descr, via=via )
    return results
Exemplo n.º 3
0
def plot_footprint_profile(ex, bedlist, signals, chrnames, groups, logfile):
    """
    Plot, per group, the average signal profile around each motif's sites.

    For every group id in *bedlist*, the motif bed track is read chromosome by
    chromosome; regions are grouped by (motif name, motif length) taken from
    the "name:sequence" bed name field. Signals from ``signals[gid]`` are
    accumulated per bin and divided by the number of regions. One pdf
    (via `lineplot`) and one tab-delimited matrix file per motif are written.

    Returns ``{gid: {'pdf': filename, 'mat': [(motif name, matrix file), ...]}}``.
    *ex* and *groups* are not used by this function.
    """
    files = {}
    for gid in bedlist.keys():
        files[gid] = {'pdf': "", 'mat': []}
    logfile.write("Plotting footprints:\n")
    logfile.flush()
    for gid, motifbed in bedlist.iteritems():
        snames = [sig.name for sig in signals[gid]]
        tmotif = track(motifbed, format='bed')
        profiles = {}
        region_counts = {}
        for chrom in chrnames:
            # Group this chromosome's regions by (motif name, motif length).
            by_motif = {}
            for r in tmotif.read(chrom):
                name_parts = r[3].split(":")
                motif_key = (name_parts[0], len(name_parts[1]))
                by_motif.setdefault(motif_key, []).append(r[1:3])
            for motif, regs in by_motif.iteritems():
                if motif not in profiles:
                    n_rows = motif[1] + 2 * _plot_flank[1]
                    profiles[motif] = zeros(shape=(n_rows, len(signals[gid])))
                    region_counts[motif] = 0
                region_counts[motif] += len(regs)
                regions = FeatureStream(regs, fields=['start', 'end'])
                segments = segment_features(regions,
                                            nbins=motif[1],
                                            upstream=_plot_flank,
                                            downstream=_plot_flank)
                tFeat = sorted_stream(segments)
                sig_streams = [s.read(chrom) for s in signals[gid]]
                for t in score_by_feature(sig_streams, tFeat):
                    profiles[motif][t[2]] += t[3:]
        pdf_name = unique_filename_in()
        files[gid]['pdf'] = pdf_name
        is_first = True
        remaining = len(profiles)
        for motif, dat in profiles.iteritems():
            remaining -= 1
            mname, nbins = motif
            # In-place division: turns the accumulated sums into averages.
            dat /= float(region_counts[motif])
            flank = _plot_flank[1]
            X = range(-flank, flank + nbins)
            # Bins inside the motif are labelled "1".."nbins" instead of offsets.
            for k in range(nbins):
                X[k + flank] = str(k + 1)
            curves = [dat[:, n] for n in range(dat.shape[-1])]
            lineplot(X, curves,
                     mfrow=[4, 2],
                     output=pdf_name,
                     new=is_first,
                     last=(remaining == 0),
                     legend=snames,
                     main=mname)
            is_first = False
            _datf = unique_filename_in()
            with open(_datf, "w") as dff:
                header = [""] + [str(x) for x in X]
                dff.write("\t".join(header) + "\n")
                for n, sn in enumerate(snames):
                    row = [sn] + [str(x) for x in dat[:, n]]
                    dff.write("\t".join(row) + "\n")
            files[gid]['mat'].append((mname, _datf))
    return files
Exemplo n.º 4
0
def wellington( bed, bam, output=None, options=[] ):
    """
    Binds the ``wellington_footprints.py`` program: `<http://pythonhosted.org/pyDNase/scripts.html#wellington-footprints-py>`_.

    A new output directory is created on every call. Returns a dictionary
    with the command line under 'arguments' and the tuple
    ``(outdir, output)`` under 'return_value'.
    """
    if output is None:
        output = unique_filename_in()
    outdir = unique_filename_in()
    os.mkdir(outdir)
    command = ["wellington_footprints.py","-o",output] + options + [bed,bam,outdir]
    return {'arguments': command, 'return_value': (outdir,output)}
Exemplo n.º 5
0
def FDR_threshold( ex, motif, background, assembly, regions, alpha=.1, nb_samples=1, via='lsf' ):
    """
    Computes a score threshold for 'motif' on 'regions' based on a false discovery rate < alpha and returns the
    threshold or a dictionary with keys thresholds and values simulated FDRs when alpha < 0.

    The motif is scanned once on the sequences of 'regions' ("true" scores) and
    'nb_samples' times on the sequences obtained with ``shuffled=True``
    ("false" scores). Scores are rounded to integers. For each observed score,
    the ratio (false hits >= score) / (true hits >= score) is computed and the
    lowest score whose ratio is <= alpha is returned. If no score qualifies,
    the returned threshold is the highest observed score + 1.

    NOTE(review): the false-hit counts are summed over all 'nb_samples' scans
    and are not divided by 'nb_samples' -- confirm this is intended.
    NOTE(review): if the scan on 'regions' yields no line, ``scores[0]`` below
    raises IndexError.
    """
    fasta, size = assembly.fasta_from_regions( regions, ex=ex )
    shuf_fasta, shuf_size = assembly.fasta_from_regions( regions, shuffled=True, ex=ex )
    output = unique_filename_in()
#### Threshold at -100 to get all scores!
    future = motif_scan.nonblocking( ex, fasta, motif, background, -100, stdout=output, via=via )
    # Launch every shuffled scan before waiting on any of them; keyed by output file.
    shuf_futures = {}
    for i in range(nb_samples):
        out = unique_filename_in()
        shuf_futures[out] = motif_scan.nonblocking( ex, shuf_fasta, motif, background, -100, stdout=out, via=via )
    _ = future.wait()
    # Histogram of the rounded scores (third tab-separated column) on the real sequences.
    TP_scores = {}
    ntp = 0
    with open(output, 'r') as fin:
        for line in fin:
            row = line.split("\t")
            score = int(round(float(row[2])))
            if score in TP_scores:
                TP_scores[score] += 1
            else:
                TP_scores[score] = 1
            ntp += 1
    # Observed scores in decreasing order, framed by two sentinels:
    # (highest score + 1) at the front and -101 at the back.
    scores = sorted(TP_scores.keys(),reverse=True)
    scores = [scores[0]+1]+scores+[-101]
    FP_scores = dict((k,0) for k in scores)
    nfp = 0
    for file,fut in shuf_futures.iteritems():
        _ = fut.wait()
        with open(file, 'r') as fin:
            for line in fin:
                row = line.split("\t")
                fscore = int(round(float(row[2])))
                # Bin each shuffled score on the largest entry of `scores` not above it.
                # NOTE(review): max() raises ValueError if fscore < -101.
                tscore = max([k for k in scores if k<=fscore])
                FP_scores[tscore] += 1
                nfp += 1
    # The lowest sentinel holds the grand totals, the highest one has no true hit.
    TP_scores[scores[-1]] = ntp
    TP_scores[scores[0]] = 0
    FP_scores[scores[-1]] = nfp
    # Turn the histograms into cumulative counts from the highest score down:
    # scores[i] is the entry just before `sc` in `scores`.
    for i,sc in enumerate(scores[1:-1]):
        TP_scores[sc] = TP_scores[scores[i]]+TP_scores[sc]
        FP_scores[sc] = FP_scores[scores[i]]+FP_scores[sc]
    cur_fdr = 1.0
    threshold = scores[0]
    # Walk the scores in increasing order keeping a running minimum of FP/TP;
    # stop at the first score where that ratio is <= alpha.
    for k in sorted(FP_scores.keys()):
        if TP_scores[k] > 0 and FP_scores[k]/float(TP_scores[k]) < cur_fdr:
            cur_fdr = FP_scores[k]/float(TP_scores[k])
            if cur_fdr <= alpha:
                threshold = k
                break
        # From here on FP_scores[k] holds the running FDR instead of a count.
        FP_scores[k] = cur_fdr
    if alpha < 0: return FP_scores
    return threshold
Exemplo n.º 6
0
def wellington(bed, bam, output=None, options=[]):
    """
    Binds the ``wellington_footprints.py`` program: `<http://pythonhosted.org/pyDNase/scripts.html#wellington-footprints-py>`_.

    Creates a fresh output directory, then returns the command line
    ('arguments') together with ``(outdir, output)`` ('return_value').
    """
    output = unique_filename_in() if output is None else output
    outdir = unique_filename_in()
    os.mkdir(outdir)
    program = ["wellington_footprints.py", "-o", output]
    positional = [bed, bam, outdir]
    result = {}
    result['arguments'] = program + options + positional
    result['return_value'] = (outdir, output)
    return result
Exemplo n.º 7
0
def getRestEnzymeOccAndSeq(fasta_file, prim_site, sec_site, l_seg, l_type='typeI'):
    """
    Build the command line creating the segments and fragments files of a new
    library from the genome sequence.

    The script called is ``getRestEnzymeOccAndSeq.pl`` when *l_type* is
    'typeI' and ``getRestEnzymeOccAndSeq_typeII.pl`` otherwise. Returns a
    dictionary with 'arguments' (the command) and 'return_value'
    (``[segFile, fragFile, logFile]``, three newly generated file names).
    """
    segFile = unique_filename_in()
    fragFile = unique_filename_in()
    logFile = unique_filename_in()
    if l_type == 'typeI':
        progname = "getRestEnzymeOccAndSeq.pl"
    else:
        progname = "getRestEnzymeOccAndSeq_typeII.pl"
    command = [progname,
               "-i", fasta_file,
               "-m", prim_site,
               "-s", sec_site,
               "-l", l_seg,
               "-o", segFile,
               "-f", fragFile,
               "-x", logFile]
    return {'arguments': command, 'return_value': [ segFile, fragFile, logFile ]}
Exemplo n.º 8
0
        def soapsplice(unmapped_R1, unmapped_R2, index, output=None, path_to_soapsplice=None, options=None):
            """Bind 'soapsplice'. Return a text file containing the list of junctions.

            :param unmapped_R1: (str) path to the fastq file containing the 'left' reads.
            :param unmapped_R2: (str) path to the fastq file containing the 'right' reads.
            :param index: (str) path to the SOAPsplice index.
            :param output: (str) output file name. A unique name is generated if empty or None.
            :param path_to_soapsplice: (str) path to the SOAPsplice executable.
                If not specified, the program must be in your $PATH.
            :param options: (dict) SOAPsplice options, given as {opt: value}.
                Keys and values are converted with `str` and appended to the command.
            :rtype: dict with keys 'arguments' (command line) and 'return_value' (output name).

            Main options::

            -p: number of threads, <= 20. [1]
            -S: 1: forward strand, 2: reverse strand, 3: both. [3]
            -m: maximum mismatch for one-segment alignment, <= 5. [3]
            -g: maximum indel for one-segment alignment, <= 2. [2]
            -i: length of tail that can be ignored in one-segment alignment. [7]
            -t: longest gap between two segments in two-segment alignment. [500000]
            -a: shortest length of a segment in two-segment alignment. [8]
            -q: input quality type in FASTQ file (0: old Illumina, 1: Sanger). [0]
            -L: maximum distance between paired-end reads. [500000]
            -l: minimum distance between paired-end reads. [50]
            -I: insert length of paired-end reads.
            """
            # None instead of a shared mutable default dict.
            if options is None: options = {}
            if not output: output = unique_filename_in()
            path_to_soapsplice = path_to_soapsplice or 'soapsplice'
            args = [path_to_soapsplice,'-d',index,'-1',unmapped_R1,'-2',unmapped_R2,'-o',output,'-f','2']
            opts = []
            # `items()` exists on both Python 2 and Python 3 dictionaries.
            for k,v in options.items():
                opts.extend([str(k),str(v)])
            return {"arguments": args+opts, "return_value": output}
Exemplo n.º 9
0
def create_tracks(ex, outall, sample_names, assembly):
    """
    Write one gzipped BED track per sample showing the SNPs found in it.

    *outall* is read as a text track with columns chromosome, position,
    reference, one column per sample, then gene, location_type and distance
    (the last three are not read). For each row and each sample, when the
    first character of the sample's column differs from the reference, a
    feature named "<ref>><allele>" spanning (position-1, position) is
    appended to that sample's track. Each track is finally added to *ex*.
    """
    infields = ['chromosome','position','reference']+sample_names+['gene','location_type','distance']
    intrack = track(outall, format='text', fields=infields, chrmeta=assembly.chrmeta,
                    intypes={'position':int})
    instream = intrack.read(fields=infields[:-3])
    outtracks = {}
    for sample_name in sample_names:
        bed_name = unique_filename_in()+'.bed.gz'
        bed_track = track(bed_name,fields=['name'])
        bed_track.make_header(name=sample_name+"_SNPs")
        outtracks[sample_name] = (bed_track,bed_name)

    def _substitution(row,ref,idx):
        # First character of the sample column; None when it equals the reference.
        allele = row[3+idx][0]
        if allele == ref:
            return None
        return "%s>%s"%(ref,allele)

    for row in instream:
        position = row[1]
        ref = row[2]
        coord = (row[0],position-1,position)
        # All annotations of the row are computed before anything is written.
        snp = {}
        for idx,sname in enumerate(sample_names):
            snp[sname] = _substitution(row,ref,idx)
        for sname, (bed_track,_bed_name) in outtracks.iteritems():
            annot = snp[sname]
            if annot:
                bed_track.write([coord+(annot,)],mode='append')
    for sname, (bed_track,bed_name) in outtracks.iteritems():
        bed_track.close()
        description = set_file_descr(sname+"_SNPs.bed.gz",type='bed',step='tracks',gdv='1',ucsc='1')
        ex.add(bed_name, description=description)
Exemplo n.º 10
0
 def run_DE(data_file):
     """Build the command running limma.R on *data_file*.

     Returns a dictionary with the command line under 'arguments' and the
     newly generated output file name under 'return_value'.
     """
     output_file = unique_filename_in()
     command = ["limma.R", data_file]
     command += ["-s", "$'\t'"]
     command += ["-o", output_file]
     return {'arguments': command, 'return_value': output_file}
Exemplo n.º 11
0
def run_wellington(ex, tests, names, assembly, via, logfile):
    """
    Run `wellington` on each (bed, bam) test, one job per chromosome.

    Each element of *tests* is indexed as ``(bed, bam[, flank])``. For every
    chromosome of *assembly*, the bed regions (extended by *flank* on both
    sides when given) are fused and written to a temporary bed file; a job is
    submitted only if that file is not empty. Results are collected per test
    name (``names['tests'][i]``) and passed to `save_wellington`, whose
    result is returned.
    """
    logfile.write("Running Wellington:\n")
    logfile.flush()
    pending = {}
    wellout = {}
    for nbam, bed_bam in enumerate(tests):
        name = names['tests'][nbam]
        wellout[name] = []
        tbed = track(bed_bam[0])
        with_flank = len(bed_bam) > 2
        for chrom in assembly.chrnames:
            _chrombed = unique_filename_in()
            with track(_chrombed, format="bed", fields=tbed.fields) as _tt:
                if with_flank:
                    regions = neighborhood(tbed.read(chrom),
                                           before_start=bed_bam[2],
                                           after_end=bed_bam[2])
                else:
                    regions = tbed.read(chrom)
                _tt.write(fusion(regions), clip=True)
            # Nothing to scan on this chromosome.
            if os.path.getsize(_chrombed) <= 0:
                continue
            pending[(chrom, name)] = wellington.nonblocking(
                ex, _chrombed, bed_bam[1], via=via, memory=8)
    for (chrom, name), _fut in pending.iteritems():
        logfile.write(name[1] + " " + chrom + ", ")
        logfile.flush()
        wellout[name].append(_fut.wait())
    logfile.write("\n")
    logfile.flush()
    return save_wellington(ex, wellout, assembly.chrmeta)
Exemplo n.º 12
0
def run_wellington( ex, tests, names, assembly, via, logfile ):
    """
    Submit one `wellington` job per test and per chromosome, wait for all of
    them and hand the collected outputs to `save_wellington`.

    Elements of *tests* are indexed as ``(bed, bam[, flank])``; when a third
    element is present the regions are extended by it on both sides before
    being fused. Chromosomes producing an empty bed file are skipped.
    Returns the value of `save_wellington`.
    """
    def _say(text):
        logfile.write(text)
        logfile.flush()

    futures = {}
    _say("Running Wellington:\n")
    wellout = {}
    for nbam,bed_bam in enumerate(tests):
        name = names['tests'][nbam]
        wellout[name] = []
        tbed = track(bed_bam[0])
        for chrom in assembly.chrnames:
            _chrombed = unique_filename_in()
            with track(_chrombed,format="bed",fields=tbed.fields) as _tt:
                if len(bed_bam) > 2:
                    flank = bed_bam[2]
                    _neighb = neighborhood( tbed.read(chrom), before_start=flank, after_end=flank )
                else:
                    _neighb = tbed.read(chrom)
                _tt.write(fusion(_neighb),clip=True)
            if os.path.getsize(_chrombed) > 0:
                job = wellington.nonblocking(ex, _chrombed, bed_bam[1], via=via, memory=8)
                futures[(chrom,name)] = job
    for job_key, job in futures.iteritems():
        chrom = job_key[0]
        name = job_key[1]
        _say(name[1]+" "+chrom+", ")
        wellout[name].append(job.wait())
    _say("\n")
    bedlist = save_wellington(ex, wellout, assembly.chrmeta)
    return bedlist
Exemplo n.º 13
0
def _begin(output, format, new, ratio=1.375, **kwargs):
    """Initializes the plot in *R*.

    When *new* is true, a 'pdf' or 'png' graphics device is opened on
    *output* (a unique file name is generated if *output* is None) and the
    graphical parameters are set, including ``mfrow`` when a two-element
    'mfrow' keyword is given; any other *format* raises ValueError.

    In all cases, returns ``(opts, output)`` where *opts* is a string of
    extra R plot arguments built from the 'log', 'xlim', 'ylim', 'main',
    'xlab' and 'ylab' keywords (the last three default to empty strings).
    """
    if new:
        if output is None:
            output = unique_filename_in()
        if format == 'pdf':
            device = 'pdf("%s",paper="a4",height=8*%f,width=8)'
        elif format == 'png':
            device = 'png("%s",height=800*%f,width=800,type="cairo")'
        else:
            raise ValueError("Format not supported: %s" % format)
        robjects.r(device % (output, ratio))
        pars = "lwd=2,cex=1.1,cex.main=1.5,cex.lab=1.3,cex.axis=1.1,mar=c(4,4,1,1),las=1,pch=20"
        mfrow = kwargs.get('mfrow', [])
        if len(mfrow) == 2:
            pars += ",mfrow=c(%i,%i)" % tuple(kwargs['mfrow'])
        robjects.r('par(%s)' % pars)
    pieces = []
    if 'log' in kwargs:
        pieces.append(',log="%s"' % kwargs['log'])
    # Axis limits are optional...
    for key, template in (('xlim', ',xlim=c(%f,%f)'), ('ylim', ',ylim=c(%f,%f)')):
        if key in kwargs:
            pieces.append(template % tuple(kwargs[key]))
    # ...whereas title and axis labels are always emitted.
    for key, template in (('main', ',main="%s"'),
                          ('xlab', ',xlab="%s"'),
                          ('ylab', ',ylab="%s"')):
        pieces.append(template % kwargs.get(key, ''))
    return ''.join(pieces), output
Exemplo n.º 14
0
def plot_footprint_profile( ex, bedlist, signals, chrnames, groups, logfile ):
    """
    Draw the average signal profiles around motif sites, group by group.

    The bed track of each group is read per chromosome and its regions are
    grouped by (motif name, motif length), both taken from the
    "name:sequence" bed name field. The signals of ``signals[gid]`` are summed
    per bin over all regions, then divided by the number of regions. For each
    motif one page is added to the group's pdf (through `lineplot`) and one
    tab-delimited matrix file is written.

    Returns ``{gid: {'pdf': filename, 'mat': [(motif name, matrix file), ...]}}``.
    *ex* and *groups* are not used here.
    """
    def _dump_matrix(labels, snames, dat):
        # One header line of bin labels, then one line per signal.
        fname = unique_filename_in()
        with open(fname,"w") as dff:
            dff.write("\t".join([""]+[str(x) for x in labels])+"\n")
            for n,sn in enumerate(snames):
                dff.write("\t".join([sn]+[str(x) for x in dat[:, n]])+"\n")
        return fname

    files = dict((gid,{'pdf':"",'mat':[]}) for gid in bedlist.keys())
    logfile.write("Plotting footprints:\n");logfile.flush()
    for gid, motifbed in bedlist.iteritems():
        snames = [sig.name for sig in signals[gid]]
        tmotif = track(motifbed,format='bed')
        sums = {}
        nregions = {}
        for chrom in chrnames:
            grouped = {}
            for r in tmotif.read(chrom):
                r2 = r[3].split(":")
                key = (r2[0],len(r2[1]))
                if key not in grouped:
                    grouped[key] = [r[1:3]]
                else:
                    grouped[key].append(r[1:3])
            for motif, regs in grouped.iteritems():
                if motif not in sums:
                    sums[motif] = zeros(shape=(motif[1]+2*_plot_flank[1], len(signals[gid])))
                    nregions[motif] = 0
                nregions[motif] += len(regs)
                tFeat = sorted_stream(segment_features(FeatureStream(regs,fields=['start','end']),
                                                       nbins=motif[1],upstream=_plot_flank,downstream=_plot_flank))
                scored = score_by_feature([s.read(chrom) for s in signals[gid]], tFeat)
                for t in scored:
                    sums[motif][t[2]] += t[3:]
        files[gid]['pdf'] = unique_filename_in()
        left = len(sums)
        for npage, (motif, dat) in enumerate(sums.iteritems()):
            left -= 1
            mname, nbins = motif
            # In-place: the accumulated sums become averages.
            dat /= float(nregions[motif])
            X = range(-_plot_flank[1],_plot_flank[1]+nbins)
            # Positions inside the motif are labelled "1".."nbins".
            for k in range(nbins):
                X[k+_plot_flank[1]] = str(k+1)
            lineplot(X, [dat[:, n] for n in range(dat.shape[-1])], mfrow=[4,2],
                     output=files[gid]['pdf'], new=(npage == 0), last=(left==0),
                     legend=snames, main=mname)
            _datf = _dump_matrix(X, snames, dat)
            files[gid]['mat'].append((mname,_datf))
    return files
Exemplo n.º 15
0
def removeNA( fileToClean ):
    """
    Remove the lines having "NA" in their 4th column (mainly used with bedgraph).

    Lines starting with 'track' are always kept. Lines with fewer than four
    tab-separated fields (e.g. blank lines) cannot hold "NA" in the 4th
    column and are copied unchanged instead of raising IndexError.

    :param fileToClean: path to the tab-delimited input file.
    :return: name of the newly written, filtered file.
    """
    fileNoNA = unique_filename_in()
    # Both handles are managed by `with`, so the output is closed on errors too.
    with open(fileNoNA, 'w') as resfile:
        with open( fileToClean ) as f:
            for s in f:
                if s[0:5] == 'track':
                    resfile.write(s)
                    continue
                fields = s.strip().split('\t')
                if len(fields) < 4 or fields[3] != "NA":
                    resfile.write(s)
    return fileNoNA
Exemplo n.º 16
0
def run_microbiome(options=[], output=None):
    """
    Build the command line for ``run_microbiome.py``.

    Each element of *options* becomes one argument: lists and tuples are
    joined with commas, anything else is converted with `str`. The output
    name (generated if *output* is None) is appended last. Returns a
    dictionary with 'arguments' and 'return_value' (the output name).
    """
    if output is None:
        output = unique_filename_in()

    def _as_argument(opt):
        if isinstance(opt, (list, tuple)):
            return ",".join(map(str, opt))
        return str(opt)

    arguments = ["run_microbiome.py"]
    arguments.extend(_as_argument(o) for o in options)
    arguments.append(output)
    return {'arguments': arguments, 'return_value': output}
Exemplo n.º 17
0
def coverageInRepeats(ex, infile, genomeName='mm9', repeatsPath=GlobalRepbasePath,
                      outdir=None, via='lsf'):
    """
    Completes the segment info bed file with the coverage in repeats of each segment.

    *infile* is either a dictionary ``{chromosome: files}`` or a single
    entry (stored under the key ""); the first element of each entry is the
    bed file used. The repeats file is looked up as
    ``<repeatsPath>/<genomeName>/<genomeName>_rmsk.bed``. If it does not
    exist, a message is printed and the inputs are returned unmodified
    (concatenated, or copied to *outdir*). Otherwise `coverageBed` is run
    on each input and the last four '|'-separated items of every feature name
    are replaced by the four coverage columns.

    Returns the name of a single bed file when *outdir* is None, else
    *outdir*, which then holds one "<chromosome>.bed" file per input.
    """
    if not isinstance(infile,dict):
        infile = {"":infile}
    single_file = outdir is None
    if single_file:
        resfile = unique_filename_in()+".bed"
        outf = open(resfile,'w')
    repeatsFile = os.path.join(repeatsPath, genomeName, genomeName+'_rmsk.bed')
    if not os.path.exists(repeatsFile):
        print("coverage in repeats not calculated as file "+repeatsFile+" does not exist.")
        if single_file:
            outf.close()
            cat([inf[0] for inf in infile.values()],out=resfile)
            return resfile
        for chrom,inf in infile.iteritems():
            shutil.copy(inf[0], os.path.join(outdir,chrom+".bed"))
        return outdir
    # Submit every job first, then collect the results.
    jobs = {}
    for chrom,inf in infile.iteritems():
        tmpfile = unique_filename_in()
        jobs[chrom] = (tmpfile,coverageBed.nonblocking(ex,repeatsFile,inf[0],via=via,stdout=tmpfile))
    cover_fields = ['chr','start','end','name','c1','c2','c3','c4']
    for chrom,(covfile,job) in jobs.iteritems():
        if not single_file:
            resfile = os.path.join(outdir,chrom+".bed")
            outf = open(resfile,'w')
        job.wait()
        coverout = track(covfile,format='text',fields=cover_fields)
        for s in sorted_stream(coverout.read(),[chrom]):
            name_items = s[3].split('|')
            kept = name_items[0:(len(name_items)-4)]
            infos = '|'.join(kept+list(s[4:8]))
            outf.write('\t'.join([str(x) for x in s[0:3]+(infos,)])+'\n')
        if not single_file:
            outf.close()
    if single_file:
        outf.close()
        return resfile
    return outdir
Exemplo n.º 18
0
def camelPeaks( scores_fwd, scores_rev, peaks, chromosome_name, chromosome_length,
                read_extension, script_path ):
    """Builds the command running the 'camelPeaks.py' wrapper script on the
    'scores_fwd', 'scores_rev' and 'peaks' inputs, with parameters
    'chromosome_name' (name of chromosome to process), 'chromosome_length'
    and 'read_extension', using functions from 'script_path'/deconv_fcts.R.

    A unique output name is generated and passed with '-o'; the
    'return_value' of the returned dictionary is None.
    """
    output = unique_filename_in()
    command = ["camelPeaks.py",
               "-p",peaks,
               "-f",scores_fwd,
               "-r",scores_rev,
               "-o",output,
               "-c",chromosome_name,
               "-l",str(chromosome_length),
               "-e",str(read_extension),
               "-z",script_path,
               "-s","1500"]
    return {'arguments': command, 'return_value': None}
Exemplo n.º 19
0
def removeNA(fileToClean):
    """
    Remove the lines having "NA" in their 4th column (mainly used with bedgraph).

    Lines starting with 'track' are always kept. Lines with fewer than four
    tab-separated fields (e.g. blank lines) cannot hold "NA" in the 4th
    column and are copied unchanged instead of raising IndexError.

    :param fileToClean: path to the tab-delimited input file.
    :return: name of the newly written, filtered file.
    """
    fileNoNA = unique_filename_in()
    # Context managers guarantee that both files are closed, even on error.
    with open(fileToClean) as f, open(fileNoNA, 'w') as resfile:
        for s in f:
            if s[0:5] == 'track':
                resfile.write(s)
                continue
            fields = s.strip().split('\t')
            if len(fields) < 4 or fields[3] != "NA":
                resfile.write(s)
    return fileNoNA
Exemplo n.º 20
0
def gtf_from_bam_header(bam):
    """In case of alignment on a custom sequence.

    Writes a GTF file with one 'exon' line per sequence found in the
    ``chrmeta`` of the bam track, spanning 1 to the sequence length. The
    sequence name is used as exon, transcript and gene id; the gene name is
    the second '|'-separated item of the name if it contains '|', else the
    name itself. Returns the GTF file name.
    """
    bamtrack = track(bam,format='bam')
    gtf = unique_filename_in()+'.gtf'
    attributes = 'exon_id "%s"; transcript_id "%s"; gene_id "%s"; gene_name "%s"'
    with open(gtf,"wb") as g:
        for c,meta in bamtrack.chrmeta.iteritems():
            if "|" in c:
                n = c.split("|")[1]
            else:
                n = c
            columns = [c,'','exon','1',str(meta['length']),'.','.','.',
                       attributes % (c,c,c,n)]
            g.write('\t'.join(columns)+'\n')
    bamtrack.close()
    return gtf
Exemplo n.º 21
0
def motif_scan(ex, bedlist, assembly, groups, via, logfile):
    """
    Scan the bed file of each group for the group's motifs.

    A motif entry of ``groups[gid]['motif']`` is used as a file if it exists
    as given or relative to the parent of ``ex.remote_working_directory``;
    otherwise it is split as "<id> <name>" and fetched with
    ``_gnrp.get_motif_PWM``. Returns ``{gid: save_motif_profile(...) result}``
    and logs the motif names to *logfile*.
    """
    logfile.write("Scanning motifs\n")
    logfile.flush()
    supdir = os.path.split(ex.remote_working_directory)[0]
    motifbeds = {}
    for gid, bedfile in bedlist.iteritems():
        logfile.write("\n%i: " % gid)
        logfile.flush()
        group = groups[gid]
        motifs = {}
        for mot in group.get('motif', []):
            # First existing location wins: as given, then under `supdir`.
            candidates = (mot, os.path.join(supdir, mot))
            location = next((c for c in candidates if os.path.exists(c)), None)
            if location is not None:
                mname = os.path.basename(os.path.splitext(mot)[0])
                motifs[mname] = location
            else:
                _gnid, mname = mot.split(' ')
                pwm_out = unique_filename_in()
                motifs[mname] = _gnrp.get_motif_PWM(int(_gnid), mname, output=pwm_out)
            logfile.write(mname + ", ")
            logfile.flush()
        _descr = set_file_descr(group['name'] + '_motifs.bed',
                                type='bed', ucsc='1', step='motifs', groupId=gid)
        _out = unique_filename_in()
        _hd = "track name='%s_motifs'" % group['name']
        profile_options = dict(keep_max_only=True, output=_out, header=_hd,
                               description=_descr, via=via)
        motifbeds[gid] = save_motif_profile(ex, motifs, assembly, bedfile,
                                            **profile_options)
    return motifbeds
Exemplo n.º 22
0
def parse_meme_xml( ex, meme_file, chrmeta ):
    """
    Parse meme xml file and convert to track.

    For each motif of the xml file, a matrix file is written with one line
    per motif column ("1", then the A, C, G and T probabilities). The scanned
    sites are written to an sql track with fields start, end, name (motif
    id), score (the site's 'pvalue' attribute) and strand.

    Sequence names are expected to match ``<name>|<chr>:<start>-<end>``;
    site coordinates are computed as sequence start + site position - 1.

    :return: ``{'sql': track file name, 'matrices': {motif id: matrix file}}``.
    """
    from xml.etree import ElementTree as ET
    touch(ex,meme_file)
    tree = ET.parse(meme_file)
    ncol = {}
    allmatrices = {}
    for motif in tree.find('motifs').findall('motif'):
        mid = motif.attrib['id']
        ncol[mid] = 0
        allmatrices[mid] = unique_filename_in()
        with open(allmatrices[mid],'w') as mat_out:
            for parray in motif.find('probabilities')[0].findall('alphabet_array'):
                ncol[mid] += 1
                m = {'letter_A':0,'letter_C':0,'letter_G':0,'letter_T':0}
                for col in parray:
                    m[col.attrib['letter_id']] = float(col.text)
                mat_out.write("1\t%f\t%f\t%f\t%f\n" %(m['letter_A'],m['letter_C'],m['letter_G'],m['letter_T']))
    def _xmltree(_t):
        seq_name = {}
        seq_chr = None
        seq_start = None
        # `iter()` replaces the deprecated `getiterator()`.
        for it in _t.iter():
            if it.tag == 'sequence':
                seq_name[it.attrib['id']] = it.attrib['name']
            elif it.tag == 'scanned_sites':
                name = seq_name[it.attrib['sequence_id']]
                _name,seq_chr,seq_start,_end = re.search(r'(.*)\|(.+):(\d+)-(\d+)',name).groups()
                seq_start = int(seq_start)
            elif it.tag == 'scanned_site':
                # The sequence start is kept in its own variable: every site of
                # the sequence is positioned from it, not from the previous site.
                start = seq_start+int(it.attrib['position'])-1
                end = start+ncol[it.attrib['motif_id']]
                strnd = 1 if it.attrib['strand'] == 'plus' else -1
                score = it.attrib['pvalue']
                yield (seq_chr,str(start),str(end),it.attrib['motif_id'],score,strnd)
    outsql = unique_filename_in()+".sql"
    outtrack = track(outsql, chrmeta=chrmeta, info={'datatype':'qualitative'},
                     fields=['start','end','name','score','strand'])
    outtrack.write(FeatureStream(_xmltree(tree),fields=['chr']+outtrack.fields))
    outtrack.close()
    return {'sql':outsql,'matrices':allmatrices}
Exemplo n.º 23
0
def transcriptome_gtf_from_genrep(assembly):
    """In case of mapping on the transcriptome - if it still ever happens.

    Writes one GTF 'exon' line per transcript of
    ``assembly.get_transcript_mapping()``: the transcript id is used as
    sequence name, the feature spans 1 to the transcript length on the '+'
    strand, and the attributes hold the gene id, gene name and genomic locus.
    Returns the name of the GTF file.
    """
    tmap = assembly.get_transcript_mapping()
    gtf = unique_filename_in()
    attributes = 'gene_id "%s"; gene_name "%s"; gene_locus "%s:%i-%i"'
    with open(gtf, "wb") as g:
        for _tid, t in tmap.iteritems():
            gtfline = [t.id, 'Ensembl', 'exon', 1, t.length, '.', '+', '.',
                       attributes % (t.gene_id, t.gene_name, t.chrom, t.start, t.end)]
            g.write('\t'.join([str(x) for x in gtfline]) + '\n')
    return gtf
Exemplo n.º 24
0
def transcriptome_gtf_from_genrep(assembly):
    """In case of mapping on the transcriptome - if it still ever happens.

    Writes a GTF file with one 'exon' line per transcript returned by
    ``assembly.get_transcript_mapping()``. Each line is named after the
    transcript id, spans 1 to the transcript length on the '+' strand and
    carries the gene id, gene name and genomic locus as attributes.
    Returns the GTF file name.
    """
    tmap = assembly.get_transcript_mapping()
    gtf = unique_filename_in()
    with open(gtf,"wb") as g:
        for _tid,t in tmap.iteritems():
            locus = 'gene_id "%s"; gene_name "%s"; gene_locus "%s:%i-%i"' \
                    % (t.gene_id,t.gene_name,t.chrom,t.start,t.end)
            gtfline = [t.id,'Ensembl','exon',1,t.length,'.','+','.',locus]
            g.write('\t'.join([str(x) for x in gtfline])+'\n')
    return gtf
Exemplo n.º 25
0
 def differential_analysis(counts_file, feature_type):
     """Run ``DE.differential_analysis`` on *counts_file* and register its results.

     For each file returned, the first line is removed and used (stripped) in
     the name "<feature_type>_differential_<first line>.txt" under which the
     remaining lines are added to `ex`. Nothing is done if the analysis
     returns None. `ex` and `DE` come from the enclosing scope.
     """
     diff_files = DE.differential_analysis(counts_file)
     if diff_files is None:
         return
     for diff in diff_files:
         diff_nohead = unique_filename_in()
         with open(diff) as f:
             # The header line is consumed here, the rest is copied verbatim.
             head = f.readline().strip()
             with open(diff_nohead, "wb") as g:
                 g.writelines(f)
         oname = "".join([feature_type, "_differential_", head, ".txt"])
         desc = set_file_descr(oname, step='stats', type='txt', ucsc=0)
         ex.add(diff_nohead, description=desc)
Exemplo n.º 26
0
def combine_counts(counts,
                   idsColsKey,
                   idsColsCounts,
                   output="combined_counts.txt"):
    """
    Merge several tab-delimited count files into a single table.

    Every file starts with a header line. Rows are identified by the columns
    *idsColsKey*; the columns *idsColsCounts* of each file are placed side by
    side, in the order of *counts*. The remaining ("info") columns are taken
    from the first file only. '[' and ']' characters are removed from all
    lines. A key missing from a file gets an empty count for that file; a key
    missing from the first file gets empty info columns.

    :param counts: list of input file names.
    :param idsColsKey: column index, or list/tuple of indices, forming the key.
    :param idsColsCounts: column index, or list/tuple of indices, of the counts.
    :param output: output file name; a unique name is generated if None or ''.
    :return: the output file name.
    :raises ValueError: if *counts* is empty.
    """
    if not counts:
        raise ValueError("combine_counts: no input file given")
    if output in [None, '']: output = unique_filename_in()
    if not isinstance(idsColsKey, (list, tuple)): idsColsKey = [idsColsKey]
    if not isinstance(idsColsCounts, (list, tuple)):
        idsColsCounts = [idsColsCounts]
    # Built once; also works when one spec is a list and the other a tuple.
    non_info_cols = set(idsColsKey) | set(idsColsCounts)

    def _fields(line):
        return line.strip('\n').replace("[", "").replace("]", "").split("\t")

    def _pick(fields, cols):
        return '\t'.join([fields[n] for n in cols])

    def _info(fields):
        return [ss for n, ss in enumerate(fields) if n not in non_info_cols]

    all_counts = {}
    infos = {}
    leninfos = 0
    for i, filename in enumerate(counts):
        with open(filename) as f:
            s = _fields(next(f))
            if i == 0:  # 1st file: initialization of the headers
                _colinfos = _info(s)
                leninfos = len(_colinfos)
                h_infos = '\t'.join(_colinfos)
                h_counts = _pick(s, idsColsCounts)
                h_key = _pick(s, idsColsKey)
            else:
                h_counts += '\t'.join([''] + [s[n] for n in idsColsCounts])
            for line in f:
                s = _fields(line)
                curKey = _pick(s, idsColsKey)
                if i == 0:  # 1st file: initialization of counts and infos
                    all_counts[curKey] = [''] * len(counts)
                    curInfo = _info(s)
                    if len(curInfo) < leninfos:
                        curInfo.extend([''] * (leninfos - len(curInfo)))
                    infos[curKey] = '\t'.join(curInfo)
                elif curKey not in all_counts:
                    # Key absent from the first file: start an empty row.
                    all_counts[curKey] = [''] * len(counts)
                all_counts[curKey][i] = _pick(s, idsColsCounts)

    with open(output, 'w') as out:
        out.write(h_key + '\t' + h_counts + '\t' + h_infos + '\n')
        for k, v in all_counts.items():
            out.write(k + '\t' + '\t'.join(str(c) for c in v) +
                      '\t' + infos.get(k, '') + '\n')

    return output
Exemplo n.º 27
0
 def differential_analysis(counts_file, feature_type):
     """Compute differential statistics for *counts_file* and add them to `ex`.

     Each file returned by ``DE.differential_analysis`` is copied without its
     first line; that line, stripped, becomes part of the description name
     "<feature_type>_differential_<first line>.txt". A None result is
     ignored. `ex` and `DE` are taken from the enclosing scope.
     """
     diff_files = DE.differential_analysis(counts_file)
     if diff_files is not None:
         for diff in diff_files:
             diff_nohead = unique_filename_in()
             # Drop the first line, keep it for the file description.
             with open(diff) as src:
                 head = src.readline().strip()
                 with open(diff_nohead, "wb") as dst:
                     for row in src:
                         dst.write(row)
             oname = "%s_differential_%s.txt" % (feature_type, head)
             ex.add(diff_nohead,
                    description=set_file_descr(oname, step='stats', type='txt', ucsc=0))
Exemplo n.º 28
0
def camelPeaks(scores_fwd, scores_rev, peaks, chromosome_name,
               chromosome_length, read_extension, script_path):
    """Build the command line for the 'camelPeaks.py' wrapper script.

    The script is given the 'scores_fwd', 'scores_rev' and 'peaks' inputs,
    the 'chromosome_name' and 'chromosome_length' of the chromosome to
    process, the 'read_extension', and 'script_path' (passed as '-z'; per the
    original documentation, where deconv_fcts.R is looked up). A fresh output
    name is passed as '-o' and '-s' is fixed to "1500".

    Returns a dict with the argument list under 'arguments' and
    'return_value' set to None.
    """
    output = unique_filename_in()
    flagged_values = (
        ("-p", peaks),
        ("-f", scores_fwd),
        ("-r", scores_rev),
        ("-o", output),
        ("-c", chromosome_name),
        ("-l", str(chromosome_length)),
        ("-e", str(read_extension)),
        ("-z", script_path),
        ("-s", "1500"),
    )
    command = ["camelPeaks.py"]
    for flag, value in flagged_values:
        command.append(flag)
        command.append(value)
    return {'arguments': command, 'return_value': None}
Exemplo n.º 29
0
def gtf_from_bam_header(bam):
    """Write a GTF describing every reference sequence of a BAM header.

    In case of alignment on a custom sequence: each entry of the BAM track's
    ``chrmeta`` becomes one 'exon' line spanning 1..length. The sequence name
    is used as exon, transcript and gene id; the gene name is the second
    '|'-separated field of the sequence name when it contains a '|', the
    whole name otherwise. Returns the name of the GTF file written.
    """
    bamtrack = track(bam, format='bam')
    gtf = unique_filename_in() + '.gtf'
    attributes = 'exon_id "%s"; transcript_id "%s"; gene_id "%s"; gene_name "%s"'
    with open(gtf, "wb") as out:
        for seqname, meta in bamtrack.chrmeta.iteritems():
            if "|" in seqname:
                gene_name = seqname.split("|")[1]
            else:
                gene_name = seqname
            columns = [
                seqname, '', 'exon', '1', str(meta['length']), '.', '.', '.',
                attributes % (seqname, seqname, seqname, gene_name),
            ]
            out.write('\t'.join(columns) + '\n')
    bamtrack.close()
    return gtf
Exemplo n.º 30
0
def save_wellington( ex, wellout, chrmeta ):
    """Collect Wellington outputs per group and register them in `ex`.

    :param ex: execution object; files are registered through ``ex.add``.
    :param wellout: dict mapping ``name`` to ``wlist``. ``name[0]`` is used as
        group id and ``name[1]`` as a label prefix; each element of ``wlist``
        is a (directory, file prefix) pair.
    :param chrmeta: only referenced by the commented-out bigWig conversion,
        i.e. currently unused.
    :returns: dict mapping ``name[0]`` to the gzipped BED file of footprints
        at FDR 0.01.

    For every entry an empty placeholder file is registered, and three files
    are associated to it: the concatenated FDR 0.01 footprints (bed.gz), the
    concatenated footprints for every p-value cutoff (bed.gz, one track line
    per cutoff) and the concatenated wig files.
    """
    bedlist = {}
    for name, wlist in wellout.iteritems():
        wellall = unique_filename_in()
        #### Dummy file: empty placeholder the real outputs are associated to
        touch( ex, wellall )
        ex.add(wellall,
               description=set_file_descr(name[1]+'_wellington_files', type='none', view='admin',
                                          step='footprints', groupId=name[0]))
        #### BED at FDR 1%: one track line, then every per-run bed appended
        bedlist[name[0]] = wellall+"FDR01.bed.gz"
        bedzip = gzip.open(bedlist[name[0]],'wb')
        bedzip.write("track name='"+name[1]+"_WellingtonFootprints_FDR_0.01'\n")
        for x in wlist:
            with open(os.path.join(*x)+".WellingtonFootprints.FDR.0.01.bed") as _bed:
                [bedzip.write(l) for l in _bed]
        bedzip.close()
        ex.add(wellall+"FDR01.bed.gz",
               description=set_file_descr(name[1]+'_WellingtonFootprintsFDR01.bed.gz',
                                          type='bed', ucsc='1', step='footprints', groupId=name[0]),
               associate_to_filename=wellall, template='%s_WellingtonFootprintsFDR01.bed.gz')
        #### BED at p-values [...]
        # The list of cutoffs is taken from the file names found in the
        # "p_value_cutoffs" directory of the FIRST element of wlist only.
        bedzip = gzip.open(wellall+"PvalCutoffs.bed.gz",'wb')
        for bfile in os.listdir(os.path.join(wlist[0][0],"p_value_cutoffs")):
            cut = os.path.splitext(bfile[:-4])[1][1:] #between . ([1:]) and .bed ([:-4])
            bedzip.write("track name='"+name[1]+"_WellingtonFootprints_Pval_%s'\n" %cut)
            for wdir,wpref in wlist:
                _bedpath = os.path.join(wdir,"p_value_cutoffs",wpref+".WellingtonFootprints."+cut+".bed")
                with open(_bedpath) as _bed:
                    [bedzip.write(l) for l in _bed]
        bedzip.close()
        ex.add(wellall+"PvalCutoffs.bed.gz",
               description=set_file_descr(name[1]+'_WellingtonFootprintsPvalCutoffs.bed.gz',
                                          type='bed', ucsc='1', step='footprints', groupId=name[0]),
               associate_to_filename=wellall, template='%s_WellingtonFootprintsPvalCutoffs.bed.gz')
        #### WIG: concatenation of the per-run wig files
        cat([os.path.join(*x)+".WellingtonFootprints.wig" for x in wlist], wellall+".wig")
        #convert(wellall+".wig", wellall+".bw", chrmeta=chrmeta)
        #ex.add(wellall+".bw",
        #       description=set_file_descr(name[1]+'_WellingtonFootprints.bw',
        #                                  type='bigWig', ucsc='1', step='footprints', groupId=name[0]),
        #       associate_to_filename=wellall, template='%s_WellingtonFootprints.bw')
        ex.add(wellall+".wig",
               description=set_file_descr(name[1]+'_WellingtonFootprints.wig',
                                          type='wig', ucsc='1', step='footprints', groupId=name[0]),
               associate_to_filename=wellall, template='%s_WellingtonFootprints.wig')
    return bedlist
Exemplo n.º 31
0
def macs( read_length, genome_size, bamfile, ctrlbam=None, args=None ):
    """Binding for the ``macs`` peak caller `<http://liulab.dfci.harvard.edu/MACS/>`_.

    Builds a ``macs14`` command line for 'bamfile' (and the optional control
    'ctrlbam') with the given 'genome_size'. Extra options are taken from
    'args' when it is a list. Defaults are appended only for options not
    already present: '-n' (a fresh unique name), '-s' ('read_length', only if
    positive), '--verbose 1' and '--keep-dup all'.

    Returns the file prefix ('-n' option of ``macs``) as 'return_value'.
    """
    command = ["macs14", "-t", bamfile, "-f", "BAM", "-g", str(genome_size)]
    if isinstance(args, list):
        command.extend(args)
    if ctrlbam is not None:
        command.extend(["-c", ctrlbam])
    try:
        # Re-use the output prefix supplied by the caller, if any
        prefix = command[command.index("-n") + 1]
    except ValueError:
        prefix = unique_filename_in()
        command.extend(["-n", prefix])
    if read_length > 0 and "-s" not in command:
        command.extend(["-s", str(read_length)])
    for flag, default in (("--verbose", "1"), ("--keep-dup", "all")):
        if flag not in command:
            command.extend([flag, default])
    return {"arguments": command, "return_value": prefix}
Exemplo n.º 32
0
def macs(read_length, genome_size, bamfile, ctrlbam=None, args=None):
    """Binding for the ``macs`` peak caller `<http://liulab.dfci.harvard.edu/MACS/>`_.

    Takes one (optionally two) bam file(s) and the 'read_length' and
    'genome_size' parameters passed to ``macs``; 'args' (a list) may carry
    additional options. '-n', '-s', '--verbose' and '--keep-dup' get default
    values only when they are not already on the command line.

    Returns the file prefix ('-n' option of ``macs``).
    """
    extra = args if isinstance(args, list) else []
    control = [] if ctrlbam is None else ["-c", ctrlbam]
    cmd = ["macs14", "-t", bamfile, "-f", "BAM", "-g", str(genome_size)]
    cmd = cmd + extra + control
    if "-n" in cmd:
        name = cmd[cmd.index("-n") + 1]
    else:
        name = unique_filename_in()
        cmd = cmd + ["-n", name]
    if read_length > 0 and "-s" not in cmd:
        cmd = cmd + ["-s", str(read_length)]
    if "--verbose" not in cmd:
        cmd = cmd + ["--verbose", "1"]
    if "--keep-dup" not in cmd:
        cmd = cmd + ["--keep-dup", "all"]
    return {"arguments": cmd, "return_value": name}
Exemplo n.º 33
0
def createLibrary(ex, assembly_or_fasta, params, url=GlobalHtsUrl, via='local'):
    """
    Main call to create the library

    :param ex: execution object passed on to every helper.
    :param assembly_or_fasta: a ``genrep.Assembly``, or a value passed as
        ``fasta=`` to build one.
    :param params: dict read for the keys 'primary', 'secondary', 'length',
        'type' and 'species'.
    :param url: passed to ``getEnzymeSeqId``.
    :param via: passed to the per-chromosome jobs and to ``coverageInRepeats``.
    :returns: ``[libfiles, bedfiles, resfile, infos_lib]``, or
        ``[None,None,None,None]`` when ``params['primary']`` is shorter than
        2 characters.
    """
    if len(params['primary'])<2:
        print('Some parameters are missing, cannot create the library')
        print('primary='+params['primary']+" ; "+'secondary='+params['secondary'])
        return [None,None,None,None]

    if not isinstance(assembly_or_fasta,genrep.Assembly):
        assembly_or_fasta = genrep.Assembly( ex=ex, fasta=assembly_or_fasta )
    chrnames = assembly_or_fasta.chrnames
    # Maps the 'ac' entry of each chromosome's metadata to the chromosome name
    chrom_map = dict((v['ac'],k) for k,v in assembly_or_fasta.chrmeta.iteritems())
    allfiles = assembly_or_fasta.fasta_by_chrom  #assembly_or_fasta.untar_genome_fasta()

    # Launch one non-blocking restriction-site job per chromosome fasta file
    libfiles = dict((c, getRestEnzymeOccAndSeq.nonblocking( ex, f,
                                                            params['primary'], params['secondary'],
                                                            params['length'],  params['type'],
                                                            via=via ))
                    for c, f in allfiles.iteritems())
    resfile = unique_filename_in()
    os.mkdir(resfile)
    bedfiles = {}
    for chrom, future in libfiles.iteritems():
        # Replace each future by its result once the job has finished
        libfiles[chrom] = future.wait()
        # NOTE(review): an empty second output triggers a 60s pause followed
        # by a touch -- presumably to let a slow shared filesystem catch up;
        # confirm.
        if not os.path.getsize(libfiles[chrom][1])>0:
            time.sleep(60)
            touch(ex,libfiles[chrom][1])
        bedfiles[chrom] = parse_fragFile(libfiles[chrom][1],chrom_map)
    # The return value is not used; the per-chromosome '<chrom>.bed' files
    # read below are presumably written into `resfile` by this call
    # (outdir=resfile) -- TODO confirm.
    rescov = coverageInRepeats(ex, bedfiles, params['species'], outdir=resfile, via=via)
    bedchrom = [os.path.join(resfile,chrom+".bed") for chrom in chrnames]
    cat(bedchrom,out=resfile+".bed")
    gzipfile(ex,[resfile+".bed"]+bedchrom)
#    resfile_sql = resfile+".sql"
#    track.convert((resfile,'bed'),(resfile_sql,'sql'),assembly=params['species'])
    enz_list = []
    infos_lib = { 'assembly_name':  params['species'],
                  'enzyme1_id':     getEnzymeSeqId(params['primary'], True, enz_list, url),
                  'enzyme2_id':     getEnzymeSeqId(params['secondary'], True, enz_list, url),
                  'segment_length': params['length'],
                  'type':           params['type'],
                  'filename':       resfile }
    return [ libfiles, bedfiles, resfile, infos_lib ]
Exemplo n.º 34
0
    def convert_junc_file(self, filename):
        """Convert a .junc SOAPsplice output file to bed format. Return the file name.

        The input is read as a text track with fields chr, start, end, strand
        and score; chromosome names are translated with the assembly's
        chromosomes, and a 'name' field holding 'junction<N>' (N counting
        from 0) is added before writing.

        :param filename: (str) name of the .junc file to convert.
        """
        assembly = self.assembly
        junc_track = track(filename, format='txt',
                           fields=['chr','start','end','strand','score'],
                           chrmeta=assembly.chrmeta)
        # Translate chromosome names
        renamed = map_chromosomes(junc_track.read(), assembly.chromosomes)
        # Add junction IDs: copy 'strand' into a new 'name' field, then
        # overwrite every name with a running junction number
        named = duplicate(renamed,'strand','name')
        counter = itertools.count()
        numbered = apply(named,'name', lambda x: 'junction'+str(counter.next()))
        # Convert to bed format
        bed = unique_filename_in() + '.bed'
        bed_track = track(bed, fields=numbered.fields, chrmeta=assembly.chrmeta)
        bed_track.write(numbered)
        return bed
Exemplo n.º 35
0
        def soapsplice(unmapped_R1,
                       unmapped_R2,
                       index,
                       output=None,
                       path_to_soapsplice=None,
                       options=None):
            """Bind 'soapsplice'. Return a text file containing the list of junctions.

            :param unmapped_R1: (str) path to the fastq file containing the 'left' reads.
            :param unmapped_R2: (str) path to the fastq file containing the 'right' reads.
            :param index: (str) path to the SOAPsplice index.
            :param output: (str) output file name. A unique name is generated
                when it is empty or None.
            :param path_to_soapsplice: (str) path to the SOAPsplice executable.
                If not specified, the program must be in your $PATH.
            :param options: (dict) SOAPsplice options, given as {opt: value}.
                None (the default) means no extra option.
            :rtype: dict with the command line under "arguments" and the
                output name under "return_value".

            Main options::

            -p: number of threads, <= 20. [1]
            -S: 1: forward strand, 2: reverse strand, 3: both. [3]
            -m: maximum mismatch for one-segment alignment, <= 5. [3]
            -g: maximum indel for one-segment alignment, <= 2. [2]
            -i: length of tail that can be ignored in one-segment alignment. [7]
            -t: longest gap between two segments in two-segment alignment. [500000]
            -a: shortest length of a segment in two-segment alignment. [8]
            -q: input quality type in FASTQ file (0: old Illumina, 1: Sanger). [0]
            -L: maximum distance between paired-end reads. [500000]
            -l: minimum distance between paired-end reads. [50]
            -I: insert length of paired-end reads.
            """
            if not output: output = unique_filename_in()
            path_to_soapsplice = path_to_soapsplice or 'soapsplice'
            args = [
                path_to_soapsplice, '-d', index, '-1', unmapped_R1, '-2',
                unmapped_R2, '-o', output, '-f', '2'
            ]
            # `options` defaults to None rather than a shared mutable dict;
            # an explicit None from the caller is accepted as "no options".
            opts = []
            for k, v in (options or {}).items():
                opts.extend([str(k), str(v)])
            return {"arguments": args + opts, "return_value": output}
Exemplo n.º 36
0
def bam_to_annot_counts(bamfiles, annotations_file, pref_name='', output=None):
    '''
    Scan each bam file of a list and calculate the corrected counts for each annotation key
    present in the "annotations_file".

    Every read contributes 1/NH (NH tag of the read, 1 when absent or not
    numeric) to the reference it maps to; the reference name is truncated at
    the first "|". Reads with NH < 1 are ignored. The percentage is relative
    to the total over ALL counted reads, including those whose reference is
    not an annotation key.

    :param bamfiles: iterable of bam file names.
    :param annotations_file: tab-delimited file with a header line; the first
        column is the key, the other columns are copied to the output.
    :param pref_name: suffix of the two count column names.
    :param output: output file name (a unique name is generated if None).
    :returns: the output file name. Columns: key, counts, %counts, then the
        annotation columns. Percentages are 0 when no read was counted.
    '''
    if output is None: output = unique_filename_in()
    annotations = {}  # key -> remaining annotation columns
    counts = {}
    with open(annotations_file) as f:
        header = next(f).strip('\n').split("\t")
        for line in f:
            s = line.strip('\n').split("\t")
            k = s.pop(0)
            annotations[k] = s
            counts[k] = 0

    tot = 0
    for bamfile in bamfiles:
        infile = pysam.Samfile(bamfile)
        try:
            for read in infile:
                nh = dict(read.tags).get('NH', 1)
                # (str, unicode) on Python 2, (str, str) on Python 3
                if isinstance(nh, (str, type(u''))): nh = 1
                if nh < 1: continue
                inh = 1.0 / nh
                rname = infile.getrname(read.rname).split("|")[0]
                if rname in counts:
                    counts[rname] += inh
                ## still increment if not in counts?
                tot += inh
        finally:
            # Release the bam handle even if a read cannot be processed
            infile.close()

    with open(output, 'w') as out:
        out.write('\t'.join(
            [header[0], 'counts_' + pref_name, '%counts_' + pref_name] +
            header[1:]) + '\n')
        for k, columns in annotations.items():
            # No counted read at all (e.g. empty bam list): avoid dividing by 0
            pc = 100 * counts[k] / tot if tot else 0.0
            out.write('\t'.join([k, "%.2f" %
                                 counts[k], "%.3f" % pc] + columns) + '\n')

    return output
Exemplo n.º 37
0
def bam_to_annot_counts(bamfiles, annotations_file, pref_name="", output=None):
    """
    Scan each bam file of a list and calculate the corrected counts for each annotation key
    present in the "annotations_file".

    A read adds 1/NH to the count of its reference (NH tag, taken as 1 when
    missing or not numeric; reads with NH < 1 are skipped). Reference names
    are cut at the first "|". The total used for percentages also includes
    reads mapped to references that are not annotation keys.

    :param bamfiles: iterable of bam file names.
    :param annotations_file: tab-delimited file with a header line; first
        column is the key, remaining columns are copied to the output.
    :param pref_name: suffix of the "counts_" and "%counts_" column names.
    :param output: output file name; a unique name is generated when None.
    :returns: the output file name. Percentages are written as 0 when no
        read was counted at all.
    """
    if output is None:
        output = unique_filename_in()
    annotations = {}  # key -> remaining annotation columns
    counts = {}
    with open(annotations_file) as f:
        header = next(f).strip("\n").split("\t")
        for line in f:
            s = line.strip("\n").split("\t")
            k = s.pop(0)
            annotations[k] = s
            counts[k] = 0

    tot = 0
    for bamfile in bamfiles:
        infile = pysam.Samfile(bamfile)
        try:
            for read in infile:
                nh = dict(read.tags).get("NH", 1)
                # (str, unicode) on Python 2, (str, str) on Python 3
                if isinstance(nh, (str, type(u""))):
                    nh = 1
                if nh < 1:
                    continue
                inh = 1.0 / nh
                rname = infile.getrname(read.rname).split("|")[0]
                if rname in counts:
                    counts[rname] += inh
                ## still increment if not in counts?
                tot += inh
        finally:
            # Always release the bam handle, also when a read fails
            infile.close()

    with open(output, "w") as out:
        out.write("\t".join([header[0], "counts_" + pref_name, "%counts_" + pref_name] + header[1:]) + "\n")
        for k, columns in annotations.items():
            # Guard against a zero total (no bam file / no counted read)
            pc = 100 * counts[k] / tot if tot else 0.0
            out.write("\t".join([k, "%.2f" % counts[k], "%.3f" % pc] + columns) + "\n")

    return output
Exemplo n.º 38
0
def getCountsPerLevel(infile, level=None, output=None):
    '''
    Sum the counts of a tab-delimited table per value of the column `level`.

    The counts are read from the second column of `infile`; the header must
    contain a column named `level` and a column named 'Kingdom'. Rows shorter
    than the header are padded with empty cells.

    :param infile: input file name (first line is the header).
    :param level: name of the column to aggregate on.
    :param output: output file name (a unique name is generated if None).
    :returns: the output file name. Each output row holds the level value
        ('Unnanotated' when empty), the columns from `level` towards -- but
        excluding -- 'Kingdom' as seen on the last row with that value, the
        summed counts and their percentage of the grand total (0 when the
        grand total is 0).
    :raises ValueError: if no column is named `level` (including the default
        None) or 'Kingdom'.

    NOTE(review): the column range starts at `level` itself, so the level
    value is written twice and 'Kingdom' never appears -- confirm this is the
    intended layout.
    '''
    if output is None: output = unique_filename_in()
    counts = {}
    lineages = {}  # level value -> selected taxonomy columns
    tot = 0
    idColCounts = 1
    name = ''
    with open(infile) as f:
        header = next(f).strip('\n').split('\t')
        try:
            level_idx = header.index(level)
        except ValueError:
            # %-formatting so that a non-string level (e.g. None) still
            # yields the intended ValueError instead of a TypeError
            raise ValueError("No column corresponds to %s in file %s" %
                             (level, infile))

        level_top = header.index('Kingdom')
        # Walk from the level column towards 'Kingdom', in either direction
        colrange = range(level_idx, level_top,
                         2 * int(level_top > level_idx) - 1)
        header_out = [header[n] for n in colrange]
        name = header[idColCounts]
        for line in f:
            s = line.strip('\n').split('\t')
            if len(s) < len(header): s.extend([''] * (len(header) - len(s)))
            value = float(s[idColCounts])
            tot += value
            counts[s[level_idx]] = counts.get(s[level_idx], 0.0) + value
            lineages[s[level_idx]] = [s[n] for n in colrange]

    with open(output, 'w') as out:
        header = [level] + header_out + ["counts_" + name, "%counts_" + name]
        out.write("\t".join(header) + "\n")
        for k, v in lineages.items():
            # All-zero counts would otherwise divide by zero
            pc = 100 * counts[k] / tot if tot else 0.0
            curk = k or 'Unnanotated'
            out.write(
                "\t".join([curk] + v +
                          ["%.2f" % counts[k], "%.3f" % pc]) + "\n")

    return output
Exemplo n.º 39
0
    def convert_junc_file(self, filename):
        """Convert a .junc SOAPsplice output file to bed format. Return the file name.

        Chromosome names are translated using the assembly, and every record
        gets a 'name' of the form 'junction<N>', N being a counter starting
        at 0.

        :param filename: (str) name of the .junc file to convert.
        """
        junc_fields = ['chr', 'start', 'end', 'strand', 'score']
        source = track(filename,
                       format='txt',
                       fields=junc_fields,
                       chrmeta=self.assembly.chrmeta)
        # Translate chromosome names
        translated = map_chromosomes(source.read(), self.assembly.chromosomes)
        # Add junction IDs: 'name' starts as a copy of 'strand' and is then
        # replaced by the junction number
        with_name = duplicate(translated, 'strand', 'name')
        ids = itertools.count()

        def _junction_name(_current):
            return 'junction' + str(ids.next())

        junctions = apply(with_name, 'name', _junction_name)
        # Convert to bed format
        bed = unique_filename_in() + '.bed'
        target = track(bed, fields=junctions.fields, chrmeta=self.assembly.chrmeta)
        target.write(junctions)
        return bed
Exemplo n.º 40
0
def combine_counts(counts, idsColsKey, idsColsCounts, output="combined_counts.txt"):
    """Merge the count columns of several tab-delimited tables into one table.

    Every file has one header line. "[" and "]" characters are removed from
    all lines. Rows are matched across files on the key columns.

    :param counts: list of input file names.
    :param idsColsKey: index (or list/tuple of indices) of the key column(s).
    :param idsColsCounts: index (or list/tuple of indices) of the count
        column(s) taken from every file.
    :param output: output file name; a unique name is generated when it is
        None or "".
    :returns: the output file name. Columns: key column(s), the count
        column(s) of each file in input order, then the remaining ("info")
        columns of the FIRST file.

    Cells are left empty where a key is missing from a file. Keys that only
    appear in a later file are kept, with empty info columns. Row order
    follows dict iteration order.
    """
    if output in [None, ""]:
        output = unique_filename_in()
    if not isinstance(idsColsKey, (list, tuple)):
        idsColsKey = [idsColsKey]
    if not isinstance(idsColsCounts, (list, tuple)):
        idsColsCounts = [idsColsCounts]
    # Built once: O(1) membership tests, and no list + tuple concatenation
    used_cols = set(idsColsKey) | set(idsColsCounts)
    nfiles = len(counts)
    # Placeholder as wide as one file's count columns keeps rows rectangular
    empty_counts = "\t".join([""] * len(idsColsCounts))

    all_counts = {}
    infos = {}
    leninfos = 0
    # Defined up front so an empty `counts` list still writes a header line
    h_key = h_counts = h_infos = ""

    def _split(line):
        return line.strip("\n").replace("[", "").replace("]", "").split("\t")

    for i, filename in enumerate(counts):
        with open(filename) as f:
            s = _split(next(f))
            if i == 0:  # 1st file: initialization of counts and infos
                _colinfos = [ss for n, ss in enumerate(s) if n not in used_cols]
                leninfos = len(_colinfos)
                h_infos = "\t".join(_colinfos)
                h_counts = "\t".join([s[n] for n in idsColsCounts])
                h_key = "\t".join([s[n] for n in idsColsKey])
            else:
                h_counts += "\t".join([""] + [s[n] for n in idsColsCounts])
            for line in f:
                s = _split(line)
                curKey = "\t".join([s[n] for n in idsColsKey])
                if i == 0:  # 1st file: initialization of counts and infos
                    all_counts[curKey] = [empty_counts] * nfiles
                    curInfo = [ss for n, ss in enumerate(s) if n not in used_cols]
                    if len(curInfo) < leninfos:
                        curInfo.extend([""] * (leninfos - len(curInfo)))
                    infos[curKey] = "\t".join(curInfo)
                elif curKey not in all_counts:
                    # Key absent from the first file: start an empty row
                    # instead of failing with a KeyError
                    all_counts[curKey] = [empty_counts] * nfiles
                all_counts[curKey][i] = "\t".join([s[n] for n in idsColsCounts])

    empty_infos = "\t".join([""] * leninfos)
    with open(output, "w") as out:
        out.write(h_key + "\t" + h_counts + "\t" + h_infos + "\n")
        for k, cells in all_counts.items():
            out.write(k + "\t" + "\t".join(cells) + "\t" + infos.get(k, empty_infos) + "\n")

    return output
Exemplo n.º 41
0
def main():
    """Command-line entry point: write fixed-length read substrings as fasta.

    The option parser is built from the module-level `usage`, `description`
    and `opts`. For every record of the fastq file `opt.input`, one fasta
    entry is written to `opt.output` (or to a unique file name): its header
    is "<name>_<sequence>_<quality>" and its sequence is the substring of
    length `opt.length` starting at the 1-based position `opt.start`.

    Returns 1 after printing the message and the help when a `Usage` error is
    raised; returns None (implicitly) otherwise.
    """
    parser = None
    try:
        parser = optparse.OptionParser(usage=usage, description=description)
        # Each entry of `opts` is either (short, long, help, kwargs) or
        # (flag, help, kwargs); entries of any other length are ignored.
        for opt in opts:
            if len(opt) == 4:
                parser.add_option(opt[0], opt[1], help=opt[2], **opt[3])
            elif len(opt) == 3:
                parser.add_option(opt[0], help=opt[1], **opt[2])
        (opt, args) = parser.parse_args()

        if not (opt.input and os.path.exists(opt.input)):
            raise Usage("Please provide a fastq file")

        # NOTE(review): `n` and `x` are not defined in this function --
        # presumably module-level names, otherwise the debug branch raises a
        # NameError; confirm (opt.length / opt.start may have been intended).
        if opt.debug:
            print("""
fastqToFasta.py
i=%s
n=%i
x=%i
""" % (opt.input, n, x))

        fq = pysam.FastqFile(opt.input)
        faFile = opt.output or unique_filename_in()
        rlen = int(opt.length)
        rskip = int(opt.start) - 1  # opt.start is 1-based
        fa = open(faFile, "w")
        for i, s in enumerate(fq):
            seq = s.sequence[rskip:(rskip + rlen)]
            # The full original read and its qualities are kept in the header
            header = "_".join([s.name, s.sequence, s.quality])
            fa.write(">" + header + "\n" + seq + "\n")
        fq.close()
        fa.close()

    except Usage, err:
        print >> sys.stderr, '\n', err.msg, '\n'
        if parser: parser.print_help()
        return 1
Exemplo n.º 42
0
def main():
    """Command-line entry point: convert a fastq file to fasta.

    Options come from the module-level `usage`, `description` and `opts`.
    Each fastq record of `opt.input` yields one fasta entry in `opt.output`
    (or a unique file name): the header joins the read name, full sequence
    and qualities with "_"; the sequence is cut to `opt.length` characters
    starting at the 1-based position `opt.start`.

    Returns 1 when a `Usage` error is raised (after printing the message and
    the help), None (implicitly) otherwise.
    """
    parser = None
    try:
        parser = optparse.OptionParser(usage=usage, description=description)
        # Entries of `opts` have 4 items (short, long, help, kwargs) or
        # 3 items (flag, help, kwargs); other lengths are ignored.
        for opt in opts:
            if len(opt) == 4:
                parser.add_option(opt[0],opt[1],help=opt[2],**opt[3])
            elif len(opt) == 3:
                parser.add_option(opt[0],help=opt[1],**opt[2])
        (opt, args) = parser.parse_args()

        if not(opt.input and os.path.exists(opt.input)):
            raise Usage("Please provide a fastq file")

        # NOTE(review): `n` and `x` are not defined in this function --
        # presumably module-level names, otherwise the debug branch raises a
        # NameError; confirm (opt.length / opt.start may have been intended).
        if opt.debug:
            print("""
fastqToFasta.py
i=%s
n=%i
x=%i
""" %(opt.input,n,x))

        fq = pysam.FastqFile(opt.input)
        faFile = opt.output or unique_filename_in()
        rlen = int(opt.length)
        rskip = int(opt.start)-1  # opt.start is 1-based
        fa = open(faFile,"w")    
        for i,s in enumerate(fq):
            seq = s.sequence[rskip:(rskip+rlen)]
            # The full original read and its qualities are kept in the header
            header = "_".join([s.name,s.sequence,s.quality])
            fa.write(">"+header+"\n"+seq+"\n")
        fq.close()
        fa.close()

    except Usage, err:
        print >>sys.stderr, '\n',err.msg,'\n'
        if parser: parser.print_help()
        return 1
Exemplo n.º 43
0
def _begin(output,format,new,ratio=1.375,**kwargs):
    """Initializes the plot in *R*."""
    if new:
        if output is None:
            output = unique_filename_in()
        if format == 'pdf':
            robjects.r('pdf("%s",paper="a4",height=8*%f,width=8)' %(output,ratio))
        elif format == 'png':
            robjects.r('png("%s",height=800*%f,width=800,type="cairo")' %(output,ratio))
        else:
            raise ValueError("Format not supported: %s" %format)
        pars = "lwd=2,cex=1.1,cex.main=1.5,cex.lab=1.3,cex.axis=1.1,mar=c(4,4,1,1),las=1,pch=20"
        if len(kwargs.get('mfrow',[])) == 2:
            pars += ",mfrow=c(%i,%i)" %tuple(kwargs['mfrow'])
        robjects.r('par(%s)' %pars)
    opts = ''
    if 'log' in kwargs: opts += ',log="%s"' %kwargs['log']
    if 'xlim' in kwargs: opts += ',xlim=c(%f,%f)' %tuple(kwargs['xlim'])
    if 'ylim' in kwargs: opts += ',ylim=c(%f,%f)' %tuple(kwargs['ylim'])
    opts += ',main="%s"' %kwargs.get('main','')
    opts += ',xlab="%s"' %kwargs.get('xlab','')
    opts += ',ylab="%s"' %kwargs.get('ylab','')
    return opts, output
Exemplo n.º 44
0
def create_tracks(ex, outall, sample_names, assembly):
    """Write BED tracks showing SNPs found in each sample.

    `outall` is read as a text track whose columns are chromosome, position,
    reference, one column per sample, then gene, location_type and distance
    (the last three are not read). For every row and sample, if the first
    character of the sample's column differs from the reference, a feature
    "<ref>><char>" covering (position-1, position) is appended to that
    sample's track. Each track is then closed and added to `ex` as
    "<sample>_SNPs.bed.gz".
    """
    infields = ['chromosome', 'position', 'reference']
    infields = infields + sample_names
    infields = infields + ['gene', 'location_type', 'distance']
    intrack = track(outall, format='text', fields=infields,
                    chrmeta=assembly.chrmeta, intypes={'position': int})
    instream = intrack.read(fields=infields[:-3])
    # One output track per sample: sample name -> (track, file name)
    outtracks = {}
    for sample_name in sample_names:
        bed_path = unique_filename_in() + '.bed.gz'
        bed_track = track(bed_path, fields=['name'])
        bed_track.make_header(name=sample_name + "_SNPs")
        outtracks[sample_name] = (bed_track, bed_path)

    for row in instream:
        position = row[1]
        coord = (row[0], position - 1, position)
        ref = row[2]
        # Annotation of this row for every sample; None when it matches the
        # reference
        snp = {}
        for n, name in enumerate(sample_names):
            observed = row[3 + n][0]
            if observed == ref:
                snp[name] = None
            else:
                snp[name] = "%s>%s" % (ref, observed)
        for name, (bed_track, _unused_path) in outtracks.iteritems():
            if snp[name]:
                bed_track.write([coord + (snp[name], )], mode='append')
    for name, (bed_track, bed_path) in outtracks.iteritems():
        bed_track.close()
        description = set_file_descr(name + "_SNPs.bed.gz", type='bed',
                                     step='tracks', gdv='1', ucsc='1')
        ex.add(bed_path, description=description)
Exemplo n.º 45
0
def getCountsPerLevel(infile, level=None, output=None):
    """
    Sum the counts of a tab-delimited table per value of the column `level`.

    Counts come from the second column of `infile`. The header must contain a
    column named `level` and one named "Kingdom". Rows shorter than the
    header are padded with empty cells.

    :param infile: input file name (first line is the header).
    :param level: name of the column to aggregate on.
    :param output: output file name; a unique name is generated when None.
    :returns: the output file name. Each row holds the level value
        ("Unnanotated" when empty), the columns from `level` towards -- but
        excluding -- "Kingdom" as seen on the last row with that value, the
        summed counts, and their percentage of the grand total (0 when the
        grand total is 0).
    :raises ValueError: if no column is named `level` (including the default
        None) or "Kingdom".

    NOTE(review): the column range starts at `level` itself, so the level
    value is written twice and "Kingdom" never appears -- confirm this is the
    intended layout.
    """
    if output is None:
        output = unique_filename_in()
    counts = {}
    lineages = {}  # level value -> selected taxonomy columns
    tot = 0
    idColCounts = 1
    name = ""
    with open(infile) as f:
        header = next(f).strip("\n").split("\t")
        try:
            level_idx = header.index(level)
        except ValueError:
            # %-formatting keeps the intended ValueError even when `level`
            # is not a string (e.g. the default None)
            raise ValueError("No column corresponds to %s in file %s" % (level, infile))

        level_top = header.index("Kingdom")
        # Step towards "Kingdom" whichever side of `level` it is on
        colrange = range(level_idx, level_top, 2 * int(level_top > level_idx) - 1)
        header_out = [header[n] for n in colrange]
        name = header[idColCounts]
        for line in f:
            s = line.strip("\n").split("\t")
            if len(s) < len(header):
                s.extend([""] * (len(header) - len(s)))
            value = float(s[idColCounts])
            tot += value
            counts[s[level_idx]] = counts.get(s[level_idx], 0.0) + value
            lineages[s[level_idx]] = [s[n] for n in colrange]

    with open(output, "w") as out:
        header = [level] + header_out + ["counts_" + name, "%counts_" + name]
        out.write("\t".join(header) + "\n")
        for k, v in lineages.items():
            # A zero grand total would otherwise raise ZeroDivisionError
            pc = 100 * counts[k] / tot if tot else 0.0
            curk = k or "Unnanotated"
            out.write("\t".join([curk] + v + ["%.2f" % counts[k], "%.3f" % pc]) + "\n")

    return output
Exemplo n.º 46
0
def dnaseseq_workflow(ex, job, assembly, logfile=sys.stdout, via='lsf'):
    """
    This workflow performs the following steps:

      * BAM files from replicates within the same group are merged
      * MACS is called to identify enriched regions (only peak summit +- 300 will be used), this can be by-passed by providing a bed file to any group
      * Wellington is called to identify footprints within these enriched regions
      * If a list of motifs is provided (by group), footprints are scanned and motif occurrences (log-likelihood ratio > 0) are recorded in a bed file
      * Average DNAse profiles around motifs are plotted

    :param ex: execution object; outputs are registered through ``ex.add``.
    :param job: object read for ``job.files`` (group id -> runs),
        ``job.groups`` (group id -> dict with 'name', 'control' and optional
        'bedfile', 'motif') and ``job.options``.
    :param assembly: object providing ``chrmeta`` and ``chrnames``.
    :param logfile: file-like object receiving progress messages.
    :param via: execution mode passed on to the helper functions.
    :returns: 0.
    :raises TypeError: if a value of ``job.files`` is not a dict.
    """
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    # Parent of the remote working directory: relative bed files are looked
    # up there
    supdir = os.path.split(ex.remote_working_directory)[0]
    for gid, mapped in job.files.iteritems():
        group_name = job.groups[gid]['name']
        if not isinstance(mapped, dict):
            raise TypeError(
                "Files values must be dictionaries with keys *run_ids* or 'bam'."
            )
        # A dict that directly holds a 'bam' entry is treated as a single run
        if 'bam' in mapped: mapped = {'_': mapped}
        if len(mapped) > 1:
            # Several runs in the group: merge (and index) their bam files
            bamfile = merge_bam(ex, [m['bam'] for m in mapped.values()])
            index = index_bam(ex, bamfile)
        else:
            bamfile = mapped.values()[0]['bam']
        if job.groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid, group_name))
        else:
            # Optional user-provided bed file, given as is or relative to
            # supdir; None when neither path exists
            if os.path.exists(job.groups[gid].get('bedfile', 'null')):
                bedfile = job.groups[gid]['bedfile']
            elif os.path.exists(
                    os.path.join(supdir,
                                 job.groups[gid].get('bedfile', 'null'))):
                bedfile = os.path.join(supdir, job.groups[gid]['bedfile'])
            else:
                bedfile = None
            tests.append((bedfile, bamfile))
            names['tests'].append((gid, group_name))
    if len(controls) < 1:
        # No control group: use a single placeholder control
        controls = [None]
        names['controls'] = [(0, None)]
    tests = macs_bedfiles(ex, assembly.chrmeta, tests, controls, names,
                          job.options.get('macs_args', ["--keep-dup", "10"]),
                          via, logfile)
    bedlist = run_wellington(ex, tests, names, assembly, via, logfile)
    ######################### Motif scanning / plotting
    # Only run when at least one group has a non-empty motif other than 'null'
    if any([
            gr.get('motif') != 'null' and gr.get('motif')
            for gr in job.groups.values()
    ]):
        motifbeds = motif_scan(ex, bedlist, assembly, job.groups, via, logfile)
        # Signal tracks to plot, per test group id
        siglist = dict((gid[0], []) for gid in names['tests'])
        # NOTE(review): this loop iterates the raw job.files values; the
        # {'_': mapped} wrapping applied in the first loop is not repeated
        # here -- confirm that a group given directly as {'bam': ...} is
        # handled as intended.
        for gid, mapped in job.files.iteritems():
            wig = []
            suffixes = ["fwd", "rev"]
            merge_strands = int(job.options.get('merge_strands', -1))
            read_extension = int(job.options.get('read_extension') or -1)
            # Existing wigs are only reused when strands were not merged and
            # the read extension option is exactly 1
            make_wigs = merge_strands >= 0 or read_extension != 1
            for m in mapped.values():
                if make_wigs or not ('wig' in m) or len(m['wig']) < 2:
                    output = mapseq.parallel_density_sql(
                        ex,
                        m["bam"],
                        assembly.chrmeta,
                        nreads=m["stats"]["total"],
                        merge=-1,
                        read_extension=1,
                        convert=False,
                        b2w_args=[],
                        via=via)
                    wig.append(dict(
                        (s, output + s + '.sql') for s in suffixes))
                else:
                    wig.append(m['wig'])
            if len(wig) > 1:
                # Several runs: merge their densities strand by strand
                wig[0] = dict((s, merge_sql(ex, [x[s] for x in wig], via=via))
                              for s in suffixes)
            _trn = job.groups[gid]['name'] + "_%s"
            if job.groups[gid]['control']:
                # A control signal is added to every test group
                for s, w in wig[0].iteritems():
                    for _g in siglist.keys():
                        siglist[_g].append(track(w, info={'name': _trn % s}))
            else:
                siglist[gid].extend([
                    track(w, info={'name': _trn % s})
                    for s, w in wig[0].iteritems()
                ])
        plot_files = plot_footprint_profile(ex, motifbeds, siglist,
                                            assembly.chrnames, job.groups,
                                            logfile)
        for gid, flist in plot_files.iteritems():
            gname = job.groups[gid]['name']
            # Empty placeholder file the pdf and the archive are associated to
            plotall = unique_filename_in()
            touch(ex, plotall)
            ex.add(plotall,
                   description=set_file_descr(gname + '_footprints_plots',
                                              type='none',
                                              view='admin',
                                              step='motifs',
                                              groupId=gid))
            ex.add(flist['pdf'],
                   description=set_file_descr(gname + '_footprints_plots.pdf',
                                              type='pdf',
                                              step='motifs',
                                              groupId=gid),
                   associate_to_filename=plotall,
                   template='%s.pdf')
            # Bundle the per-motif matrices as <group>_<motif>.txt in a tar.gz
            tarname = unique_filename_in()
            tarfh = tarfile.open(tarname, "w:gz")
            for mname, matf in flist['mat']:
                tarfh.add(matf, arcname="%s_%s.txt" % (gname, mname))
            tarfh.close()
            ex.add(tarname,
                   description=set_file_descr(gname +
                                              '_footprints_plots.tar.gz',
                                              type='tar',
                                              step='motifs',
                                              groupId=gid),
                   associate_to_filename=plotall,
                   template='%s.tar.gz')
    logfile.write("\nDone.\n ")
    logfile.flush()
    return 0
Exemplo n.º 47
0
def snp_workflow(ex, job, assembly, minsnp=40., mincov=5, path_to_ref=None, via='local',
                 logfile=sys.stdout, debugfile=sys.stderr):
    """Main function of the SNP workflow.

    Steps, in order:

    1. For each group in ``job.files``, merge its BAM files (when there is more
       than one) and launch one non-blocking ``pileup`` job per chromosome.
    2. Wait for all pileup jobs, tar-gzip the resulting vcf files and add the
       archive to *ex*.
    3. For each chromosome, call ``all_snps`` and ``exon_snps`` (which are given
       the output file names *outall* / *outexons*), and accumulate one
       character per SNP row and per sample into an alignment table.
    4. Add the SNP tables and the alignment file to *ex*, then call
       ``create_tracks``.
    5. If ``job.options['make_bigwigs']`` is true, write per-group coverage,
       heterozygosity and quality bigWig tracks.

    :param ex: execution object; result files are registered with ``ex.add``.
    :param job: object providing ``groups``, ``files`` and ``options``.
    :param assembly: object providing ``fasta_by_chrom``, ``chrmeta``,
        ``chrnames``-style metadata and ``name``.
    :param minsnp: converted to ``float`` and forwarded to ``all_snps``.
    :param mincov: forwarded to ``all_snps``.
    :param path_to_ref: not used by this function.
    :param via: forwarded to ``pileup.nonblocking``.
    :param logfile: file-like object receiving progress messages.
    :param debugfile: forwarded to ``all_snps`` and ``exon_snps``.
    :returns: 0
    """
    ref_genome = assembly.fasta_by_chrom
    sample_names = [job.groups[gid]['name'] for gid in sorted(job.files.keys())]

    logfile.write("\n* Generate vcfs for each chrom/group\n"); logfile.flush()
    vcfs = dict((chrom,{}) for chrom in ref_genome.keys()) # {chr: {}}
    bams = {}
    # Launch the jobs
    for gid in sorted(job.files.keys()):
        # Merge all bams belonging to the same group
        runs = [r['bam'] for r in job.files[gid].itervalues()]
        bam = Samfile(runs[0])
        header = bam.header
        headerfile = unique_filename_in()
        # Rename the sequences known to the assembly to their 'ac' identifier
        for h in header["SQ"]:
            if h["SN"] in assembly.chrmeta:
                h["SN"] = assembly.chrmeta[h["SN"]]["ac"]
        # Opening in "wh" mode and closing immediately writes a header-only file
        head = Samfile( headerfile, "wh", header=header )
        head.close()
        # The first bam was only opened to read its header: release it
        bam.close()
        if len(runs) > 1:
            _b = merge_bam(ex,runs)
            index_bam(ex,_b)
            bams[gid] = _b
        else:
            bams[gid] = runs[0]
        # Samtools mpileup + bcftools + vcfutils.pl
        for chrom,ref in ref_genome.iteritems():
            vcf = unique_filename_in()
            # Temporarily store (output file name, pending job) for this chrom/group
            vcfs[chrom][gid] = (vcf,
                                pileup.nonblocking(ex, bams[gid], ref, header=headerfile,
                                                   via=via, stdout=vcf))
        logfile.write("  ...Group %s running.\n" %job.groups[gid]['name']); logfile.flush()
    # Wait for vcfs to finish and store them in *vcfs[chrom][gid]*
    for gid in sorted(job.files.keys()):
        for chrom in ref_genome.keys():
            vcfs[chrom][gid][1].wait()
            vcfs[chrom][gid] = vcfs[chrom][gid][0]
        logfile.write("  ...Group %s done.\n" %job.groups[gid]['name']); logfile.flush()
    # Targz the pileup files (vcf)
    tarname = unique_filename_in()
    tarfh = tarfile.open(tarname, "w:gz")
    for chrom,v in vcfs.iteritems():
        for gid,vcf in v.iteritems():
            tarfh.add(vcf, arcname="%s_%s.vcf" % (job.groups[gid]['name'],chrom))
    tarfh.close()
    ex.add( tarname, description=set_file_descr("vcfs_files.tar.gz",step="pileup",type="tar",view='admin') )

    logfile.write("\n* Merge info from vcf files\n"); logfile.flush()
    outall = unique_filename_in()
    outexons = unique_filename_in()
    # Write the header line of both tables; the rows are presumably appended
    # by all_snps / exon_snps, which receive the file names -- confirm there.
    with open(outall,"w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+ \
                                 ['gene','location_type','distance'])+'\n')
    with open(outexons,"w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+['exon','strand','ref_aa'] \
                                  + ['new_aa_'+s for s in sample_names])+'\n')
    # One growing string per sequence name (reference assembly + each sample)
    msa_table = dict((s,'') for s in [assembly.name]+sample_names)
    for chrom,v in vcfs.iteritems():
        logfile.write("  > Chromosome '%s'\n" % chrom); logfile.flush()
        # Put together info from all vcf files
        logfile.write("  - All SNPs\n"); logfile.flush()
        allsnps = all_snps(ex,chrom,vcfs[chrom],bams,outall,assembly,
                           sample_names,mincov,float(minsnp),logfile,debugfile)
        # Annotate SNPs and check synonymy
        logfile.write("  - Exonic SNPs\n"); logfile.flush()
        exon_snps(chrom,outexons,allsnps,assembly,sample_names,ref_genome,logfile,debugfile)
        for snprow in allsnps:
            # Columns 3.. of a SNP row hold one entry per name, in the same
            # order as [assembly.name]+sample_names; keep its first character.
            for n,k in enumerate([assembly.name]+sample_names):
                msa_table[k] += snprow[3+n][0]
    description = set_file_descr("allSNP.txt",step="SNPs",type="txt")
    ex.add(outall,description=description)
    description = set_file_descr("exonsSNP.txt",step="SNPs",type="txt")
    ex.add(outexons,description=description)
    msafile = unique_filename_in()
    with open(msafile,"w") as msa:
        # First line: number of sequences and their common length
        msa.write(" %i %i\n"%(len(msa_table),len(msa_table.values()[0])))
        for name,seq in msa_table.iteritems():
            msa.write("%s\t%s\n" %(name,seq))
    msa_table = {}
    description = set_file_descr("SNPalignment.txt",step="SNPs",type="txt")
    ex.add(msafile,description=description)
    # Create UCSC bed tracks
    logfile.write("\n* Create tracks\n"); logfile.flush()
    create_tracks(ex,outall,sample_names,assembly)
    # Create quantitative tracks
    logfile.write("\n* Create heteroz. and quality tracks\n"); logfile.flush()

    def _process_pileup(pileups, seq, startpos, endpos):
        """Summarise pileup columns lying in ``[startpos, endpos)``.

        *seq* is the reference sequence starting at *startpos*.  Returns three
        lists of ``(start, end, score)`` tuples: coverage, heterozygosity
        score and mean base quality.  Only strictly positive scores are kept.
        """
        atoi = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        vectors = ([],[],[])
        for pileupcolumn in pileups:
            position = pileupcolumn.pos
            if position < startpos: continue
            if position >= endpos: break
            coverage = pileupcolumn.n
            # An empty column contributes nothing and would otherwise make
            # the mean-quality computation below divide by zero.
            if coverage <= 0: continue
            ref_symbol = seq[position-startpos]
            ref = atoi.get(ref_symbol, 4)
            # Counts of A, C, G, T and "anything else" (index 4)
            symbols = [0,0,0,0,0]
            quality = 0
            for pileupread in pileupcolumn.pileups:
                symbols[atoi.get(pileupread.alignment.seq[pileupread.qpos], 4)] += 1
                # Quality character code minus 33 (presumably Phred+33 -- confirm)
                quality += ord(pileupread.alignment.qual[pileupread.qpos])-33
            quality = float(quality)/coverage
            info = heterozygosity(ref, symbols[0:4])
            vectors[0].append((position, position+1, coverage))
            if info > 0: vectors[1].append((position, position+1, info))
            if quality > 0: vectors[2].append((position, position+1, quality))
        return vectors

    if job.options.get('make_bigwigs',False):
        _descr = {'groupId':0,'step':"tracks",'type':"bigWig",'ucsc':'1'}
        for gid,bamfile in bams.iteritems():
            _descr['groupId'] = gid
            bamtr = track(bamfile,format="bam")
            covname = unique_filename_in()+".bw"
            out_cov = track(covname, chrmeta=assembly.chrmeta)
            hetname = unique_filename_in()+".bw"
            out_het = track(hetname, chrmeta=assembly.chrmeta)
            qualname = unique_filename_in()+".bw"
            out_qual = track(qualname, chrmeta=assembly.chrmeta)
            for chrom, cinfo in assembly.chrmeta.iteritems():
                fasta = Fastafile(ref_genome[chrom])
                #process fasta and bam by 10Mb chunks
                for chunk in range(0,cinfo["length"],10**7):
                    fastaseq = fasta.fetch(cinfo['ac'], chunk, chunk+10**7)
                    vecs = _process_pileup(bamtr.pileup(chrom, chunk, chunk+10**7), fastaseq, chunk, chunk+10**7)
                    out_cov.write(vecs[0], fields=['start','end','score'], chrom=chrom)
                    out_het.write(vecs[1], fields=['start','end','score'], chrom=chrom)
                    out_qual.write(vecs[2], fields=['start','end','score'], chrom=chrom)
                # One fasta handle is opened per chromosome: do not let them pile up
                fasta.close()
            out_cov.close()
            out_het.close()
            out_qual.close()
            description = set_file_descr(job.groups[gid]['name']+"_coverage.bw",**_descr)
            ex.add(covname,description=description)
            description = set_file_descr(job.groups[gid]['name']+"_heterozygosity.bw",**_descr)
            ex.add(hetname,description=description)
            description = set_file_descr(job.groups[gid]['name']+"_quality.bw",**_descr)
            ex.add(qualname,description=description)

    return 0
Exemplo n.º 48
0
#!/usr/bin/env python

from bbcflib.common import unique_filename_in
import sys, getopt, os

# Command-line options:
#   -i  input tab-separated export file (required)
#   -o  output file (default: a unique file name in the working directory)
#   -n  1-based position of the first base to keep (default 1)
#   -x  number of bases to keep (default 22)
opts = dict(getopt.getopt(sys.argv[1:], "i:o:n:x:", [])[0])

exportFile = opts['-i']
n = opts.get('-n') or 1
x = opts.get('-x') or 22

print("In fastqToFasta")
# The input file name lives in `exportFile`; `fqFile` is not defined here.
print("i=" + exportFile)
print("n=" + str(n))
print("x=" + str(x))

faFile = opts.get('-o') or unique_filename_in()
n = int(n)
x = int(x)
with open(faFile, "w") as output:
    with open(exportFile, "r") as f:
        # Records are numbered from 1 in the output header
        for i, s in enumerate(f, 1):
            s = s.strip('\n').split('\t')
            # Header is built from columns 9, 10 and the last column, directly
            # followed by bases n..n+x-1 of column 9.
            # NOTE(review): there is no newline between the header and the
            # trimmed sequence, so each record is a single line -- confirm
            # whether a "\n" was intended before s[8][...].
            output.write(">line" + str(i) + ":" + s[8] + ":" + s[9] + ":" + s[-1] +
                         s[8][(n - 1):(n + x - 1)] + "\n")
Exemplo n.º 49
0


# Command-line options: -i input fastq file (required), -o output file,
# -n first base to keep (default 1), -x number of bases (default 22).
opts = dict(getopt.getopt(sys.argv[1:],"i:o:n:x:",[])[0])

fqFile=opts['-i']

n=opts.get('-n') or 1
x=opts.get('-x') or 22

print("In fastqToFasta")
print("i="+fqFile)
print("n="+str(n))
print("x="+str(x))

faFile=opts.get('-o') or unique_filename_in()
output=open(faFile,"w")
# nextIsSeq / nextIsQual record which kind of line is expected next
i=1; nextIsQual=0; nextIsSeq=0;
n=int(n);x=int(x)
read_length=getReadLength(fqFile)
print("readLength="+str(read_length))
with open(fqFile,"r") as f:
    for s in f:
        s=s.strip('\n')
        i=i+1
        if re.search(r'^@',s) and nextIsSeq == 0: #to avoid situations where the quality starts with either "@" or "+"
            nextIsSeq=1
            continue
        if re.search(r'^\+',s) and nextIsQual == 0: #to avoid situations where the quality starts with either "@" or "+"
            nextIsQual=1
            nextIsSeq=0
Exemplo n.º 50
0
def _outfile(kw):
    """Remove and return ``kw['outfile']``, or a fresh unique file name.

    The fallback name is generated before the lookup, whether or not
    *kw* contains an ``'outfile'`` entry.
    """
    fallback = unique_filename_in()
    return kw.pop('outfile', fallback)
Exemplo n.º 51
0
    def _fetch_symlink(self, link_name, to=None):
        """Fetch the data from a file in the LIMS into *to*.

        *link_name* is a (list of) URL to a .tar.gz file in the LIMS.  These
        .tar.gz files all contain only one file, which we write to
        *to*.  If *to* is omitted, then the data is written to a
        randomly named file in the current working directory.  If
        *to* is a directory, the data is written to a randomly named
        file in that directory.  Otherwise *to* is taken as the full
        path to the file to write to.

        ``_fetch_symlink`` returns the path to the output file,
        including its filename.
        """
        def _concat_all(target,llist):
            with open(target, 'w') as output_file:
                for link in llist:
                    try:
                        url = self._open_url(link)
                        tar = None
                        if re.sub('.gz[ip]*','',link).endswith(".tar"):
                            tar = tarfile.open(fileobj=url, mode='r|gz')
                            # Since the tar file contains exactly one file, calling
                            # ``next()`` on the tar gives us the file we want.  We cannot
                            # use ``getnames()[0]`` or similar methods, since they scan
                            # all the way through the file, and we cannot rewind on HTTP
                            # responses.
                            tar_filename = tar.next()
                            # extractfile returns a file-like object we can stream from.
                            input_file = tar.extractfile(tar_filename)
                        elif not(link.endswith(".gz")):
                            input_file = url
                        else:
                            input_file = gzip.GzipFile(fileobj=StringIO.StringIO(url.read()))
                        while True:
                            chunk = input_file.read(4096)
                            if chunk == '':
                                break
                            else:
                                output_file.write(chunk)
                        input_file.close()
                        if tar: tar.close()
                    except Exception as e:
                        raise Exception("Problem with file %s: %s"%(link,e))

        if to == None:
            target = unique_filename_in()
        elif os.path.isdir(to):
            target = os.path.join(to, unique_filename_in(to))
        else:
            target = to

        if isinstance(link_name,dict):
            linknext = ([],[])
            for k in sorted(link_name.keys()):
                if k[0] > 1: linknext[1].append(link_name[k])
                else: linknext[0].append(link_name[k])
            link_name = linknext
        if isinstance(link_name,str):  link_name = [link_name]
        if isinstance(link_name,list): link_name = (link_name,[])
        _concat_all(target,link_name[0])
        if len(link_name[1])>0:
            _concat_all(target+"_R2",link_name[1])
            return (target,target+"_R2")
        return target
Exemplo n.º 52
0
def run(**kwargs):
    """
    Wrapper function to execute any operation contained in this package, directly from
    file inputs. Arguments are:

    :param operation: (str) the name of the function to be called.
    :param output: (str) a filename or a directory to write the results into.
        If it is a directory, a uniquely named ``.sql`` file is created in it.
        Otherwise the format is taken from the file extension (default ``sql``);
        a ``.gz``/``.gzip`` extension is combined with the one before it.
    :param assembly: (str) a genome assembly identifier if needed.
    :param chromosome: (str) a chromosome name if operation must be restricted to a single chromosome.
    :param datatype: (str) if given, stored in the ``info`` of the output track(s).
    :param ...: additional parameters passed to `operation`. Parameters that the
        operation declares as loadable are comma-separated lists of track file names.
    :returns: the output file name, or a list of file names named
        ``<output without format>_<n>.<format>`` if the operation returns a list.
    :raises ValueError: if no module provides `operation`.

    Example::

        run(operation="score_by_feature",
            output="score_output.bed", chromosome="chr1",
            trackScores="density_file.sql", trackFeatures="genes.sql")
    """
    from bbcflib import genrep
    def _map(fct):
        # Find the first module whose class exposes an attribute named *fct*
        for module in _module_list:
            __import__(_here+module)
            smod = sys.modules[_here+module]
            if hasattr(getattr(smod, module)(),fct): return module
        return None
    funct = kwargs.pop("operation",'None')
    module = _map(funct)
    if module is None:
        raise ValueError("No such operation %s." %funct)
    output = kwargs.pop("output","./") or "./"
    if os.path.isdir(output):
        output = os.path.join(output,unique_filename_in(output)+".sql")
        format = "sql"
    else:
        format = os.path.splitext(output)[1][1:] or "sql"
    if format in ['gz','gzip']:
        # Remove exactly the trailing ".gz"/".gzip" (str.strip would remove any
        # of these *characters* from both ends, e.g. turning "x.wig.gz" into "x.wi")
        format = os.path.splitext(output[:-len(format)-1])[1][1:]+"."+format
    # Prefix of the numbered files written when the operation returns a list:
    # *output* minus its trailing format string (the separating dot is kept).
    if output.endswith(format):
        outbase = output[:-len(format)]
    else:
        outbase = output
    smod = sys.modules[_here+module]
    # Names of the arguments that must be loaded as tracks
    targs = list(getattr(smod, module)().loadable(funct))
    trackSet = {}
    for targ in targs:
        trackSet[targ] = [track(t) for t in kwargs[targ].split(",")]
    assembly = None
    if 'assembly' in kwargs:
        assembly = kwargs.pop('assembly')
    if assembly:
        # NOTE(review): in this branch a 'chromosome' argument is not consumed
        # and is passed on to the operation -- confirm this is intended.
        chrmeta = genrep.Assembly(assembly).chrmeta
    else:
        if not targs:
            raise ValueError("Operation %s loads no track: an assembly is required." %funct)
        # Take the chromosome metadata from the first track of the last argument
        chrmeta = trackSet[targs[-1]][0].chrmeta
        if 'chromosome' in kwargs:
            only_chrom = kwargs.pop('chromosome')
            chrmeta = {only_chrom: chrmeta.get(only_chrom,{})}
    chrom_names = list(chrmeta.keys())
    first_chrom = chrom_names[0]
    info = None
    if 'datatype' in kwargs: info = {'datatype': kwargs.pop('datatype')}

    def _apply(chrom):
        # Replace the track arguments by their streams on *chrom*, then call the operation
        for targ in targs:
            kwargs[targ] = [t.read(selection=chrom) for t in trackSet[targ]]
        return getattr(smod, funct)(**kwargs)

    # The first chromosome creates the output file(s); the others are appended
    funct_output = _apply(first_chrom)
    if isinstance(funct_output,list):
        files = []
        for n,stream in enumerate(funct_output):
            outf = "%s_%i.%s" %(outbase,n,format)
            files.append(outf)
            track(outf,chrmeta=chrmeta,fields=stream.fields,
                  info=info).write(stream,chrom=first_chrom)
        for chrom in chrom_names[1:]:
            for n,stream in enumerate(_apply(chrom)):
                track(files[n],chrmeta=chrmeta).write(stream,chrom=chrom,mode='append')
    else:
        files = output
        track(files,chrmeta=chrmeta,fields=funct_output.fields,
              info=info).write(funct_output,chrom=first_chrom)
        for chrom in chrom_names[1:]:
            track(files,chrmeta=chrmeta).write(_apply(chrom),chrom=chrom,mode='append')
    return files
Exemplo n.º 53
0
def save_wellington(ex, wellout, chrmeta):
    """Collect Wellington output files and register them with *ex*.

    For every entry of *wellout* a dummy file is added, to which three files
    are associated: a gzipped BED of the FDR 0.01 footprints, a gzipped BED
    with one ``track`` section per p-value cutoff, and the concatenated WIG.

    :param ex: execution object; files are registered with ``ex.add``.
    :param wellout: mapping whose keys are indexable (``name[0]`` is used as
        groupId, ``name[1]`` as the prefix of file descriptions) and whose
        values are lists of ``(directory, prefix)`` pairs.
    :param chrmeta: currently unused (only referenced by the disabled bigWig
        conversion below).
    :returns: dict ``{name[0]: path of the FDR 0.01 gzipped BED file}``.
    """
    def _copy_lines(path, gzout):
        # Append the content of the text file *path* to the open gzip file
        with open(path) as _bed:
            for line in _bed:
                gzout.write(line)

    bedlist = {}
    for name, wlist in wellout.iteritems():
        wellall = unique_filename_in()
        #### Dummy file
        touch(ex, wellall)
        ex.add(wellall,
               description=set_file_descr(name[1] + '_wellington_files',
                                          type='none',
                                          view='admin',
                                          step='footprints',
                                          groupId=name[0]))
        #### BED at FDR 1%
        bedlist[name[0]] = wellall + "FDR01.bed.gz"
        bedzip = gzip.open(bedlist[name[0]], 'wb')
        try:
            bedzip.write("track name='" + name[1] +
                         "_WellingtonFootprints_FDR_0.01'\n")
            for x in wlist:
                _copy_lines(os.path.join(*x) +
                            ".WellingtonFootprints.FDR.0.01.bed", bedzip)
        finally:
            bedzip.close()
        ex.add(wellall + "FDR01.bed.gz",
               description=set_file_descr(name[1] +
                                          '_WellingtonFootprintsFDR01.bed.gz',
                                          type='bed',
                                          ucsc='1',
                                          step='footprints',
                                          groupId=name[0]),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprintsFDR01.bed.gz')
        #### BED at p-values [...]
        bedzip = gzip.open(wellall + "PvalCutoffs.bed.gz", 'wb')
        try:
            # The cutoffs are discovered from the files of the first directory
            for bfile in os.listdir(os.path.join(wlist[0][0], "p_value_cutoffs")):
                cut = os.path.splitext(
                    bfile[:-4])[1][1:]  #between . ([1:]) and .bed ([:-4])
                bedzip.write("track name='" + name[1] +
                             "_WellingtonFootprints_Pval_%s'\n" % cut)
                for wdir, wpref in wlist:
                    _copy_lines(os.path.join(
                        wdir, "p_value_cutoffs",
                        wpref + ".WellingtonFootprints." + cut + ".bed"), bedzip)
        finally:
            bedzip.close()
        ex.add(wellall + "PvalCutoffs.bed.gz",
               description=set_file_descr(
                   name[1] + '_WellingtonFootprintsPvalCutoffs.bed.gz',
                   type='bed',
                   ucsc='1',
                   step='footprints',
                   groupId=name[0]),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprintsPvalCutoffs.bed.gz')
        #### WIG
        cat([os.path.join(*x) + ".WellingtonFootprints.wig" for x in wlist],
            wellall + ".wig")
        #convert(wellall+".wig", wellall+".bw", chrmeta=chrmeta)
        #ex.add(wellall+".bw",
        #       description=set_file_descr(name[1]+'_WellingtonFootprints.bw',
        #                                  type='bigWig', ucsc='1', step='footprints', groupId=name[0]),
        #       associate_to_filename=wellall, template='%s_WellingtonFootprints.bw')
        ex.add(wellall + ".wig",
               description=set_file_descr(name[1] +
                                          '_WellingtonFootprints.wig',
                                          type='wig',
                                          ucsc='1',
                                          step='footprints',
                                          groupId=name[0]),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprints.wig')
    return bedlist
Exemplo n.º 54
0
def run_microbiome(options=[], output=None):
    """Build the command description for ``run_microbiome.py``.

    Each option is converted to a string; list and tuple options are joined
    with commas.  *output* (a unique file name when omitted) is appended as
    the last argument and is also the ``return_value``.
    """
    if output is None:
        output = unique_filename_in()
    rendered = []
    for opt in options:
        if isinstance(opt, (list, tuple)):
            rendered.append(",".join(map(str, opt)))
        else:
            rendered.append(str(opt))
    command = ["run_microbiome.py"]
    command.extend(rendered)
    command.append(output)
    return {"arguments": command, "return_value": output}