Example #1
0
File: dio.py  Project: shouldsee/synotil
def bed__checkValid(bed, GSIZE, force=0):
    '''Validate that every bed interval fits inside its chromosome.

    `bed` may be a DataFrame or a filename (parsed via sdio.extract_peak).
    `GSIZE` is a headerless chromosome-size TSV.  With `force` falsy the
    function asserts that all intervals are valid; otherwise invalid rows
    are dropped and the result is returned -- as a DataFrame when the
    input was a DataFrame, or as the name of a "<stem>__valid.bed" file
    when the input was a filename.
    '''
    fname = None
    if not isinstance(bed, pd.DataFrame):
        fname, bed = bed, sdio.extract_peak(bed)

    # Chromosome lengths: two unnamed columns -> (chrom, length).
    sizeDF = pyutil.readData(GSIZE, ext='tsv', header=None, guess_index=0)
    sizeDF.columns = ['chrom', 'length']

    # Inner-join on the shared 'chrom' column, then flag in-bounds rows.
    checked = sizeDF.merge(bed)
    checked['valid'] = checked.eval('start > 0 and end <= length')

    if not force:
        assert checked.valid.all()
        return None

    cleaned = checked.query('valid').drop(columns=['valid', 'length'])
    if fname is None:
        return cleaned
    ofname = '%s__valid.bed' % fname.rsplit('.', 1)[0]
    pyutil.to_tsv(cleaned, ofname)
    return ofname
Example #2
0
File: jobs.py  Project: shouldsee/synotil
def job__combinePeak(
    bwCurr,
    featSummit='/home/feng/ref/Arabidopsis_thaliana_TAIR10/annotation/genes.gtf.cds.summit',
    GSIZE='/home/feng/ref/Arabidopsis_thaliana_TAIR10/genome.sizes',
    CUTOFF=4000,
    head=1000,
    alias='testPeaks',
    center_summit=1,
):
    '''Pool the strongest peaks across several samples and map them to genes.

    Takes the `head` highest-scoring peaks from each sample's narrowPeak
    file, concatenates them, optionally merges overlapping intervals,
    expands summits, and assigns peaks to the nearest AUG.  File names of
    the intermediate products are recorded in 'files.json'.
    '''
    bwCurr = bwCurr.dropna(subset=['npkFile'])
    bwCurr['npkFileLine'] = bwCurr.eval("npkFile.map(@pyutil.lineCount)")
    print(bwCurr[['bname', 'npkFileLine']].sort_values('bname'))

    # One peak table per sample, strongest peaks first.
    peakTables = {
        key: sdio.extract_peak(npk).sort_values('score', ascending=False)
        for key, npk in zip(bwCurr.index, bwCurr.npkFile)
    }

    topPeaks = [tbl.head(head) for tbl in peakTables.values()]
    bed = pd.concat(topPeaks, axis=0, sort=False).dropna(axis=1)

    ofname = pyutil.to_tsv(bed, '%s__combined.bed' % alias)
    if center_summit and (len(peakTables) > 1):
        ofname = sdio.bed__merge(ofname, silent=0)
    print(ofname, pyutil.lineCount(ofname))

    bedFile = ofname

    peakSummit = sdio.npk_expandSummit(fname=bedFile,
                                       radius=1,
                                       center_summit=center_summit)
    peak2geneFile = sdio.job__nearAUG(featSummit=featSummit,
                                      peakSummit=peakSummit,
                                      GSIZE=GSIZE,
                                      peakWid=1,
                                      CUTOFF=CUTOFF)

    # fileDict__save pulls these entries out of locals(): the names
    # bedFile / peakSummit / peak2geneFile are load-bearing.
    pyutil.fileDict__save('files.json',
                          d=locals(),
                          keys=['bedFile', 'peakSummit', 'peak2geneFile'])
    return 'files.json'
Example #3
0
File: jobs.py  Project: shouldsee/synotil
def job__chipTargPaired(
    bwCurr=None,
    bwMeta=None,
    control=None,
    treatment=None,
    xlab=None,
    ylab=None,
    name=None,
    #     bwMeta,
    NCORE=2,
    params__peakBW=None,
    CUTOFF_FC=3.0,
    CUTOFF_CHIPDIFF=0.7,
    innerRadius=100,
):
    '''Compare a pair of ChIP samples (control vs treatment).

    Filters each sample's narrowPeak file to FC > CUTOFF_FC, pools the
    surviving peaks, quantifies bigwig signal over the pooled peak set via
    sjob.figs__peakBW, and flags peaks whose (treatment - control) average
    signal difference exceeds CUTOFF_CHIPDIFF.  Writes a cluster CSV, a
    filtered bed file (also linked into output/), and FILE.json; returns
    (figs, clu) where figs maps figure names to matplotlib figures.

    NOTE(review): several strings below use .format(**locals()) and
    pyutil.fileDict__save(d=locals()), so local variable names
    (cluFile, peakFile, peakFileOrig, xlab, ylab, qsans, ...) are
    load-bearing -- do not rename them.
    '''
    figs = pyutil.collections.OrderedDict()  # figure name -> figure

    # Resolve the two sample labels; explicit control/treatment overrides.
    # NOTE(review): bwCurr is only rebuilt from bwMeta when xlab/ylab were
    # supplied -- calling with xlab=None AND bwCurr=None will fail here.
    if control is not None and treatment is not None:
        xlab, ylab = control, treatment
    if xlab is None or ylab is None:
        xlab, ylab = bwCurr.index
    elif bwCurr is None:
        bwCurr = bwMeta.reindex([xlab, ylab])

    if params__peakBW is None:

        params__peakBW = dict(
            outerRadius=500,
            innerRadius=innerRadius,
            NCORE=NCORE,
            outIndex=bwCurr.header,
            #     detailByCHIP = 0,
        )
    # The innerRadius argument always wins, even over a caller-supplied
    # params__peakBW dict.
    params__peakBW['innerRadius'] = innerRadius

    if name is None:
        name = '{xlab}-{ylab}'.format(**locals())
#     bwCurr = bwMeta
#     bwCurr = bwCurr.loc[[xlab,ylab]]

#     bwCurr.npkFile

    # Parse each sample's narrowPeak file (lazy under Python 3's map).
    dfs = map(
        sdio.extract_peak,
        bwCurr.npkFile,
    )

    # Diagnostic scatter: FC against its percentile, one series per sample.
    fig, ax = plt.subplots(1, 1, figsize=[7, 7])
    #     ax = plt.gca()
    for df in dfs:
        df['per_FC'] = pyutil.dist2ppf(df.FC)
        df.plot.scatter('per_FC', 'FC', ax=ax)

    # Keep only peaks above the FC cutoff; one filtered file per sample.
    fnames = [
        pyutil.queryCopy(infile=fname,
                         query='FC>%.3f' % CUTOFF_FC,
                         reader=sdio.extract_peak,
                         inplace=False) for fname in bwCurr.npkFile
    ]
    # dfs[1]

    # Pool the filtered peak files and re-centre intervals on summits.
    peakFlat = ' '.join(fnames)
    ofname = '%s-combined.bed' % ('-'.join(bwCurr.index))
    pyutil.shellexec('cat {peakFlat}>{ofname}'.format(**locals()))
    ofname = sdio.npk_expandSummit(fname=ofname, radius=1)

    pyutil.lineCount(ofname)
    peakFileOrig = peakFile = ofname

    # Quantify bigwig signal over the pooled peak set.
    res = sjob.figs__peakBW(peakFile=peakFile,
                            bwFiles=bwCurr.RPKMFile,
                            name=name,
                            **params__peakBW)
    figs.update(res[0])

    bwTrack, bwAvg = res[1]
    # Map the signal columns back to bwCurr's index labels.
    bwAvg.columns = bwAvg.columns.map(
        pyutil.df2mapper(bwCurr, 'header', 'index').get)
    #     .set_index('RPKMFile').loc[bwAvg.columns].
    #     bwAvg.columns = bwCurr.index

    xs, ys = bwAvg[[xlab, ylab]].values.T
    #     clu = None

    #     peakIndex = pyutil.df__pad(bwAvg).query(query).index
    # Select peaks whose treatment-minus-control signal exceeds the cutoff.
    clu = pd.DataFrame(pyutil.df__pad(bwAvg))
    query = ' val_{ylab} - val_{xlab} > {CUTOFF_CHIPDIFF} '.format(**locals())
    qsans = pyutil.sanitise_query(query)
    peakIndex = clu.query(query).index
    clu['clu'] = clu.eval('index in @peakIndex')

    stats = sdio.extract_peak(peakFile).set_index('acc', drop=0)
    # The left-hand side of the query doubles as the CHIPDIFF statistic.
    stats['CHIPDIFF'] = clu.eval(query.split('>')[0])

    pyvis.qc_2var(xs, ys, clu=clu.clu, xlab=xlab, ylab=ylab)
    figs['scatterPlot__%s' % name] = plt.gcf()
    cluFile = ofname = qsans + '.csv'
    clu.to_csv(ofname)
    print(ofname, pyutil.lineCount(ofname))
    peakBase = pyutil.getBname(peakFile)
    ofname = '{peakBase}-{qsans}.bed'.format(**locals())
    peakFile = pyutil.to_tsv(stats.reindex(peakIndex), ofname)
    pyutil.shellexec('mkdir -p output/')
    pyutil.file__link(ofname, 'output/%s.bed' % name, force=True)

    #     peakFile = pyutil.queryCopy(peakFile,
    #                                 query='acc in @peakIndex',
    #                                 reader=sdio.extract_peak,
    #                                 peakIndex=peakIndex,
    #                                )
    #     peakFile =  '{peakFile}-{qsans}.bed'
    #     pyutil.fileDict__main(ofname='FILE.json',
    #                          **pyutil.dictFilter(locals(),
    #                                              keys=['cluFile','peakFile',
    #                                             'peakFileOrig']
    #                                             ))

    pyutil.fileDict__save(d=locals(),
                          keys=['cluFile', 'peakFile', 'peakFileOrig'],
                          fname='FILE.json')
    return figs, clu
Example #4
0
File: dio.py  Project: shouldsee/synotil
def clu2bed(segDF, ofname=None):
    '''Collapse per-position cluster calls into contiguous bed intervals.

    Must have columns: ('acc','pos','clu') where `clu` is a 0/1 cluster
    membership flag.  Consecutive positions with clu==1 on the same `acc`
    are merged into one (chrom, start, end, acc) record.  Writes a TSV and
    returns its name when `ofname` is given (falling back to returning the
    DataFrame if the write fails); otherwise returns the DataFrame.
    '''
    segDF = segDF.reset_index()
    # Grid step inferred from the first two positions; assumes `pos` is an
    # evenly spaced grid -- TODO confirm upstream guarantees this.
    stepSize = np.diff(segDF['pos'].values[:2], axis=0)[0]
    vals = segDF[['clu', 'acc']].values
    # Rows where either the cluster label or the sequence name changes.
    isDiff = (vals[1:] != vals[:-1]).any(axis=1)
    segDF['isDiff'] = np.concatenate([[True], isDiff], axis=0)
    it = (pyutil.util_obj(**vars(x)) for x in segDF.itertuples())
    peak = pyutil.collections.OrderedDict((
        ('chrom', None),
        ('start', None),
        ('end', None),
        ('acc', None),
    ))
    peaks = []

    def savePeakStart():
        # Open a new interval at the current record.
        peak['chrom'] = rec.acc
        peak['start'] = rec.pos
        return

    def savePeakEnd():
        # Close the open interval one grid step past the previous position
        # (half-open end coordinate).
        peak['end'] = oldPos + stepSize
        peak['acc'] = 'summitPos%d' % ((peak['start'] + peak['end']) // 2)
        assert peak['end'] > peak['start'], peak
        peaks.append(peak.copy())
        return

    def changed():
        # Handle a transition: close the previous peak and/or open a new
        # one depending on old vs new cluster label and sequence.
        if idx != 0:
            if oldClu == 1:
                savePeakEnd()
            if rec.clu == 1:
                if (oldClu == 0) | (oldAcc != rec.acc):
                    savePeakStart()
        else:
            if rec.clu == 1:
                savePeakStart()
        return

    #### Starting the loop
    oldClu = 0
    for idx, rec in enumerate(it):
        if (idx == 0):
            changed()
        elif (rec.clu != oldClu) or (rec.acc != oldAcc):
            changed()
        oldClu = rec.clu
        oldPos = rec.pos
        oldAcc = rec.acc
    # Flush the trailing peak if the last record was inside a cluster.
    # NOTE(review): a single-row input (idx stays 0) never reaches
    # savePeakEnd, so its peak is silently dropped -- confirm intended.
    changed()

    resDF = pd.DataFrame(peaks)

    if ofname is not None:
        try:
            pyutil.to_tsv(
                resDF,
                ofname,
            )
            return ofname
        except Exception as e:
            # BUGFIX: was Python-2 `print e`, a SyntaxError under Python 3
            # (the rest of this file uses print() calls).
            print(e)
    return resDF
Example #5
0
File: dio.py  Project: shouldsee/synotil
def bed__embed(outerBed, innerBed, debug=0, ofname=None, mergeAcc=1):
    '''Lift `innerBed` intervals (relative to named features of `outerBed`)
    onto absolute genome coordinates.

    innerBed.chrom must name features listed in outerBed.acc.  The strand
    of the outer feature flips the inner interval.  debug=2 returns the
    raw merge, debug=1 the validation columns; otherwise returns a
    (chrom, start, end, acc) frame, written to `ofname` when given (the
    filename is returned on a successful write).
    '''
    if not isinstance(outerBed, pd.DataFrame):
        outerBed = sdio.extract_peak(outerBed)
    if not isinstance(innerBed, pd.DataFrame):
        innerBed = sdio.extract_peak(innerBed)

    assert 'acc' in outerBed, 'Reference bed file must be named'
    if 'acc' in innerBed:
        if mergeAcc:
            # Prefix inner accessions with their chrom to keep them unique.
            innerBed['acc'] = pyutil.df__paste0(innerBed, ['chrom', 'acc'],
                                                sep='_').tolist()
    for df in [outerBed, innerBed]:
        if 'strand' not in df.columns:
            df['strand'] = '+'
        # NOTE(review): fillna here is not in-place, so NaN strands simply
        # fall through to isin() below (NaN -> strandVal False).
        df.strand.fillna('+')
        df['strandVal'] = df.strand.isin(['+', '*'])
    res = innerBed.merge(
        outerBed,
        how='left',
        left_on='chrom',
        right_on='acc',
        suffixes=['Inner', 'Outer'],
    )
    # XNOR of strands: final orientation is forward iff both strands agree.
    res['strandValFinal'] = ~((res['strandValOuter']) ^
                              (res['strandValInner']))
    isNeg = res.strandValFinal == 0
    # Mirror inner coordinates for opposite-strand pairs.
    (res.loc[ isNeg,'startInner'], res.loc[isNeg,'endInner']) \
        = (-res.loc[isNeg,'endInner'],-res.loc[isNeg,'startInner'])
    res['valid'] = res.eval('startInner<=endInner')
    assert res.valid.all()
    if debug == 2:
        return res

    if debug == 1:
        return res[[
            'valid', 'chromInner', 'startInner', 'endInner', 'strandValFinal',
            'strandValInner', 'strandValOuter'
        ]]
    # Anchor point: end of the outer feature on '-', start on '+'.
    res['shift'] = 0
    isOuterNeg = res.strandValOuter == 0
    res.loc[isOuterNeg, 'shift'] = res.loc[isOuterNeg, 'endOuter']
    res.loc[~isOuterNeg, 'shift'] = res.loc[~isOuterNeg, 'startOuter']
    res['start'] = res['shift'] + res['startInner']
    res['end'] = res['shift'] + res['endInner']
    res['chrom'] = res['chromOuter']
    res['acc'] = res['accInner']
    resDF = res[[
        'chrom', 'start', 'end', 'acc'
    ]]
    if ofname is not None:
        try:
            pyutil.to_tsv(
                resDF,
                ofname,
            )
            return ofname
        except Exception as e:
            # BUGFIX: was Python-2 `print e`, a SyntaxError under Python 3
            # (the rest of this file uses print() calls).
            print(e)
    return resDF
Example #6
0
def main(
        #### necessary
        bedFile=None,
        bwFiles=None,
        ####
        DIR=None,
        figsize=[14, 14],
        debug=0,
        ylim=[0, 10],
        radius=2000,
        stepSize=10,
        NCORE=4,
        silent=0,
        gtfFile=None,
        cdsFile=None,
        annotation=None,
        GSIZE=None,
        center_summit=0,
        trackNames=None,
        backend='fluff',
        ext='png',
        **kwargs):
    '''Render one genome-browser "chip shot" image per interval in `bedFile`,
    plus a TSV index and (best effort) an HTML gallery.

    backend='fluff' draws via worker__fluff; backend='synotil' extracts
    bigwig tracks and draws via worker__drawPeak.  Returns
    (indexFile, htmlFile); htmlFile is None when HTML generation fails.

    NOTE: the mutable defaults (figsize/ylim lists) are shared across
    calls; they are read but not mutated in place here.
    '''
    # BUGFIX: under Python 3 map() returns a one-shot iterator; materialise
    # it so figsize survives being passed around and read repeatedly.
    figsize = list(map(int, figsize))

    prefix = 'PROG=chipShots_bedFile='
    bname = pyutil.os.path.basename(bedFile)
    odname = prefix + bname
    if DIR == 'inplace':
        # NOTE(review): dirname() has no trailing separator, so this
        # concatenates directly onto it -- confirm the naming is intended.
        DIR = pyutil.os.path.dirname(bedFile) + odname
    elif DIR is None:
        DIR = odname
    pyutil.shellexec('mkdir -p %s' % DIR, silent=silent)
    DIR = pyutil.os.path.abspath(DIR)

    # BUGFIX: only derive the default cdsFile when a gtfFile was supplied;
    # the default 'fluff' backend needs neither, and `None + '.cds'` raised
    # a TypeError before the backend was even selected.
    if cdsFile is None and gtfFile is not None:
        cdsFile = gtfFile + '.cds'
    if backend == 'synotil':
        nearFile = synotil.filterByCDS.main(
            peakFile=bedFile,
            cdsFile=cdsFile,
            downStream=radius,
            upStream=radius,
            peakRadius=1,
            GSIZE=GSIZE,
            center_summit=center_summit,
        )
        df_near = pyutil.readData(nearFile, )

        stderrLine('[MSG]Loading bed intervals from bigwig tracks....')

        chipTracks = sutil.extract_bigwig_multiple(
            fnames=bwFiles,
            bedFile=bedFile,
            radius=radius,
            stepSize=stepSize,
            callback=None,
            outIndex=trackNames,
            center_summit=center_summit,
            shift=0,  #### use positive coordinate
            stranded=False,
            NCORE=NCORE)
        if ylim is None:
            # Auto-scale the y-axis to the 99th percentile across tracks.
            ylim = pyutil.span(
                pyutil.np.hstack([x.values.flat for x in chipTracks]), 99)
            ylim = list(ylim)
            ylim[0] = 0.
        callback = lambda x: [prepare_chipTrack(ele, vlim=ylim) for ele in x]
        chipTracks = callback(chipTracks)

        if debug:
            stderrLine(chipTracks[0].columns)

        gtf = pyutil.readData(gtfFile, ext='tsv', header=None, guess_index=0)
        gtf = scount.countMatrix(gtf, look='gtf')
        gtfs = [gtf]

        bedDF = sutil.extract_peak(bedFile)

        worker = pyutil.functools.partial(
            worker__drawPeak,
            DIR=DIR,
            chipTracks=chipTracks,
            df_near=df_near,
            gtfs=gtfs,
            radius=radius,
            figsize=figsize,
            ylim=ylim,
            debug=debug,
        )

        ofnames = pyutil.mp_map(
            worker,
            bedDF.acc,
            n_cpu=NCORE,
        )
    elif backend == 'fluff':
        bedDF = sdio.extract_peak(bedFile)

        argDF = bedDF.copy()
        argDF = sdio.bed__addCol__interval(argDF)
        tracks = list(bwFiles)
        argDF['tracks'] = [tracks] * len(bedDF)
        argDF['annotation'] = annotation
        argDF['DIR'] = DIR
        argDF['ext'] = ext
        if trackNames is not None:
            argDF['labels'] = [list(trackNames)] * len(bedDF)

        ofnames = pyutil.mp_map(
            worker__fluff,
            (vars(x) for x in argDF.itertuples()),
            n_cpu=NCORE,
        )
    else:
        # BUGFIX: an unknown backend used to fall through and die with a
        # confusing NameError on bedDF/ofnames; fail fast instead.
        raise ValueError('Unknown backend: %r' % (backend, ))

    bedDF['img'] = ofnames
    indexFile = '%s/%s.index.tsv' % (DIR, bname)
    pyutil.to_tsv(bedDF, indexFile)
    indexFile = '%s/figureIndex.tsv' % (DIR)
    pyutil.to_tsv(bedDF, indexFile)

    # HTML gallery is best-effort: a missing dependency must not kill the run.
    try:
        import synotil.shot2html as shot2html
        htmlFile = shot2html.shot2html(indexFile, localPath=True)
    except Exception as e:
        stderrLine('[WARN]:cannot produce html :%s' % e)
        htmlFile = None

    print(indexFile)
    print(htmlFile)
    return (indexFile, htmlFile)