예제 #1
0
def routine_combineCSV(fnames, CUTOFF=1, idCol='Gene ID'):
    """Load several CSV tables, pad them to a shared reference and subset them.

    The tables are read with ``combine_csv`` and each one is padded against
    the returned gene reference with ``padWithRef``.  The padded tables are
    checked to share one shape and one identifier column, then restricted to
    the rows listed in ``geneValid.index``.

    Parameters
    ----------
    fnames :
        Input files, forwarded unchanged to ``combine_csv``.
    CUTOFF :
        Forwarded unchanged to ``combine_csv``.
    idCol : str
        Name of the identifier column.  It is forwarded to ``combine_csv``
        and is also the column compared across the padded tables.
        NOTE(review): assumes ``padWithRef`` keeps this column under the
        same name -- confirm against its implementation.

    Returns
    -------
    tuple
        ``(dfs, (geneRef, geneValid))`` where ``dfs`` is the list of padded,
        row-subset tables and ``geneValid`` has had its index reset and the
        resulting ``'index'`` column dropped.

    Raises
    ------
    AssertionError
        If the padded tables differ in shape or in their identifier column.
    """
    # Single-argument print(...) behaves the same as the print statement on
    # Python 2 and is also valid Python 3.
    print('[PROG] Starting to readfile')
    dfs, (geneRef, geneValid) = combine_csv(fnames, CUTOFF=CUTOFF, idCol=idCol)
    print('[PROG] Finished to readfile')

    print('[PROG] Starting to pad')
    f = pyutil.functools.partial(padWithRef, ref=geneRef)
    lst = pyutil.mp_map(f, dfs, n_cpu=1)

    # Every padded table must have exactly the same shape ...
    SHP = np.array([df.shape for df in lst])
    assert np.all(SHP == SHP[0:1]), 'Arrays not sharing shape:%s' % (SHP,)
    # ... and the same identifiers in the same order.  Use the caller's
    # ``idCol`` rather than a hard-coded column name.
    gids = np.array([df[idCol] for df in lst])
    assert np.all(gids == gids[0:1]), \
        'Arrays not sharing the same %r column' % (idCol,)
    print('[PROG] Finished padding')

    # Positional row selection using the index labels of ``geneValid``.
    dfs = [df.iloc[geneValid.index] for df in lst]
    # ``axis=1`` is passed by keyword: the positional form is rejected by
    # recent pandas releases.
    return dfs, (geneRef, geneValid.reset_index().drop('index', axis=1))
예제 #2
0
파일: jobs.py 프로젝트: shouldsee/synotil
def job__md5sum(dfc, column='fname', n_cpu=1, **kwargs):
    """Apply ``worker__md5sum`` to every row of ``dfc``.

    Each row produced by ``dfc.itertuples()`` is converted to a mapping of
    field name to value and handed to ``worker__md5sum`` through
    ``pyutil.mp_map``.

    Parameters
    ----------
    dfc :
        Object exposing ``itertuples()`` (presumably a pandas DataFrame).
    column : str
        Forwarded to ``worker__md5sum`` as its ``column`` keyword.
    n_cpu : int
        Forwarded to ``pyutil.mp_map``.
    **kwargs :
        Extra keyword arguments forwarded to ``pyutil.mp_map``.

    Returns
    -------
    Whatever ``pyutil.mp_map`` returns for the per-row results.
    """
    worker = pyutil.functools.partial(worker__md5sum, column=column)
    # ``_asdict()`` is the documented namedtuple API.  The previous
    # ``x.__dict__`` only worked on interpreters where namedtuple exposed
    # ``__dict__`` as a property around ``_asdict`` and raises
    # AttributeError on modern Python 3.
    rows = (row._asdict() for row in dfc.itertuples())
    return pyutil.mp_map(worker, rows, n_cpu=n_cpu, **kwargs)
예제 #3
0
파일: dio.py 프로젝트: shouldsee/synotil
def extract_bigwig_multiple(bedFile=None,
                            peakFile=None,
                            bwFiles=None,
                            fnames=None,
                            radius=None,
                            callback=lambda x: pd.concat(x, axis=1),
                            NCORE=1,
                            stepSize=20,
                            outIndex=pyutil.basename,
                            center_summit=0,
                            **kwargs):
    '''Run extract_bigwig() once per bigwig file and combine the results.

    Parameters
    ----------
    bedFile, peakFile :
        Interval file.  ``peakFile`` is the legacy spelling; exactly one of
        the two must be given.
    bwFiles, fnames :
        Iterable of bigwig files.  ``fnames`` is the legacy spelling and is
        only consulted when ``bwFiles`` is None.
    radius :
        When not None, ``bedFile`` is first passed through
        ``npk_expandSummit(fname=..., radius=..., center_summit=...)`` and
        the returned file is used instead.
    callback :
        Callable applied to the list of per-file results before returning
        (default: column-wise ``pd.concat``).  Pass None to get the raw list.
    NCORE :
        Forwarded to ``pyutil.mp_map`` as ``n_cpu``.
    stepSize :
        Forwarded to ``extract_bigwig``.
    outIndex :
        How to label each result: a callable mapped over the bigwig file
        names, an indexable of labels aligned with ``bwFiles``, or None to
        leave the columns untouched.
    center_summit :
        Forwarded to ``npk_expandSummit``.
    **kwargs :
        Extra keyword arguments forwarded to ``extract_bigwig``.

    Returns
    -------
    ``callback(results)`` when ``callback`` is not None, otherwise the list
    of per-file results.

    Raises
    ------
    AssertionError
        If the bedFile/peakFile or bwFiles/fnames arguments are inconsistent.
    '''
    #########################
    #### Legacy support #####
    if bedFile is None:
        assert peakFile is not None, 'one of bedFile/peakFile must be given'
        bedFile = peakFile
    else:
        assert peakFile is None, 'bedFile and peakFile are mutually exclusive'

    if bwFiles is None:
        assert fnames is not None, 'one of bwFiles/fnames must be given'
        bwFiles = fnames
    #### Legacy support ####
    #########################

    # ``bwFiles`` is walked twice (once by mp_map, once by the labelling
    # loop); materialise it so a one-shot iterator is not silently
    # exhausted after the first pass.
    bwFiles = list(bwFiles)

    if radius is not None:
        bedFile = npk_expandSummit(fname=bedFile,
                                   radius=radius,
                                   center_summit=center_summit)
    # Same output as the old ``print '[L]', n`` statement, but also valid
    # on Python 3.
    print('[L] %s' % (pyutil.lineCount(bedFile),))

    #### Compute Matrix
    worker = pyutil.functools.partial(extract_bigwig,
                                      stepSize=stepSize,
                                      bedFile=bedFile,
                                      **kwargs)
    bws = pyutil.mp_map(worker, bwFiles, n_cpu=NCORE)

    labelled = []
    for i, (bwFile, out) in enumerate(zip(bwFiles, bws)):
        if outIndex is None:
            replaceCol = None
        elif callable(outIndex):
            replaceCol = outIndex(bwFile)
        else:
            replaceCol = outIndex[i]

        if replaceCol is not None:
            #### [TEMP] replace the outer column level with ``replaceCol``.
            # Work on the transpose so that the label can be attached as an
            # extra index level, moved to the front, and transposed back.
            tmp = out.T
            tmp['ind'] = replaceCol
            tmp.set_index('ind', append=1, inplace=True)
            tmp = tmp.reorder_levels(['ind', None])
            tmp.index.names = ['bwFile', 'pos']
            out = tmp.T
        out.index.name = 'acc'
        labelled.append(out)

    dfc = labelled
    if callback is not None:
        dfc = callback(dfc)
    return dfc
예제 #4
0
def qc_silhouette(
        D,
        method='complete',
        nClu=40,
        NCORE=10,
        silent=1,
        axs=None):
    """Evaluate hierarchical-clustering cuts with ``worker_silhouette``.

    A linkage is built from the distance matrix and ``worker_cut_tree`` is
    run for every value in ``range(0, nClu)``.  Each resulting clustering is
    then handed to ``worker_silhouette`` together with the square distance
    matrix.  Results that are None are discarded.

    Parameters
    ----------
    D : ndarray
        Distance matrix, either square (2-D) or condensed (1-D); the other
        form is derived with ``squareform``.
    method : str
        Linkage method passed to ``sphier.linkage``.
    nClu : int
        Exclusive upper bound of the values passed to ``worker_cut_tree``.
    NCORE : int
        ``n_cpu`` used for the tree-cutting step (the silhouette step runs
        with ``n_cpu=1``).
    silent :
        When falsy, the result is plotted together with a 3-point moving
        average.
    axs :
        Optional sequence of matplotlib axes; only ``axs[0]`` is used.  A
        new figure is created when it is None and plotting is enabled.

    Returns
    -------
    ndarray
        Array built from the non-None ``worker_silhouette`` results.  The
        plotting code unpacks its transpose into two rows ``X, Y``
        (presumably cluster count and silhouette score -- confirm against
        ``worker_silhouette``).
    """
    # Keep both representations: condensed for linkage(), square for the
    # silhouette worker.
    if D.ndim == 2:
        Ds = D
        D = spdist.squareform(D, checks=0)
    else:
        Ds = spdist.squareform(D, checks=0)

    Z = sphier.linkage(
        D,
        method=method,
    )

    nClus = range(0, nClu)
    worker = pyutil.functools.partial(worker_cut_tree, Z=Z)
    clus = pyutil.mp_map(worker, nClus, n_cpu=NCORE)

    worker = pyutil.functools.partial(worker_silhouette, Ds=Ds)
    lst = pyutil.mp_map(worker, clus, n_cpu=1)

    shind = np.array([x for x in lst if x is not None])

    if not silent:
        if axs is None:
            fig, axs = plt.subplots(1, 1, figsize=[6, 4])
            axs = [axs]
        ax = axs[0]
        X, Y = shind.T
        ax.plot(X, Y)
        ax.set_xlim(left=0)

        wid = 3
        rid = wid // 2
        # Materialise the moving average: on Python 3 ``map`` is a lazy
        # iterator, which matplotlib cannot plot.
        movavg = list(
            map(
                np.mean,
                pyutil.window(
                    Y,
                    n=wid,
                    step=1,
                ),
            ))
        # Trim ``rid`` points from each end so X lines up with the centred
        # window averages.
        ax.plot(X[rid:-rid], movavg, 'x--', label='Moving average')
        ax.grid(1)
        ax.legend()

    return shind
예제 #5
0
def main(
        #### necessary
        bedFile=None,
        bwFiles=None,
        ####
        DIR=None,
        figsize=[14, 14],
        debug=0,
        ylim=[0, 10],
        radius=2000,
        stepSize=10,
        NCORE=4,
        silent=0,
        gtfFile=None,
        cdsFile=None,
        annotation=None,
        GSIZE=None,
        center_summit=0,
        trackNames=None,
        backend='fluff',
        ext='png',
        **kwargs):
    '''Draw one figure per interval of ``bedFile`` and write an index table.

    Parameters
    ----------
    bedFile :
        Interval file; one image is produced per record.
    bwFiles :
        Iterable of bigwig track files.
    DIR :
        Output directory.  None uses ``PROG=chipShots_bedFile=<basename>``
        relative to the working directory; ``'inplace'`` puts that same
        directory name next to ``bedFile``.
    figsize :
        Figure size; values are coerced to int ('synotil' backend only).
    debug :
        Forwarded to the drawing worker; also dumps the first track's
        columns to stderr ('synotil' backend only).
    ylim :
        Value range for the tracks; None derives it from the data with
        ``pyutil.span(..., 99)`` and forces the lower bound to 0
        ('synotil' backend only).
    radius, stepSize, center_summit, GSIZE :
        Forwarded to the extraction helpers of the 'synotil' backend.
    NCORE :
        Number of worker processes.
    silent :
        Forwarded to ``pyutil.shellexec``.
    gtfFile, cdsFile :
        Annotation files; required by the 'synotil' backend.  ``cdsFile``
        defaults to ``gtfFile + '.cds'``.
    annotation, ext :
        Passed to ``worker__fluff`` ('fluff' backend only).
    trackNames :
        Optional labels, one per entry of ``bwFiles``.
    backend :
        ``'fluff'`` or ``'synotil'``.
    **kwargs :
        Accepted for call compatibility; not used.

    Returns
    -------
    tuple
        ``(indexFile, htmlFile)``: the path of ``figureIndex.tsv`` inside the
        output directory and the HTML file produced by ``shot2html``
        (None when the HTML step fails).

    Raises
    ------
    ValueError
        For an unknown ``backend`` or when the 'synotil' backend is selected
        without ``gtfFile``.
    '''
    # Fail fast instead of falling through to a NameError on ``ofnames``.
    if backend not in ('synotil', 'fluff'):
        raise ValueError('Unknown backend: %r' % (backend,))
    if backend == 'synotil' and gtfFile is None:
        raise ValueError("backend='synotil' requires gtfFile")

    # A real list rather than ``map``: it is pickled to worker processes and
    # ``map`` is a lazy iterator on Python 3.
    figsize = [int(v) for v in figsize]

    prefix = 'PROG=chipShots_bedFile='
    bname = pyutil.os.path.basename(bedFile)
    odname = prefix + bname
    if DIR == 'inplace':
        # join() supplies the path separator that plain string concatenation
        # left out (it used to yield e.g. '/data/dirPROG=...').
        DIR = pyutil.os.path.join(pyutil.os.path.dirname(bedFile), odname)
    elif DIR is None:
        DIR = odname
    # NOTE(review): the path is interpolated into a shell command unquoted;
    # paths containing spaces or shell metacharacters will misbehave.
    pyutil.shellexec('mkdir -p %s' % DIR, silent=silent)
    DIR = pyutil.os.path.abspath(DIR)

    # Only derive the CDS path when a GTF is available, so the default
    # 'fluff' backend does not crash on ``None + '.cds'``.
    if cdsFile is None and gtfFile is not None:
        cdsFile = gtfFile + '.cds'

    if backend == 'synotil':
        nearFile = synotil.filterByCDS.main(
            peakFile=bedFile,
            cdsFile=cdsFile,
            downStream=radius,
            upStream=radius,
            peakRadius=1,
            GSIZE=GSIZE,
            center_summit=center_summit,
        )
        df_near = pyutil.readData(nearFile, )

        stderrLine('[MSG]Loading bed intervals from bigwig tracks....')

        chipTracks = sutil.extract_bigwig_multiple(
            fnames=bwFiles,
            bedFile=bedFile,
            radius=radius,
            stepSize=stepSize,
            callback=None,
            outIndex=trackNames,
            center_summit=center_summit,
            shift=0,  #### use positive coordinate
            stranded=False,
            NCORE=NCORE)
        if ylim is None:
            ylim = pyutil.span(
                pyutil.np.hstack([x.values.flat for x in chipTracks]), 99)
            ylim = list(ylim)
            ylim[0] = 0.
        chipTracks = [prepare_chipTrack(ele, vlim=ylim) for ele in chipTracks]

        if debug:
            stderrLine(chipTracks[0].columns)

        gtf = pyutil.readData(gtfFile, ext='tsv', header=None, guess_index=0)
        gtf = scount.countMatrix(gtf, look='gtf')
        gtfs = [gtf]

        bedDF = sutil.extract_peak(bedFile)

        worker = pyutil.functools.partial(
            worker__drawPeak,
            DIR=DIR,
            chipTracks=chipTracks,
            df_near=df_near,
            gtfs=gtfs,
            radius=radius,
            figsize=figsize,
            ylim=ylim,
            debug=debug,
        )

        ofnames = pyutil.mp_map(
            worker,
            bedDF.acc,
            n_cpu=NCORE,
        )
    else:  # backend == 'fluff'
        bedDF = sdio.extract_peak(bedFile)

        # One argument row per interval; constant settings are broadcast.
        argDF = bedDF.copy()
        argDF = sdio.bed__addCol__interval(argDF)
        tracks = list(bwFiles)
        argDF['tracks'] = [tracks] * len(bedDF)
        argDF['annotation'] = annotation
        argDF['DIR'] = DIR
        argDF['ext'] = ext
        if trackNames is not None:
            argDF['labels'] = [list(trackNames)] * len(bedDF)

        ofnames = pyutil.mp_map(
            worker__fluff,
            # ``_asdict()`` replaces ``vars(x)``, which raises TypeError for
            # namedtuples on modern Python 3.
            (row._asdict() for row in argDF.itertuples()),
            n_cpu=NCORE,
        )

    bedDF['img'] = ofnames
    # The index is written twice: once under a name derived from the bed
    # file and once under the fixed name that is handed to shot2html.
    indexFile = '%s/%s.index.tsv' % (DIR, bname)
    pyutil.to_tsv(bedDF, indexFile)
    indexFile = '%s/figureIndex.tsv' % (DIR)
    pyutil.to_tsv(bedDF, indexFile)

    # HTML generation is best-effort: any failure is reported as a warning
    # and the index file is still returned.
    try:
        import synotil.shot2html as shot2html
        htmlFile = shot2html.shot2html(indexFile, localPath=True)
    except Exception as e:
        stderrLine('[WARN]:cannot produce html :%s' % e)
        htmlFile = None

    print(indexFile)
    print(htmlFile)
    return (indexFile, htmlFile)