Пример #1
0
def html_scatter(fig, ax, h, x_col, y_col, title, labels='dflt', outpath='dflt'):
    """Scatter two columns of ``h`` on ``ax`` and save ``fig`` as an HTML page.

    A point-label tooltip is attached to the scatter via mpld3 and the
    figure is written to ``<outpath>/<title>.html``.

    Args:
        fig: figure object handed to mpld3 (plugin connection and saving).
        ax: axes object whose ``scatter`` method is used for drawing.
        h: table providing ``h[x_col]``, ``h[y_col]`` and ``h.label``.
        x_col: column of ``h`` used for the x coordinates.
        y_col: column of ``h`` used for the y coordinates.
        title: base name (without extension) of the output file.
        labels: tooltip labels; resolved against ``h.label`` through
            ``gt.check_dfltarg``.
        outpath: output folder; resolved against ``gt.dflt_outpath('foo')``
            through ``gt.check_dfltarg``.
    """
    outpath = gt.check_dfltarg(outpath, gt.dflt_outpath('foo'))
    labels = gt.check_dfltarg(labels, h.label)

    x_values = h[x_col].tolist()
    y_values = h[y_col].tolist()
    # NOTE(review): alpha=0.001 presumably keeps the markers all but invisible
    # so that only the tooltips are noticeable -- confirm this is intended.
    points = ax.scatter(x_values, y_values, alpha=0.001)

    label_tooltip = mpld3.plugins.PointLabelTooltip(points, labels=labels)
    mpld3.plugins.connect(fig, label_tooltip)

    html_file = os.path.join(outpath, title + '.html')
    mpld3.save_html(fig, html_file)
Пример #2
0
def run_methods(df, h, outpath='dflt', hdr='dflt', hue='name', mets='dflt', shape=None, labels='dflt', scaling=None):
    """Run a selection of alternate dimension reduction techniques and plot them.

    Every requested method embeds ``df`` into two dimensions. The resulting
    coordinates are written into ``h`` as ``<MET>_x`` / ``<MET>_y`` columns
    (``h`` is modified in place) and each embedding is passed to
    ``seaborn_scatter`` with the title ``"<hdr> <MET>"``.

    Args:
        df: data frame to embed. It is transposed first when its first
            column name contains ``':'``.
        h: header frame; must provide ``label`` and receives the new
            coordinate columns.
        outpath: output folder; the default is resolved through
            ``gt.dflt_outpath(fldr_name='dim_reduct')``.
        hdr: plot title prefix. The default is ``df.name`` or, when the
            frame has no ``name`` attribute, the part of the first column
            name before ``':'``.
        hue: forwarded to ``seaborn_scatter``.
        mets: iterable of method codes out of ``'PCA'``, ``'ISO'``,
            ``'MDS'`` and ``'LLE'``; the default runs all four.
        shape: forwarded to ``seaborn_scatter``.
        labels: point labels forwarded to ``seaborn_scatter``; the default
            is ``h.label``.
        scaling: optional pre-scaling, one of ``'maxabs'``, ``'robust'`` or
            ``'std'``. Any other non-None value prints a message and leaves
            the data unscaled.
    """
    method_results = dict()

    n_neighbors = 15

    # Only the lookup of the fallback title is guarded. Wrapping the whole
    # check_dfltarg call (as before) meant that a frame without ``name``
    # silently replaced an explicitly supplied ``hdr`` as well.
    try:
        default_hdr = df.name
    except AttributeError:
        default_hdr = df.columns[0].split(':')[0]
    hdr = gt.check_dfltarg(hdr, default_hdr)
    outpath = gt.check_dfltarg(outpath, gt.dflt_outpath(fldr_name='dim_reduct'))
    labels = gt.check_dfltarg(labels, h.label)
    mets = gt.check_dfltarg(mets, ['PCA', 'ISO', 'MDS', 'LLE'])

    if ':' in df.columns.values[0]:
        df = df.T

    if scaling is not None:
        # fit_transform returns a plain array, so ``df`` is no longer a
        # DataFrame past this point.
        if scaling == 'maxabs':
            df = MaxAbsScaler().fit_transform(df)
        elif scaling == 'robust':
            df = RobustScaler().fit_transform(df)
        elif scaling == 'std':
            df = StandardScaler(with_mean=False).fit_transform(df)
        else:
            print('error with scaling')

    if 'PCA' in mets:
        # Projection on to the first 2 principal components
        method_results['PCA'] = decomposition.PCA(n_components=2).fit_transform(df)

    if 'ISO' in mets:
        # Isomap projection; n_neighbors is passed by keyword because it is
        # keyword-only in current scikit-learn releases.
        method_results['ISO'] = manifold.Isomap(
            n_neighbors=n_neighbors, n_components=2).fit_transform(df)

    if 'MDS' in mets:
        # MDS embedding
        method_results['MDS'] = manifold.MDS(n_components=2, n_init=1, max_iter=100).fit_transform(df)

    if 'LLE' in mets:
        # Locally linear embedding (modified variant); keyword arguments for
        # the same scikit-learn compatibility reason as Isomap above.
        method_results['LLE'] = manifold.LocallyLinearEmbedding(
            n_neighbors=n_neighbors, n_components=2, method='modified').fit_transform(df)

    for met, dat in method_results.items():
        xcol, ycol = f'{met}_x', f'{met}_y'
        h[xcol] = dat[:, 0]
        h[ycol] = dat[:, 1]

        title = hdr + ' ' + met

        seaborn_scatter(h, title, outpath, hue=hue, x=xcol, y=ycol, labels=labels, shape=shape, legend='brief')
Пример #3
0
def enrich_matrix(df, h, labels='dflt', type='scaled'):
    """Calculate a pairwise enrichment matrix from a passed dataframe.

    For every column of ``df`` an up/down signature is derived with
    ``sigs.get_sig`` (z-score threshold 2, at most 50 features) and tested
    against the whole frame with ``sigs.bulk_test_enrich``. The ``type``
    entry of each result becomes one column of the returned matrix.

    Args:
        df: data frame whose columns are the samples to compare.
        h: header frame; must provide ``label``.
        labels: labels applied to both axes of the result. The default is
            ``h.label``. (Previously this argument was resolved but never
            used.)
        type: key selecting which entry of the ``bulk_test_enrich`` result
            is collected. The name shadows the builtin but is kept for
            caller compatibility.

    Returns:
        DataFrame with ``labels`` on both the columns and the index.
    """
    zsthresh = 2
    numthresh = 50
    labels = gt.check_dfltarg(labels, h.label)

    collected = []
    # zip() keeps the original iteration bound: the shorter of the sample
    # columns and the header labels.
    for sample, _ in zip(df.columns, h.label.values):
        # determine thresholds
        (up, dn) = sigs.get_sig(df[sample], zsthresh, numlim=numthresh)

        enres = sigs.bulk_test_enrich((up, dn), df, h)

        collected.append(enres[type])

    # Concatenate once instead of once per sample; growing the frame inside
    # the loop re-copied every earlier column on each iteration.
    en_mtrx = pd.concat(collected, axis=1) if collected else pd.DataFrame()

    axis_labels = pd.Index(labels)
    en_mtrx.columns = axis_labels
    en_mtrx.index = axis_labels
    return en_mtrx
Пример #4
0
def run_plate_analysis(mode='ind', cats='nd', path='dflt'):
    """Run the standard analysis on each plate individually or on all together.

    Files matching ``*ZSVCQNORM*`` under ``path`` are loaded with
    ``gct.extractgct``, labelled with ``gt.gen_label`` and handed to
    ``analyze_plate``. Most useful for plates with doses.

    Args:
        mode: ``'ind'`` analyses every file on its own; ``'comb'``
            concatenates all files and analyses them once. Any other value
            does nothing beyond printing the file list.
        cats: label categories passed to ``gt.gen_label`` and
            ``analyze_plate``.
        path: folder to search; the default is ``newQC`` inside the folder
            returned by ``gt.check_desktop()``.

    Exits the interpreter via ``sys.exit`` in ``'comb'`` mode when the
    plates cannot be concatenated (e.g. no matching files).
    """

    path = gt.check_dfltarg(path, os.path.join(gt.check_desktop(), 'newQC'))

    fl = gt.globit(path, '*ZSVCQNORM*')

    print(fl)

    if mode == 'comb':
        dl, hl = [], []
        for i, f in enumerate(fl):
            d, h = gct.extractgct(f)
            if i == 0:
                # Name the combined set after the first plate. Fall back to
                # the well-address prefix when the frame carries no usable
                # string name.
                try:
                    pname = d.name + '+'
                except (AttributeError, TypeError):
                    pname = h.addr[0].split(':')[0] + '+'
            if len(h.batch.unique()) > 1:
                # fix sample labels for plate/batch
                h.plate = h.plate + h.batch
            # define labels (should I add plate?)
            h = gt.gen_label(h, cats)
            dl.append(d)
            hl.append(h)
        # Only the concat is guarded: it raises ValueError for an empty list,
        # which is also the only case in which ``pname`` is unset.
        try:
            d = pd.concat(dl, axis=1)
        except ValueError:
            sys.exit('no gct file plates to analyze')
        d.name = pname
        h = pd.concat(hl, axis=0)

        analyze_plate(d, h, cats)

    elif mode == 'ind':
        for f in fl:
            d, h = gct.extractgct(f)
            # define labels (should I add plate?)
            h = gt.gen_label(h, cats)

            analyze_plate(d, h, cats)
Пример #5
0
def get_vehicle_matrix(path='dflt', batch='all', delim=':', getcells=False):
    """Load all files under ``path`` and collapse their vehicle wells to medians.

    Files matching ``*Qctrl*`` are used; when none are found the search
    falls back to ``*_QNORM_*``. For every file the plate name is cut to
    its first six characters, the vehicle samples are selected and their
    per-row median becomes one column of the result.

    Args:
        path: folder to search; the default is ``newQC`` inside the folder
            returned by ``gt.check_desktop()``.
        batch: ``'all'`` yields one median column per batch of each file,
            ``'A'`` only uses batch ``'A'``, and any other value collapses
            all vehicle samples of a file together regardless of batch.
        delim: delimiter forwarded to ``gt.gen_label`` and used when the
            predicted cell is appended to the label.
        getcells: when ``True``, ``predict_cells`` is run on every median
            column and its result is appended to the label.

    Returns:
        ``(vdf, vh)``: the median matrix with one column per collapsed
        group, and a header frame with one row per column.
    """
    path = gt.check_dfltarg(path, os.path.join(gt.check_desktop(), 'newQC'))

    flv = gt.globit(path, '*Qctrl*')
    if len(flv) == 0:
        flv = gt.globit(path, '*_QNORM_*')

    # should put in a check to extract from regular qnorms
    dlist, hlist = [], []

    def _collapse(data, header, label_cats):
        # Reduce one sample group to its median profile and keep the first
        # (labelled) header row as that group's representative.
        med = data.median(axis=1)
        labelled = gt.gen_label(header, label_cats, delim=delim)
        dlist.append(med)
        hlist.append(labelled.iloc[0])

    for f in flv:
        d, h = gct.extractgct(f)
        h['plate'] = h['plate'].apply(lambda x: x[:6])
        d, h = gt.dsub(d, h, {'type': 'vehicle'})
        if batch == 'all':
            for b in h.batch.unique():
                ds, hs = gt.dsub(d, h, {'batch': b})
                _collapse(ds, hs, 'pb')
        elif batch == 'A':
            ds, hs = gt.dsub(d, h, {'batch': 'A'})
            _collapse(ds, hs, 'pb')
        else:
            # Label the full vehicle header. The previous code passed ``hs``
            # here, which is never assigned on this path and therefore
            # raised UnboundLocalError on the first file.
            _collapse(d, h, 'p')

    vdf = pd.concat(dlist, axis=1)
    vh = pd.DataFrame(hlist)
    # Columns must carry the labels before prediction, because the lookup
    # below addresses each column by its label.
    vdf.columns = vh.label
    if getcells is True:
        vh['cell2'] = vh.label.apply(lambda x: predict_cells(vdf[x]))
        vh['label'] = vh.label + delim + vh.cell2
        vdf.columns = vh.label
    return vdf, vh
Пример #6
0
def find_plot_genes(df, thresh='dflt', lim=75):
    """List the rows of ``df`` that hold values beyond a z-score threshold.

    Values whose absolute value does not exceed ``thresh`` are masked out,
    rows left without any value are dropped, and the remaining rows are
    ranked by their most extreme (largest absolute) value. A short summary
    of the ranking is printed.

    Args:
        df: numeric data frame. Its ``name`` attribute is printed when
            present.
        thresh: absolute cutoff. The default is the largest per-column
            0.975 quantile of ``df``; a hard value can be passed instead.
        lim: maximum number of row labels to return.

    Returns:
        Array of row labels ordered from most to least extreme, at most
        ``lim`` entries long.
    """
    # Frames are not guaranteed to carry a ``name``; skip the banner instead
    # of raising AttributeError.
    if hasattr(df, 'name'):
        print(df.name)
    # auto determine threshold, currently 97.5
    thresh = gt.check_dfltarg(thresh, df.quantile(0.975).max())
    # apply filter
    subset = df[abs(df) > thresh].dropna(axis=0, how='all')

    # Signed value of largest magnitude per row. Ties go to the row minimum,
    # matching ``max(x.min(), x.max(), key=abs)``.
    row_min = subset.min(axis=1)
    row_max = subset.max(axis=1)
    extreme = row_min.where(row_min.abs() >= row_max.abs(), row_max).rename('max')

    ranking = extreme.abs().sort_values(ascending=False).index
    extreme = extreme.reindex(ranking)

    # give brief view of top and bottom vals
    print(extreme.head(n=6))
    print(extreme.tail(n=6))
    print(len(extreme.index.values))
    # Slicing already copes with fewer than ``lim`` rows.
    return extreme.index.values[:lim]