Пример #1
0
def plot_gene(sample_set,
              h=None,
              name='dflt',
              outpath='dflt',
              close=True,
              width=8):
    """Scatter-plot one gene's values, optionally colour-coded by well type.

    Every value is drawn in grey. When a header ``h`` is supplied, wells whose
    ``type`` is 'vehicle' are over-plotted in blue and 'poscon' wells in red.

    Args:
        sample_set: values to plot; must expose ``.values`` and, when ``name``
            is left at its default, ``.name``.
        h: optional header frame aligned row-for-row with ``sample_set``.
            It is not modified.
        name: plot title and output file stem; 'dflt' uses ``sample_set.name``.
        outpath: output directory; 'dflt' uses ``gt.dflt_outpath``.
        close: when True the figure is saved as ``<name>.png`` and closed.
        width: forwarded to ``format_concentration_plot``.

    Returns:
        The axes object when ``close`` is not True, otherwise None.
    """
    if name == 'dflt':
        name = sample_set.name
    if outpath == 'dflt':
        outpath = gt.dflt_outpath(fldr_name='dflt')
    xrange = len(list(sample_set))
    dtype = check_plottype(sample_set)
    ax = format_concentration_plot(xrange, ptype=dtype, width=width)
    ax.scatter(range(xrange), sample_set.values, color='grey')
    ax.set_title(name)
    if h is not None:
        # Work on a copy so the caller's header does not gain an 'order' column.
        h = h.copy()
        # 0-based positions, matching range(xrange) used for the grey points,
        # so each coloured marker sits exactly on top of its own grey marker.
        h['order'] = np.arange(len(h))
        dv, hv = gt.dsub(sample_set, h, {'type': 'vehicle'})
        ax.scatter(hv.order, dv.values, color='blue')
        dp, hp = gt.dsub(sample_set, h, {'type': 'poscon'})
        ax.scatter(hp.order, dp.values, color='red')
    if close is True:
        plt.savefig(os.path.join(outpath, name + '.png'))
        plt.close()
    else:
        return ax
Пример #2
0
def plot_skylines(df, h, argdict, title='dflt'):
    """Plot one skyline per condition group selected by ``argdict``.

    Keys whose value is literally True are not used as filters; they only
    contribute their first letter to the breakdown categories, so every value
    of that field gets its own plot. For example
    ``{'name': myname, 'dose': True}`` plots every dose of ``myname``.

    The title of each plot is ``title`` (``df.name`` when left at 'dflt')
    followed by ' - ' and the group label returned by ``breakdown``.
    """
    # Category code string: first character of every key, in key order.
    cats = ''.join(key[0] for key in argdict.keys())
    print(cats)

    # Only concrete values act as subsetting criteria.
    filters = {key: val for key, val in argdict.items() if val is not True}
    sub_data, sub_header = gt.dsub(df, h, filters)

    groups = breakdown(sub_data, sub_header, cats, dic=True)

    if isinstance(title, str) and title == 'dflt':
        title = df.name

    for label, frame in groups.items():
        if frame is True:
            continue
        skyline.new_skyline(frame, title=title + ' - ' + label)
Пример #3
0
def predict_cells(input, save=False):
    """Predict the cell line of vehicle wells with the pickled SVM classifier.

    Args:
        input: one of
            * a directory path - every '*_Qctrl_n*' file in it is used
              (falling back to '*QNORM*' when none match);
            * a single file path;
            * a ``pd.Series`` - predicted directly, returning one label;
            * a list of file paths, or a ``(data, header)`` pair.
        save: when True the result table is written, tab-separated, to
            ``gt.dflt_outpath(fn='cell_predictions.csv')``.

    Returns:
        For a Series: the predicted label, or None if prediction failed.
        Otherwise a DataFrame indexed by '<short name>-<batch>' with a 'cell'
        column, or None when a file contains no vehicle wells.
    """
    # NOTE(security): pickle.load can execute arbitrary code from the file;
    # only ever point this at a trusted, locally produced classifier.
    with open('/Users/WRB/Dropbox/bin/python/celllineclassifier.p',
              'rb') as file:
        clf = pickle.load(file)

    if isinstance(input, pd.Series):
        try:
            return clf.predict([input])[0]
        except Exception:
            print('error with series prediction')
            return None

    if isinstance(input, str):
        if os.path.isdir(input):
            vlist = gt.globit(input, '*_Qctrl_n*')
            if len(vlist) == 0:
                vlist = gt.globit(input, '*QNORM*')
        else:
            vlist = [input]
    else:
        vlist = input

    res_table = pd.DataFrame()
    for f in vlist:
        from_pair = False
        try:
            d, h = gct.extractgct(f)
        except Exception:
            # A real path that fails to load is a genuine error.
            if isinstance(f, str):
                raise
            # Otherwise the caller handed in an already-loaded (data, header)
            # pair. The original assigned these the wrong way round
            # (vlist[0] = d), which could never work.
            d, h = vlist[0], vlist[1]
            from_pair = True
        ds, hs = gt.dsub(d, h, {'type': 'vehicle'})
        if len(ds) == 0:
            print('error, maybe using ZS file? use QNORM instead')
            return None
        for b in hs.batch.unique():
            dsb, hsb = gt.dsub(ds, hs, {'batch': b})
            # Consensus profile: per-row median across the batch's wells.
            med = dsb.median(axis=1).values
            shn = gt.get_shn(f) + '-' + b
            res = clf.predict([med])[0]
            res_table.loc[shn, 'cell'] = res
            print(f'{shn} - {res}')
        if from_pair:
            # The pair is a single dataset; iterating its second element
            # would only repeat the same work.
            break
    if save is True:
        res_table.to_csv(gt.dflt_outpath(fn='cell_predictions.csv'), sep='\t')
    return res_table
Пример #4
0
def get_vehicle_matrix(path='dflt', batch='all', delim=':', getcells=False):
    """Collapse the vehicle wells of every file under ``path`` into a matrix.

    Files matching '*Qctrl*' are loaded (falling back to '*_QNORM_*'), their
    vehicle wells are reduced to a per-row median, and the medians are joined
    column-wise.

    Args:
        path: directory to scan; 'dflt' resolves to '<desktop>/newQC' via
            ``gt.check_dfltarg``.
        batch: 'all' gives one column per batch in each file, 'A' uses only
            batch 'A', anything else gives one column per file across all
            batches.
        delim: delimiter forwarded to ``gt.gen_label`` and used when the
            predicted cell is appended to the label.
        getcells: when True, re-predict the cell line of each column with
            ``predict_cells`` and append it to the column label.

    Returns:
        ``(vdf, vh)``: the median matrix and a header frame with one row per
        column; ``vdf.columns`` equals ``vh.label``.

    Raises:
        ValueError: if no vehicle data could be collected.
    """
    path = gt.check_dfltarg(path, os.path.join(gt.check_desktop(), 'newQC'))

    flv = gt.globit(path, '*Qctrl*')
    if len(flv) == 0:
        flv = gt.globit(path, '*_QNORM_*')

    # should put in a check to extract from regular qnorms
    dlist, hlist = [], []
    for f in flv:
        d, h = gct.extractgct(f)
        h['plate'] = h['plate'].apply(lambda x: x[:6])
        d, h = gt.dsub(d, h, {'type': 'vehicle'})
        # Decide which (data, header) subsets to collapse and how to label
        # them; generators keep the subset/collapse steps interleaved.
        if batch == 'all':
            subsets = (gt.dsub(d, h, {'batch': b}) for b in h.batch.unique())
            label_cats = 'pb'
        elif batch == 'A':
            subsets = (gt.dsub(d, h, {'batch': 'A'}) for _ in range(1))
            label_cats = 'pb'
        else:
            # Whole file as one group. The original labelled an undefined or
            # stale ``hs`` here instead of this file's header.
            subsets = ((d, h) for _ in range(1))
            label_cats = 'p'
        for ds, hs in subsets:
            med = ds.median(axis=1)
            hs = gt.gen_label(hs, label_cats, delim=delim)
            dlist.append(med)
            hlist.append(hs.iloc[0])

    if not dlist:
        raise ValueError(f'no vehicle data found under {path}')

    vdf = pd.concat(dlist, axis=1)
    vh = pd.DataFrame(hlist)
    # Columns must carry the labels before predict_cells looks them up.
    vdf.columns = vh.label
    if getcells is True:
        vh['cell2'] = vh.label.apply(lambda x: predict_cells(vdf[x]))
        vh['label'] = vh.label + delim + vh.cell2
        vdf.columns = vh.label
    return vdf, vh
Пример #5
0
Файл: qc.py Проект: wrbutton/foo
def assemble_ref_dat(path):
    """Gather the reference RNA wells from every '*_ref_n*' file under ``path``.

    Each file is loaded, restricted to wells A02 and B02, rounded to two
    decimals, and the pieces are concatenated column-wise into one frame.
    """
    frames = []
    for ref_file in gt.globit(path, '*_ref_n*'):
        data, header = gct.extractgct(ref_file)
        # Only the data half of the subset is kept; the header is not returned.
        data, _ = gt.dsub(data, header, {'well': ['A02', 'B02']})
        frames.append(round(data, 2))
    return pd.concat(frames, axis=1)
Пример #6
0
def plot_concentrations(df,
                        h,
                        genes='test2',
                        label=False,
                        mode='ind',
                        incr='dflt',
                        outpath='dflt',
                        fn='dflt',
                        maxx='dflt',
                        test=False):
    """Save one concentration scatter plot per gene from a data frame/header.

    Only wells with ``type == 'test'`` are plotted. Within each gene's plot
    every perturbagen name gets its own colour; its doses are laid out left to
    right in ascending numeric order with growing marker size.

    Assumes the data is broken down by name and dose and covers one batch.

    Args:
        df: data frame indexed by gene, one column per well.
        h: header frame indexed by well with at least 'type', 'name', 'dose'.
        genes: gene selection resolved by ``gt.get_genes`` (default 'test2').
        label: when True, write each name under its cohort.
        mode: 'ind' plots replicates individually at the same x; 'avg' / 'med'
            collapse replicates to their mean / median.
        incr: x spacing between doses; 'dflt' means 10.
        outpath: output directory (created if missing); 'dflt' means
            '<desktop>/output_figs'.
        fn: title prefix; 'dflt' uses ``df.name`` or, failing that, the part
            of the first header index before ':'.
        maxx: x-axis extent; 'dflt' derives it from the name and dose counts.
        test: when True, stop after the first gene.
    """
    # parameters controlling the optional labels below each cohort
    txt_args = {'fontsize': 8, 'rotation': 90, 'fontweight': 'bold'}
    genes = gt.get_genes(genes, df=df)
    # define outpath directory, create if necessary
    # (``==`` rather than ``is``: identity of string literals is not reliable)
    if outpath == 'dflt':
        outpath = os.path.join(gt.check_desktop(), 'output_figs')
    os.makedirs(outpath, exist_ok=True)
    # define title
    if fn != 'dflt':
        name = fn
    else:
        try:
            name = df.name
        except AttributeError:
            name = h.index[0].split(':')[0]
    # set the color palette and spacing/sizing levels (figsize tuned to these)
    cmap = plt.get_cmap('tab10')
    if incr == 'dflt':
        incr = 10
    sincr = 20

    # restrict to test wells
    d, h = gt.dsub(df, h, {'type': 'test'})
    df = d

    # create pert list for plot
    pert_list = []
    print(h['name'].unique())
    for n in h['name'].unique():
        pert_list.append('{}'.format(n))

    # widest cohort (most distinct doses) and number of cohorts set the width
    ndoses = h.groupby('name')['dose'].nunique().max()
    nnames = h.name.nunique()
    print(name)
    print('num doses ', ndoses)
    print('name list ', len(pert_list))

    if isinstance(genes, str):
        genes = [genes]

    if maxx == 'dflt':
        # one slot per dose of every name, plus a two-slot gap after each name
        maxx = (incr * nnames * ndoses) + (incr * 2 * nnames)

    for g in genes:
        # set initial color counter and x starting position
        ci = 0
        x_pos = 15
        # select vector for current gene
        dat = df.loc[g]
        # symmetric y extent with a little headroom
        maxv = round(max(abs(dat))) + 3
        ax = format_concentration_plot(maxx, maxy=maxv)
        ax.set_ylabel(g)
        mytitle = name + ' - ' + g
        print(mytitle)
        ax.set_title(mytitle)
        x_init = 0
        names = h['name'].apply(lambda x: str(x)).unique()
        for n in names:
            # step through the colormap in tenths, wrapping after the last
            color = cmap(ci)
            ci += .1
            if ci > .9:
                ci = 0
            sub = h[h['name'] == n]
            doses = sorted(sub['dose'].unique(), key=lambda x: float(x))
            sizes = [(x + 1) * sincr for x in range(len(doses))]
            for dose, s in zip(doses, sizes):
                args = {'name': n, 'dose': dose}
                wids = gt.hsub(h, args).index.values
                y_vals = dat[wids].values
                if mode == 'avg':
                    y_vals = np.mean(y_vals)
                if mode == 'med':
                    y_vals = np.median(y_vals)
                try:
                    x_vals = [x_pos] * len(y_vals)
                except TypeError:
                    # collapsed to a scalar by 'avg' / 'med'
                    x_vals = x_pos
                # plot the current vals with specified color and size
                ax.scatter(x_vals, y_vals, c=color, s=s)
                x_pos += incr
            if label is True:
                # centre the name under the cohort just drawn
                x_label = (x_init + x_pos) / 2
                ax.text(x_label, -(maxv + 1), n, color=color, **txt_args)
            # put spacing between perts
            x_pos += (incr * 2)
            x_init = x_pos
        plt.savefig(os.path.join(outpath, mytitle + '.png'),
                    bbox_inches='tight')
        plt.close()
        if test is True:
            print('test mode, exiting after one image')
            break
Пример #7
0
def plot_landmark_concs(df,
                        h,
                        maxy=12,
                        cats='n',
                        labels='dflt',
                        genes='test100',
                        outpath='dflt',
                        title='dflt',
                        dosenum='dflt',
                        test=False):
    """Plot per-gene lines across concentrations, one figure per group.

    The data is restricted to ``type == 'test'`` wells and split with
    ``pa.breakdown(df, h, cats, dic=False)``; each group is sorted by dose and
    one thin line per gene is drawn across its wells. Intended for a consensus
    (collapsed) ZS file; a warning is printed if the first name/dose
    combination has more than one well.

    Args:
        df: data frame indexed by gene, one column per well.
        h: header frame indexed by well.
        maxy: y extent forwarded to ``format_concentration_plot``.
        cats: breakdown category codes (see ``breakdown``).
        labels: x tick labels - 'dflt' for 1..n, 'wells' for the well ids,
            'dose' for the unique doses, or an explicit label sequence.
        genes: gene selection resolved by ``gt.get_genes``.
        outpath: output directory; 'dflt' uses ``gt.dflt_outpath()``.
        title: title prefix; 'dflt' uses ``df.name`` or, failing that, the
            group's first plate value.
        dosenum: number of x ticks; 'dflt' uses the number of unique doses.
        test: when True, stop after the first group.
    """
    if outpath == 'dflt':
        outpath = gt.dflt_outpath()
    df, h = gt.dsub(df, h, {'type': 'test'})
    names = h.name.dropna().unique()
    doses = gt.hsub(h, {'name': names[0]})['dose'].dropna().unique()
    if len(gt.hsub(h, {'name': names[0], 'dose': doses[0]})) > 1:
        print('dataframe not collapsed to consensus, bogus lm concs')
        # Show the same subset that was just tested (the original indexed
        # names[1], which raises IndexError when there is only one name).
        print(gt.hsub(h, {'name': names[0], 'dose': doses[0]}).head())
    for ds, hs in pa.breakdown(df, h, cats, dic=False):
        hs = hs.sort_values('dose', ascending=True)
        ds = ds[hs.index]
        xrange = len(hs.dose.unique()) - 2
        ax = format_concentration_plot(xrange, maxy=maxy, width=4)
        # Real booleans: the strings 'on'/'off' are both truthy.
        ax.tick_params(axis='x', bottom=True, top=False, labelbottom=True)
        if dosenum == 'dflt':
            dose_range = range(len(hs.dose.unique()))
        else:
            dose_range = range(dosenum)
        ax.set_xticks(dose_range)
        if labels == 'dflt':
            ax.set_xticklabels([str(x + 1) for x in dose_range])
        elif labels == 'wells':
            # temporary labels
            ax.set_xticklabels(hs.index, rotation=45)
        elif labels == 'dose':
            ax.set_xticklabels(hs['dose'].unique(), rotation=45)
        else:
            try:
                ax.set_xticklabels(labels)
            except Exception:
                print('problem with x range labels')

        # set title and name
        if title == 'dflt':
            try:
                mytitle = df.name
            except AttributeError:
                mytitle = hs['plate'].values[0]
        else:
            # The original left mytitle unbound for a caller-supplied title.
            mytitle = title
        # Drop a trailing '_sub' only; str.strip('_sub') would instead remove
        # any of the characters '_', 's', 'u', 'b' from both ends.
        if mytitle.endswith('_sub'):
            mytitle = mytitle[:-len('_sub')]
        suffix = ''
        for c in cats:
            cat = gt.cats_lookup(c)
            # NOTE(review): [0][0] takes the first element of the first
            # value (the first character if values are strings) - confirm
            # this is intended rather than the whole value.
            attr = hs[cat].values[0][0]
            suffix += f' - {attr}'
        mytitle += suffix

        ax.set_title(mytitle, fontsize=14)
        for g in gt.get_genes(genes, df=df):
            data = ds.loc[g, :]
            ax.plot(data.values, linewidth=0.3)
        plt.tight_layout()
        plt.savefig(os.path.join(outpath, mytitle + '.png'))
        plt.close()
        if test is True:
            print('stopping after one iteration')
            break
Пример #8
0
def breakdown(df, h, cats, dic=True, genes=None):
    """Split a data frame / header into nested groups by up to 3 categories.

    Category codes: 'c' cell, 'b' batch, 'd' dose (the first header column
    whose name contains 'dose' or 'dilution'), 'n' name, 'w' well, 'p' plate.
    Groups are enumerated in sorted order of the first category, then the
    second within it, then the third; NaN values are skipped. Each group is
    named by its values joined with '-'.

    Args:
        df: data frame to subset.
        h: header frame describing the columns of ``df``.
        cats: iterable of category codes, e.g. 'nd'. Only the first three are
            used; a message is printed if more are given.
        dic: when True return an ordered dict of name -> ``gt.dosub`` result;
            otherwise return a list of ``gt.dsub`` results.
        genes: optional gene selection (resolved by ``gt.get_genes``) applied
            to ``df`` before splitting.

    Raises:
        ValueError: 'd' was requested but the header has no dose column.
        KeyError: a category code is not recognised.
        IndexError: ``cats`` is empty.
    """
    if genes is not None:
        genes = gt.get_genes(genes)
        df = df.loc[genes]

    dose_col = None
    if 'd' in cats:
        dose_cols = [x for x in h.columns if 'dose' in x or 'dilution' in x]
        if not dose_cols:
            # The original printed a message and then crashed with NameError.
            raise ValueError(
                "dose column error: no header column contains 'dose' or "
                "'dilution'")
        dose_col = dose_cols[0]

    cd = {
        'c': 'cell',
        'b': 'batch',
        'd': dose_col,
        'n': 'name',
        'w': 'well',
        'p': 'plate'
    }

    clist = []
    for c in cats:
        if c not in cd:
            raise KeyError(f'unknown breakdown category {c!r}')
        clist.append(cd[c])

    if not clist:
        raise IndexError('breakdown requires at least one category')
    if len(clist) > 3:
        print('error, more than 3 categories')
        clist = clist[:3]

    vd = cll.OrderedDict()
    subs = []

    def _walk(depth, argdict, values):
        """Enumerate values of category ``depth`` within the current subset."""
        scope = gt.hsub(h, argdict) if argdict else h
        column = clist[depth]
        for value in sorted(scope[column].dropna().unique()):
            next_args = {**argdict, column: value}
            next_values = values + [value]
            if depth + 1 < len(clist):
                _walk(depth + 1, next_args, next_values)
                continue
            # Deepest level reached: emit this group.
            hdr = '-'.join(f'{v}' for v in next_values)
            if dic is True:
                vd[hdr] = gt.dosub(df, h, next_args, name=hdr)
            else:
                subs.append(gt.dsub(df, h, next_args, name=hdr))

    # Explicit depth tracking replaces the original try/except IndexError
    # control flow, which could also swallow IndexErrors raised inside the
    # subsetting calls.
    _walk(0, {}, [])

    if dic is True:
        return vd
    else:
        return subs