def plot_gene(sample_set, h=None, name='dflt', outpath='dflt', close=True, width=8):
    """Scatter the values of one sample set, optionally colour-coding controls.

    All values are drawn in grey first. If a header ``h`` is given, the
    vehicle subset is over-plotted in blue and the poscon subset in red.

    sample_set: object exposing ``.name`` and ``.values``; its length sets
        the x range.
    h: optional header. Side effect: an 'order' column holding 1..len(h)
        is written onto it.
    name: title and file stem; 'dflt' falls back to ``sample_set.name``.
    outpath: destination folder; 'dflt' asks ``gt.dflt_outpath``.
    close: when exactly True, save ``<outpath>/<name>.png``, close the
        figure and return None; otherwise return the axes.
    width: forwarded to ``format_concentration_plot``.
    """
    title = sample_set.name if name == 'dflt' else name
    if outpath == 'dflt':
        outpath = gt.dflt_outpath(fldr_name='dflt')

    n_points = len(list(sample_set))
    plot_type = check_plottype(sample_set)
    axes = format_concentration_plot(n_points, ptype=plot_type, width=width)
    axes.scatter(range(n_points), sample_set.values, color='grey')
    axes.set_title(title)

    if h is not None:
        # NOTE(review): 'order' is 1-based while the grey points above use
        # 0-based x positions - confirm the one-step offset is intended.
        h['order'] = np.arange(1, len(h) + 1)
        for well_type, colour in (('vehicle', 'blue'), ('poscon', 'red')):
            subset, subset_header = gt.dsub(sample_set, h, {'type': well_type})
            axes.scatter(subset_header.order, subset.values, color=colour)

    if close is not True:
        return axes
    plt.savefig(os.path.join(outpath, title + '.png'))
    plt.close()
def plot_skylines(df, h, argdict, title='dflt'):
    """Plot one skyline per group of wells matching ``argdict``.

    Entries whose value is exactly True are not used as filters; they only
    contribute their key's first letter to the breakdown categories, so
    every condition of that category gets its own plot. For example
    {'name': myname, 'dose': True} plots all doses of that name.

    df, h: dataframe and header handed to ``gt.dsub``.
    argdict: selection criteria; the first letter of every key forms the
        category string passed to ``breakdown``.
    title: prefix for each plot title; 'dflt' uses ``df.name``.
    """
    cats = ''.join(key[0] for key in argdict)
    print(cats)

    # keep only real filter values; True just flags "break down by this"
    filters = {key: val for key, val in argdict.items() if val is not True}
    ds, hs = gt.dsub(df, h, filters)
    groups = breakdown(ds, hs, cats, dic=True)

    if isinstance(title, str) and title == 'dflt':
        title = df.name

    for label, dat in groups.items():
        if dat is True:
            continue
        skyline.new_skyline(dat, title=title + ' - ' + label)
def predict_cells(input, save=False):
    """Predict cell lines from vehicle wells using the pickled classifier.

    ``input`` may be:
      * a directory path - files matching '*_Qctrl_n*' are used, falling
        back to '*QNORM*' when none are found;
      * a single file path;
      * a pandas Series - classified directly; the single prediction is
        returned, or None if the classifier raises;
      * any other sequence - treated as file paths, or as an already
        loaded two-item (data, header) pair when extraction fails.

    For file/pair input the vehicle wells of each batch are collapsed to
    their per-row median and classified; one row per '<shortname>-<batch>'
    is written to the result table and echoed to stdout.

    save: when exactly True the table is written tab-separated to
        ``gt.dflt_outpath(fn='cell_predictions.csv')``.

    Returns the result DataFrame (column 'cell'), or None when a file has
    no vehicle wells.
    """
    # NOTE(review): pickle.load runs arbitrary code from the file; the path
    # is hard-coded to a local classifier and must stay trusted.
    with open('/Users/WRB/Dropbox/bin/python/celllineclassifier.p', 'rb') as file:
        clf = pickle.load(file)

    if isinstance(input, pd.Series):
        try:
            return clf.predict([input])[0]
        except Exception:
            print('error with series prediction')
            return None

    if isinstance(input, str):
        if os.path.isdir(input):
            vlist = gt.globit(input, '*_Qctrl_n*')
            if len(vlist) == 0:
                vlist = gt.globit(input, '*QNORM*')
        else:
            vlist = [input]
    else:
        vlist = input

    res_table = pd.DataFrame()
    for f in vlist:
        from_pair = False
        try:
            d, h = gct.extractgct(f)
        except Exception:
            # Extraction can only be recovered from when the caller handed
            # in loaded (data, header) frames; a failing file is re-raised
            # instead of silently reusing data from a previous iteration.
            if not (len(vlist) == 2 and isinstance(vlist[0], pd.DataFrame)):
                raise
            d, h = vlist[0], vlist[1]
            from_pair = True

        ds, hs = gt.dsub(d, h, {'type': 'vehicle'})
        if len(ds) == 0:
            print('error, maybe using ZS file? use QNORM instead')
            return None

        for b in hs.batch.unique():
            dsb, hsb = gt.dsub(ds, hs, {'batch': b})
            med = dsb.median(axis=1).values
            # NOTE(review): for pair input ``f`` is the data frame itself -
            # confirm gt.get_shn accepts that.
            shn = gt.get_shn(f) + '-' + b
            res = clf.predict([med])[0]
            res_table.loc[shn, 'cell'] = res
            print(f'{shn} - {res}')

        if from_pair:
            # the pair is a single dataset, not two items to iterate over
            break

    if save is True:
        res_table.to_csv(gt.dflt_outpath(fn='cell_predictions.csv'), sep='\t')
    return res_table
def get_vehicle_matrix(path='dflt', batch='all', delim=':', getcells=False):
    """Collapse the vehicle wells of every file in ``path`` into one matrix.

    Files matching '*Qctrl*' are loaded, falling back to '*_QNORM_*' when
    none are found. Plate names are truncated to their first six
    characters, vehicle wells are selected and collapsed to their per-row
    median.

    path: folder to scan; resolved through ``gt.check_dfltarg`` with a
        'newQC' folder under ``gt.check_desktop()`` as the default.
    batch: 'all' gives one column per batch of each file, 'A' only
        batch A, anything else one column per file across all batches.
    delim: delimiter handed to ``gt.gen_label`` and used when appending
        the predicted cell to the label.
    getcells: when exactly True, each column is passed to
        ``predict_cells`` and the prediction is stored in 'cell2' and
        appended to the label.

    Returns (vdf, vh): the median matrix with labels as columns, and a
    header frame built from the first header row of every group.
    """
    path = gt.check_dfltarg(path, os.path.join(gt.check_desktop(), 'newQC'))
    flv = gt.globit(path, '*Qctrl*')
    if len(flv) == 0:
        flv = gt.globit(path, '*_QNORM_*')
    # should put in a check to extract from regular qnorms

    dlist, hlist = [], []
    for f in flv:
        d, h = gct.extractgct(f)
        h['plate'] = h['plate'].apply(lambda x: x[:6])
        d, h = gt.dsub(d, h, {'type': 'vehicle'})

        # (selection criteria, label categories); None means "whole file"
        if batch == 'all':
            selections = [({'batch': b}, 'pb') for b in h.batch.unique()]
        elif batch == 'A':
            selections = [({'batch': 'A'}, 'pb')]
        else:
            selections = [(None, 'p')]

        for criteria, label_cats in selections:
            if criteria is None:
                # label from this file's own header, not a leftover subset
                ds, hs = d, h
            else:
                ds, hs = gt.dsub(d, h, criteria)
            med = ds.median(axis=1)
            hs = gt.gen_label(hs, label_cats, delim=delim)
            dlist.append(med)
            hlist.append(hs.iloc[0])

    vdf = pd.concat(dlist, axis=1)
    vh = pd.DataFrame(hlist)
    vdf.columns = vh.label

    if getcells is True:
        vh['cell2'] = vh.label.apply(lambda x: predict_cells(vdf[x]))
        vh['label'] = vh.label + delim + vh.cell2
        vdf.columns = vh.label
    return vdf, vh
def assemble_ref_dat(path):
    """Gather the reference RNA wells (A02, B02) of all ref files in ``path``.

    Every file matching '*_ref_n*' is loaded, reduced to wells A02 and B02,
    rounded to two decimals and joined column-wise. Only the data is
    returned; the matching headers are discarded.
    """
    frames = []
    for ref_file in gt.globit(path, '*_ref_n*'):
        data, header = gct.extractgct(ref_file)
        data, _ = gt.dsub(data, header, {'well':['A02','B02']})
        frames.append(round(data, 2))
    return pd.concat(frames, axis=1)
def plot_concentrations(df, h, genes='test2', label=False, mode='ind', incr='dflt',
                        outpath='dflt', fn='dflt', maxx='dflt', test=False):
    """Save one concentration scatter plot per gene.

    Only 'test' type wells are plotted. Within a plot, wells are grouped
    by name (one colour per name) and, inside each name, by ascending
    dose (marker size grows with dose rank). Assumes the data is broken
    down by name and dose within a single batch.

    df, h: dataframe and header.
    genes: forwarded to ``gt.get_genes``; a single string result is
        treated as one gene.
    label: when exactly True, write each name below its cohort.
    mode: 'ind' plots every replicate at the same x; 'avg' / 'med'
        collapse replicates to their mean / median.
    incr: x spacing between doses; 'dflt' means 10.
    outpath: output folder, created if missing; 'dflt' is an
        'output_figs' folder under ``gt.check_desktop()``.
    fn: title prefix; 'dflt' uses ``df.name``, or the part of the first
        header index before ':' when ``df`` has no name.
    maxx: x-axis extent; 'dflt' derives it from name and dose counts.
    test: when exactly True, stop after the first image.
    """
    # parameters controlling the optional labels below each cohort
    txt_args = {'fontsize': 8, 'rotation': 90, 'fontweight': 'bold'}
    genes = gt.get_genes(genes, df=df)

    # define outpath directory, create if necessary
    if outpath == 'dflt':
        outpath = os.path.join(gt.check_desktop(), 'output_figs')
    os.makedirs(outpath, exist_ok=True)

    # define title
    if fn != 'dflt':
        name = fn
    else:
        try:
            name = df.name
        except AttributeError:
            name = h.index[0].split(':')[0]

    # colour palette and spacing/sizing levels (figsize tuned to these)
    cmap = plt.get_cmap('tab10')
    if incr == 'dflt':
        incr = 10
    sincr = 20

    # restrict to test wells
    df, h = gt.dsub(df, h, {'type': 'test'})

    print(h['name'].unique())
    pert_list = ['{}'.format(n) for n in h['name'].unique()]

    ndoses = h.groupby('name')['dose'].nunique().max()
    nnames = h.name.nunique()
    print(name)
    print('num doses ', ndoses)
    print('name list ', len(pert_list))

    if isinstance(genes, str):
        genes = [genes]

    if maxx == 'dflt':
        # one slot per dose of every name plus the gap after each name
        maxx = (incr * nnames * ndoses) + (incr * 2 * nnames)

    names = h['name'].apply(str).unique()

    for g in genes:
        x_pos = 15
        x_init = 0
        # select vector for current gene
        dat = df.loc[g]
        # y range from the largest absolute value
        maxv = round(max(abs(dat))) + 3
        ax = format_concentration_plot(maxx, maxy=maxv)
        ax.set_ylabel(g)
        mytitle = name + ' - ' + g
        print(mytitle)
        ax.set_title(mytitle)

        for color_idx, n in enumerate(names):
            # integer index picks palette entries exactly; a float counter
            # drifts and makes neighbouring names share a colour
            color = cmap(color_idx % cmap.N)
            sub = h[h['name'] == n]
            doses = sorted(sub['dose'].unique(), key=float)
            for rank, dose in enumerate(doses, start=1):
                wids = gt.hsub(h, {'name': n, 'dose': dose}).index.values
                y_vals = dat[wids].values
                if mode == 'avg':
                    y_vals = np.mean(y_vals)
                elif mode == 'med':
                    y_vals = np.median(y_vals)
                try:
                    x_vals = [x_pos] * len(y_vals)
                except TypeError:
                    # collapsed replicates give a scalar, which has no len
                    x_vals = x_pos
                # marker size scales with dose rank
                ax.scatter(x_vals, y_vals, c=color, s=rank * sincr)
                x_pos += incr
            if label is True:
                # centre the label under this name's cohort
                x_label = (x_init + x_pos) / 2
                ax.text(x_label, -(maxv + 1), n, color=color, **txt_args)
            # put spacing between perts
            x_pos += (incr * 2)
            x_init = x_pos

        plt.savefig(os.path.join(outpath, mytitle + '.png'), bbox_inches='tight')
        plt.close()
        if test is True:
            print('test mode, exiting after one image')
            break
def plot_landmark_concs(df, h, maxy=12, cats='n', labels='dflt', genes='test100',
                        outpath='dflt', title='dflt', dosenum='dflt', test=False):
    """Plot landmark genes as lines across concentrations, one figure per group.

    Pass a subset dataframe and header, which should be the consensus ZS
    file. Test wells are split by ``cats`` through ``pa.breakdown``; each
    group is sorted by dose and one thin line per gene is drawn.

    maxy: forwarded to ``format_concentration_plot``.
    cats: breakdown category letters; each also adds a suffix to the title.
    labels: x tick labels - 'dflt' for incrementing numbers, 'wells' for
        the header index, 'dose' for the dose values, or anything else
        to be passed to ``set_xticklabels`` as is.
    genes: forwarded to ``gt.get_genes``.
    outpath: output folder; 'dflt' asks ``gt.dflt_outpath``.
    title: title / file stem prefix; 'dflt' uses ``df.name`` or, when that
        is missing, the first 'plate' value of the group.
    dosenum: number of x ticks; 'dflt' uses the group's unique dose count.
    test: when exactly True, stop after the first figure.
    """
    if outpath == 'dflt':
        outpath = gt.dflt_outpath()

    df, h = gt.dsub(df, h, {'type': 'test'})

    # warn when the first name/dose combination still has replicate wells
    names = h.name.dropna().unique()
    if len(names) > 0:
        doses = gt.hsub(h, {'name': names[0]})['dose'].dropna().unique()
        if len(doses) > 0:
            first_combo = gt.hsub(h, {'name': names[0], 'dose': doses[0]})
            if len(first_combo) > 1:
                print('dataframe not collapsed to consensus, bogus lm concs')
                print(first_combo.head())

    for ds, hs in pa.breakdown(df, h, cats, dic=False):
        hs = hs.sort_values('dose', ascending=True)
        ds = ds[hs.index]
        n_doses = len(hs.dose.unique())
        xrange = n_doses - 2
        ax = format_concentration_plot(xrange, maxy=maxy, width=4)
        # booleans: the strings 'on'/'off' are both truthy
        ax.tick_params(axis='x', bottom=True, top=False, labelbottom=True)

        if dosenum == 'dflt':
            dose_range = range(n_doses)
        else:
            dose_range = range(dosenum)
        ax.set_xticks(dose_range)

        if labels == 'dflt':
            ax.set_xticklabels([str(x + 1) for x in dose_range])
        elif labels == 'wells':
            # temporary labels
            ax.set_xticklabels(hs.index, rotation=45)
        elif labels == 'dose':
            ax.set_xticklabels(hs['dose'].unique(), rotation=45)
        else:
            try:
                ax.set_xticklabels(labels)
            except Exception:
                print('problem with x range labels')

        # set title and name
        if title == 'dflt':
            try:
                mytitle = df.name
            except AttributeError:
                mytitle = hs['plate'].values[0]
        else:
            mytitle = title
        # drop a trailing '_sub' only; str.strip would eat any of the
        # characters '_', 's', 'u', 'b' from both ends
        if mytitle.endswith('_sub'):
            mytitle = mytitle[:-len('_sub')]

        suffix = ''
        for c in cats:
            cat = gt.cats_lookup(c)
            # NOTE(review): second [0] takes the first element of the first
            # value (first character for strings) - confirm intended.
            attr = hs[cat].values[0][0]
            suffix += f' - {attr}'
        mytitle += suffix
        ax.set_title(mytitle, fontsize=14)

        for g in gt.get_genes(genes, df=df):
            data = ds.loc[g, :]
            ax.plot(data.values, linewidth=0.3)

        plt.tight_layout()
        plt.savefig(os.path.join(outpath, mytitle + '.png'))
        plt.close()
        if test is True:
            print('stopping after one iteration')
            break
def _breakdown_groups(h, columns):
    """Yield (values, criteria) for each observed combination of ``columns``.

    Levels are walked in order. The first level reads ``h`` directly;
    deeper levels only look at rows matching the enclosing selection
    (via ``gt.hsub``). Values are visited in sorted order with missing
    entries dropped.
    """
    def descend(level, values, criteria):
        if level == len(columns):
            yield values, criteria
            return
        col = columns[level]
        scope = h if level == 0 else gt.hsub(h, criteria)
        for val in sorted(scope[col].dropna().unique()):
            yield from descend(level + 1, values + [val], {**criteria, col: val})

    return descend(0, [], {})


def breakdown(df, h, cats, dic=True, genes=None):
    """Split a dataframe/header into groups along up to three categories.

    cats: sequence of category letters - 'b' batch, 'c' cell, 'n' name,
        'd' dose (first header column containing 'dose' or 'dilution'),
        'w' well (useful for many plates with identical layout),
        'p' plate. Only the first three are used.
    dic: when exactly True an ordered dict is returned mapping each group
        label to the result of ``gt.dosub``; otherwise a list of
        ``gt.dsub`` results (dataframe and header) is returned.
    genes: optional selection resolved with ``gt.get_genes`` and used to
        restrict the rows of ``df`` first.

    Group labels join the category values with '-', e.g. 'name-dose'.

    Raises KeyError for an unknown category letter, IndexError when
    ``cats`` is empty and ValueError when 'd' is requested but the header
    has no dose/dilution column.
    """
    if genes is not None:
        genes = gt.get_genes(genes)
        df = df.loc[genes]

    dose_col = None
    if 'd' in cats:
        dose_cols = [x for x in h.columns if 'dose' in x or 'dilution' in x]
        if not dose_cols:
            raise ValueError('dose column error')
        dose_col = dose_cols[0]

    cd = {
        'c': 'cell',
        'b': 'batch',
        'd': dose_col,
        'n': 'name',
        'w': 'well',
        'p': 'plate',
    }
    clist = [cd[c] for c in cats]
    if not clist:
        raise IndexError('no breakdown categories given')
    if len(clist) > 3:
        print('error, more than 3 categories')
        clist = clist[:3]

    vd = cll.OrderedDict()
    subs = []
    for values, criteria in _breakdown_groups(h, clist):
        hdr = '-'.join(f'{v}' for v in values)
        if dic is True:
            vd[hdr] = gt.dosub(df, h, criteria, name=hdr)
        else:
            subs.append(gt.dsub(df, h, criteria, name=hdr))

    return vd if dic is True else subs