def get_s2n_genes(g, c1, c2):
    """ signal to noise between the data of two wells of a gct file

    g is handed to gct.extractgct to get the data and header, c1 and c2 are
    the header 'well' values used to pick the two sets of data columns.
    returns whatever sig_to_noise gives back for those two selections """
    data, header = gct.extractgct(g)
    # header rows matching each well give the column labels to pull
    first_cols, second_cols = [
        gt.hsub(header, {'well': well}).index for well in (c1, c2)
    ]
    return sig_to_noise(data[first_cols], data[second_cols])
def ctup(df, arg_dict, col=':', u=True, lst=False):
    """ count up the entries in df that satisfy arg_dict

    df        header/map dataframe, filtered with gt.hsub
    arg_dict  selection criteria passed to gt.hsub
    col       column whose non-null values are counted or listed
    u         if truthy only unique values are considered
    lst       if truthy the values themselves are returned instead of
              their count

    NOTE(review): the default col=':' is handed to .loc as a column label,
    not as a slice - confirm callers always pass an explicit column """
    values = gt.hsub(df, arg_dict=arg_dict).loc[:, col].dropna()
    if u:
        values = values.unique()
    # truthiness instead of identity checks, so numpy bools / 0 / 1 flags
    # always produce a result rather than leaving it unassigned
    return values if lst else len(values)
def summarize_doses(h):
    """ returns series of names and lists of their unique doses as values

    only 'test' type wells are considered. the sorted 'dose' column is used
    when possible, otherwise the 'dilution' column is summarized instead """
    tests = gt.hsub(h, {'type': 'test'})
    try:
        return tests.sort_values('dose')['dose'].groupby(h['name']).unique()
    except (KeyError, TypeError):
        # KeyError: header has no 'dose' column
        # TypeError: doses could not be sorted (mixed types)
        return tests['dilution'].groupby(h['name']).unique()
def ctup(df, arg_dict, col=':', u=True, lst=False):
    """ count up, check to see how many entries in df satisfy the argdict

    the rows of df matching arg_dict (via gt.hsub) are reduced to the
    non-null values of col. u (default True) keeps only unique values,
    lst (default False) returns the values themselves instead of how many
    there are

    NOTE(review): the default col=':' is used by .loc as a column label,
    not as a slice - confirm callers always pass an explicit column """
    selection = gt.hsub(df, arg_dict=arg_dict).loc[:, col].dropna()
    result = selection.unique() if u else selection
    # flags are tested for truthiness so numpy bools or 0/1 behave like
    # True/False instead of falling through every branch
    if lst:
        return result
    return len(result)
def get_well_reps(h, well, cats, df=False):
    """ find the wells in header h that share the given categories with the
    passed well. cats letters: n=name, d=dose, c=cell, b=batch

    returns a list of the matching index addresses, or the matching header
    dataframe when df is True. if the well is not in the index a message is
    printed and the string 'empty' is returned """
    if len(well) == 3:
        # 3-character address: prepend the first index entry minus its
        # last three characters
        well = h.index[0][:-3] + well
    flag_fields = (('n', 'name'), ('d', 'dose'), ('b', 'batch'), ('c', 'cell'))
    fields = [field for flag, field in flag_fields if flag in cats]
    try:
        reference = h.loc[well]
    except KeyError:
        print(f'{well} well not found in index')
        return 'empty'
    print(reference)
    # match every requested field against the reference well's own value
    criteria = {field: reference[field] for field in fields}
    matches = gt.hsub(h, criteria)
    if df is True:
        return matches
    return list(matches.index.values)
def check_data(path='dflt'):
    """ a better final map checker

    for every .gct file in path (default is the 'finaldata' default
    outpath) the first .txt/.xlsx map whose path contains the gct short
    name is loaded, and the data wells are checked against the map wells of
    type vehicle, poscon or test. one status line is printed per file,
    files without a matching map are reported and skipped """
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='finaldata')
    flist = gt.get_flist(path, ext='.gct')
    maplist = gt.get_flist(path, ext='.txt')
    maplist.extend(gt.get_flist(path, ext='.xlsx'))
    for f in flist:
        shn = gt.get_shn(f).split('.')[0]
        mapfile = next((x for x in maplist if shn in x), None)
        if mapfile is None:
            # nothing to compare against; carrying on would reuse the map
            # of the previous file (or fail on the first one)
            print(f'error with map file {shn}')
            continue
        g = gct.Gct(f)
        g.get_headers()
        g.get_wells()
        datwells = g.wells
        mymap = gct.extractmap(mapfile)
        mapwells = gt.hsub(
            mymap, {'type': ['vehicle', 'poscon', 'test']})['well'].values
        # data wells that the map does not list as a sample
        res = set(datwells) - set(mapwells)
        if len(res) == 0:
            print(f'{shn} ok, {380-len(datwells)} failed wells')
        else:
            print(f'eror with map/data {shn}, {len(datwells)}/{len(mapwells)}')
def make_barview_range(edf, argdict, across='dose', label=False, outpath=False):
    """ with enrichment score results, plot barviews across the range of
    conditions, default dose

    edf      enrichment header dataframe
    argdict  selection of the instances to highlight; its values are joined
             into the figure title. the dictionary is not modified
    across   column whose sorted unique values (within the argdict
             selection) each get their own barview panel
    label    if True each panel is labelled with its condition
    outpath  False leaves the figure open and unsaved, True saves into the
             default 'foo' outpath, a path saves '<title>_enrich.png' there.
             the figure is closed after saving """
    cond_range = sorted(gt.hsub(edf, argdict)[across].unique())
    print(argdict.values())
    mytitle = ' '.join(argdict.values())
    # squeeze=False always gives a 2-d axes array, so a single condition
    # can be handled the same way as several
    fig, axarr = plt.subplots(1, len(cond_range), sharey='row', squeeze=False)
    for my_ax, cond in zip(axarr[0], cond_range):
        # work on a copy so the caller's argdict does not gain the
        # across key
        new_argdict = dict(argdict)
        if across is not None:
            new_argdict[across] = cond
        if label is True:
            make_barview(edf, new_argdict, ax=my_ax, label=cond)
        else:
            make_barview(edf, new_argdict, ax=my_ax)
    plt.suptitle(mytitle)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    if outpath is False:
        # no destination requested, nothing to save
        return
    if outpath is True:
        outpath = gt.dflt_outpath(fldr_name='foo')
    myoutpath = os.path.join(outpath, mytitle + '_enrich.png')
    plt.savefig(myoutpath)
    plt.close()
def batch_summary(file):
    """ summarizes identities of each batch in a plate map, one well per
    batch

    the map is opened with gct.openmap and, for every non-null batch value,
    the row at position 3 of that batch's wells is taken. returns a
    dataframe with one row per batch """
    plate_map = gct.openmap(file)
    batch_ids = plate_map['batch'].dropna().unique()
    picks = [gt.hsub(plate_map, {'batch': b}).iloc[3] for b in batch_ids]
    # each pick is a series (one column after concat), so flip to rows
    return pd.concat(picks, axis=1).T
def make_barview(edf, argdict, ax=None, label=False, height=2):
    """ given an edf header file with enrichment info, and an argument
    dictionary for the highlighted instances within the barview, draw the
    barview on ax (tight_layout is applied when no ax is passed).

    label True puts the first argdict value on the left hand side, any
    other non-False label is used as given. height is the bar height; the
    highlighted instances can be sensitive to being swamped out and
    invisible.

    edf is sorted in place by 'scaled' and 'up' (descending) and gains
    'plot_pos' and 'rank' columns """
    axes = format_barview_plot(edf, ax=ax)
    if label is True:
        axes.set_ylabel(list(argdict.values())[0], labelpad=0.0)
    elif label is not False:
        axes.set_ylabel(label, labelpad=0.0)

    edf.sort_values(['scaled', 'up'], ascending=False, inplace=True)
    n_rows = len(edf)
    # highest ranked instance sits at the top of the plot
    edf['plot_pos'] = list(range(n_rows + 2, 2, -1))
    edf['rank'] = list(range(1, n_rows + 1, 1))

    # drawn in this order, so the selected instances end up on top
    layers = (
        (edf[edf['scaled'] > 0], 'lime'),
        (edf[edf['scaled'] == 0], 'lightgrey'),
        (edf[edf['scaled'] < 0], 'red'),
        (gt.hsub(edf, argdict), 'black'),
    )
    # height of the highlight bar can be sensitive
    for rows, shade in layers:
        axes.barh(rows['plot_pos'], [1 * len(rows)],
                  color=shade, align='center', height=height)

    if ax is None:
        plt.tight_layout()
def plot_concentrations(df, h, genes='test2', label=False, mode='ind',
                        incr='dflt', outpath='dflt', fn='dflt',
                        maxx='dflt', test=False):
    """ plotting concentration plots on a per-gene basis from a df/header

    outpath for figures is optionally specified (default is 'output_figs'
    on the desktop, created if necessary). genes can be passed in as a
    list, left as 'test' for a single gene, or be 'all'.
    the mode= ind,med,avg will either plot individual reps w/ same x value
    or combine reps together using either 'med' median or 'avg' average
    across reps.
    assumes broken down by name and dose, and only within one batch

    label  if True the name is written below each cohort
    incr   x spacing between doses ('dflt' is 10)
    fn     figure title prefix, default is df.name or the plate part of the
           first header index entry
    maxx   x axis range, 'dflt' derives it from the names and doses
    test   if True stop after the first gene """
    # parameters controlling the optional labels below each cohort
    txt_args = {'fontsize': 8, 'rotation': 90, 'fontweight': 'bold'}
    genes = gt.get_genes(genes, df=df)

    # define outpath directory, create if necessary
    if outpath == 'dflt':
        outpath = os.path.join(gt.check_desktop(), 'output_figs')
    os.makedirs(outpath, exist_ok=True)

    # define title
    if fn != 'dflt':
        name = fn
    else:
        try:
            name = df.name
        except AttributeError:
            name = h.index[0].split(':')[0]

    # set the color pallet and spacing/sizing levels (figsize tuned to these)
    cmap = plt.get_cmap('tab10')
    if incr == 'dflt':
        incr = 10
    sincr = 20

    # only the test wells are plotted
    df, h = gt.dsub(df, h, {'type': 'test'})

    print(h['name'].unique())
    ndoses = h.groupby('name')['dose'].nunique().max()
    nnames = h.name.nunique()
    print(name)
    print('num doses ', ndoses)
    print('name list ', len(h['name'].unique()))

    if isinstance(genes, str):
        genes = [genes]

    if maxx == 'dflt':
        # one slot per dose of every name, plus the gap after each name
        maxx = (incr * nnames * ndoses) + (incr * 2 * nnames)

    names = h['name'].apply(lambda x: str(x)).unique()

    for g in genes:
        # x starting position
        x_pos = 15
        # select vector for current gene
        dat = df.loc[g]
        # y range follows the largest absolute value of the gene
        maxv = round(max(abs(dat))) + 3
        ax = format_concentration_plot(maxx, maxy=maxv)
        ax.set_ylabel(g)
        mytitle = name + ' - ' + g
        print(mytitle)
        ax.set_title(mytitle)
        x_init = 0
        for ci, n in enumerate(names):
            # walk the palette with an integer index: an accumulated 0.1
            # float counter drifts and lands two names on the same color
            color = cmap(ci % cmap.N)
            sub = h[h['name'] == n]
            doses = sorted(sub['dose'].unique(), key=float)
            # marker size grows with the dose rank
            sizes = [(x + 1) * sincr for x in range(len(doses))]
            for d, s in zip(doses, sizes):
                args = {'name': n, 'dose': d}
                wids = gt.hsub(h, args).index.values
                y_vals = dat[wids].values
                if mode == 'avg':
                    y_vals = np.mean(y_vals)
                elif mode == 'med':
                    y_vals = np.median(y_vals)
                try:
                    x_vals = [x_pos] * len(y_vals)
                except TypeError:
                    # reps were combined into a single value
                    x_vals = x_pos
                # color= rather than c=: a 4-value RGBA passed as c is
                # treated as data to colormap when there are 4 points
                ax.scatter(x_vals, y_vals, color=color, s=s)
                x_pos += incr
            # put spacing between perts
            if label is True:
                x_label = (x_init + x_pos) / 2
                ax.text(x_label, -(maxv + 1), n, color=color, **txt_args)
            x_pos += (incr * 2)
            x_init = x_pos
        plt.savefig(os.path.join(outpath, mytitle + '.png'),
                    bbox_inches='tight')
        plt.close()
        if test is True:
            print('test mode, exiting after one image')
            break
def plot_landmark_concs(df, h, maxy=12, cats='n', labels='dflt',
                        genes='test100', outpath='dflt', title='dflt',
                        dosenum='dflt', test=False):
    """ plot many or all landmarks, should pass in a subset dataframe and
    header which should be the consensus ZS file. can contain many
    different names + doses, will auto breakdown by cats.
    a single line per gene is plotted for the ZS across all concentrations

    labels can be 'dflt' for just incr numbers, or 'wells' for address, or
    'dose' for numbers, or an explicit list of tick labels.
    title 'dflt' uses df.name (or the plate of the group) without a
    trailing '_sub'; the category values of each group are appended to the
    title, which is also the file name of the saved figure.
    dosenum optionally fixes the number of x ticks, test stops after the
    first group """
    if outpath == 'dflt':
        outpath = gt.dflt_outpath()

    df, h = gt.dsub(df, h, {'type': 'test'})

    # sanity check on the first name/dose cohort: more than one well
    # means the data was not collapsed to consensus
    names = h.name.dropna().unique()
    doses = gt.hsub(h, {'name': names[0]})['dose'].dropna().unique()
    first_cohort = gt.hsub(h, {'name': names[0], 'dose': doses[0]})
    if len(first_cohort) > 1:
        print('dataframe not collapsed to consensus, bogus lm concs')
        print(first_cohort.head())

    for ds, hs in pa.breakdown(df, h, cats, dic=False):
        hs = hs.sort_values('dose', ascending=True)
        ds = ds[hs.index]
        n_doses = len(hs.dose.unique())
        ax = format_concentration_plot(n_doses - 2, maxy=maxy, width=4)
        # booleans, not 'on'/'off' strings (any non-empty string is truthy)
        ax.tick_params(axis='x', bottom=True, top=False, labelbottom=True)
        if dosenum == 'dflt':
            dose_range = range(n_doses)
        else:
            dose_range = range(dosenum)
        ax.set_xticks(dose_range)
        if labels == 'dflt':
            ax.set_xticklabels([str(x + 1) for x in dose_range])
        elif labels == 'wells':
            # temporary labels
            ax.set_xticklabels(hs.index, rotation=45)
        elif labels == 'dose':
            ax.set_xticklabels(hs['dose'].unique(), rotation=45)
        else:
            try:
                ax.set_xticklabels(labels)
            except Exception:
                print('problem with x range labels')

        # set title and name
        if title == 'dflt':
            try:
                mytitle = df.name
            except AttributeError:
                mytitle = hs['plate'].values[0]
            # drop only a trailing '_sub'; str.strip would eat any leading
            # or trailing '_', 's', 'u', 'b' characters
            if mytitle.endswith('_sub'):
                mytitle = mytitle[:-len('_sub')]
        else:
            mytitle = title
        suffix = ''
        for c in cats:
            cat = gt.cats_lookup(c)
            attr = hs[cat].values[0][0]
            suffix += f' - {attr}'
        mytitle += suffix
        ax.set_title(mytitle, fontsize=14)

        for g in gt.get_genes(genes, df=df):
            data = ds.loc[g, :]
            ax.plot(data.values, linewidth=0.3)
        plt.tight_layout()
        plt.savefig(os.path.join(outpath, mytitle + '.png'))
        plt.close()
        if test is True:
            print('stopping after one iteration')
            break
def summarize_doses(h):
    """ returns series of names and lists of their unique doses as values

    only 'test' type wells are used, doses are read from 'dilution' """
    test_dilutions = gt.hsub(h, {'type': 'test'})['dilution']
    # grouping key is the name column of the full header
    per_name = test_dilutions.groupby(h['name'])
    return per_name.unique()
def check_maps(path, compare=True, img=True, v=True, filt=True):
    """ looks through .txt and .xlsx maps in a directory and summarizes
    their content and relationship with each other, as well as generating
    plate visualizations of type for each plate.

    v for verbose, lists names and doses per plate
    if filt is True just observe 6 character name excel files, otherwise
    consider all excel files. when no excel files are left the 6 character
    name .txt files are used instead
    img draws the plate map visualizations
    compare writes the name and well-name overlap matrices

    writes batch_composition.xlsx (one row per plate-batch) and, with
    compare, name_overlaps.xlsx and well-name_overlaps.xlsx into path """
    # plot_fields = ['type', 'batch']
    plot_fields = ['type']
    pert_dict, wellpert_dict = {}, {}

    flist = gt.get_flist(path, ext='.xlsx')
    if filt is True:
        flist = [x for x in flist if len(os.path.split(x)[-1]) == 11]
    if len(flist) == 0:
        flist = gt.get_flist(path, ext='.txt')
        flist = [x for x in flist if len(os.path.split(x)[-1]) == 10]
    if v is True:
        print('flist = ', flist)

    awells = gt.get_awells()
    composition = pd.DataFrame(columns=[
        'wells #', 'test #', 'doses', 'dose/trt', '# names', 'vehicle #',
        'poscon #', 'poscons'
    ])

    for f in flist:
        pname = gt.get_shn(f).split('.')[0]
        print(pname)
        if f.endswith('.xlsx'):
            m = pd.read_excel(f)
        elif f.endswith('.txt'):
            m = pd.read_table(f, index_col=False)
        m.sort_index(inplace=True)
        batches = m['batch'].dropna().unique()

        # first column that looks like a dose column, if any
        dose_field = next(
            (x for x in m.columns if ('dose' in x) or ('dilution' in x)),
            None)

        # column name -> function computing it from one batch of the map
        headers = {
            'wells #': lambda x: len(x.index),
            'test #': lambda x: ctup(x, {'type': 'test'}, 'well'),
        }
        if dose_field is not None:
            headers.update({
                'doses': lambda x: ctup(x, {'type': 'test'}, dose_field),
                # mean number of distinct doses per name, within the
                # batch being summarized
                'dose/trt': lambda x: gt.hsub(x, {'type': 'test'})[
                    dose_field].groupby(x['name']).unique().apply(
                        len).mean(),
            })
        else:
            headers.update({
                'doses': lambda x: 'na',
                'dose/trt': lambda x: 'na',
            })
        headers.update({
            '# names': lambda x: ctup(x, {'type': 'test'}, 'name'),
            'vehicle #': lambda x: ctup(x, {'type': 'vehicle'}, 'well'),
            'poscon #': lambda x: ctup(x, {'type': 'poscon'}, 'well'),
            'poscons': lambda x: ctup(x, {'type': 'poscon'}, 'name',
                                      lst=True),
        })
        summary = pd.DataFrame(columns=list(headers))

        # check wells for full plate
        well_result = set(awells) - set(m['well'].values)
        if len(well_result) != 0:
            print('{} wells error, {} entries'.format(pname, len(m.index)))

        if v is True:
            print(gt.hsub(m, {'type': 'test'})['name'].dropna().unique())
            try:
                doselist = gt.hsub(
                    m, {'type': 'test'})[dose_field].dropna().unique()
                print(doselist)
            except KeyError:
                print('error with dose col, ', dose_field)

        # summarize the header info per batch, and assemble pert-lists
        # for the overlap comparisons
        for b in batches:
            # formatted rather than concatenated so numeric batches work
            entry = f'{pname}-{b}'
            ms = gt.hsub(m, {'batch': b})
            # gather pert names for overlap comparison
            pert_dict[entry] = ctup(m, {'batch': b, 'type': 'test'},
                                    'name', lst=True)
            # get the well-pert identities for same plate comparison,
            # built on the side instead of writing into the batch slice
            addr = ms['well'] + '-' + ms['name'].apply(lambda x: str(x))
            wellpert_dict[entry] = addr.values
            for k, summarize in headers.items():
                try:
                    summary.loc[entry, k] = summarize(ms)
                except (KeyError, TypeError):
                    summary.loc[entry, k] = 'na'

        composition = pd.concat([composition, summary])

        if img is True:
            for pf in plot_fields:
                plot_series = m[pf]
                # a field with a single value is not worth drawing
                if len(plot_series.dropna().unique()) > 1:
                    plot_series.name = pname + ' ' + pf
                    plate_map_vis(plot_series, path=path)

    composition.to_excel(os.path.join(path, 'batch_composition.xlsx'))

    if compare is True:
        same_plates = gt.overlap_matrix(wellpert_dict.values(),
                                        wellpert_dict.keys())
        name_overlap = gt.overlap_matrix(pert_dict.values(),
                                         pert_dict.keys())
        name_overlap.to_excel(os.path.join(path, 'name_overlaps.xlsx'))
        same_plates.to_excel(os.path.join(path, 'well-name_overlaps.xlsx'))
def get_zscore(fpath, save=True, my_mad=None):
    """ merged from separate zscore file. can either save the resulting
    file or return data
    the first fpath argument can be a file path or a [d, h] object already

    per batch (batches named 'na' are skipped) the vehicle wells give a
    median and a median absolute deviation (mad) per gene, and every test
    and poscon well gets zs = (value - median) / (mad * 1.486).
    a mad between 0 and 0.1 is raised to 0.1, a mad of 0 gives a zscore of
    0, and if my_mad is given any smaller mad is replaced by my_mad.
    batches without vehicle wells are reported and skipped.

    values are returned as strings with 3 decimals. with save True the
    result is written to '<fpath up to the first underscore>_ZS.gct',
    otherwise (zsdf, hs) is returned.

    raises ValueError if fpath is neither a path nor a [d, h] pair, or if
    save is True without a file path to derive the output name from """
    # basic setup
    if isinstance(fpath, str):
        df, h = gct.extractgct(fpath)
        pname = gt.get_shn(fpath)
    else:
        if save is True:
            # the output file name is built from the path string
            raise ValueError('save=True needs a file path, '
                             'pass save=False with a [d, h] input')
        try:
            df, h = fpath[0], fpath[1]
        except (TypeError, IndexError, KeyError) as err:
            raise ValueError('error with path') from err
        pname = 'dataframe'

    frames = []
    for b in h['batch'].dropna().unique():
        if b == 'na':
            continue
        print('running zscore for {} batch {}'.format(pname, b))
        vw = gt.hsub(h, {'batch': b, 'type': 'vehicle'}).index.values
        if len(vw) == 0:
            # nothing to normalize against, move on to the next batch
            print('no vehicle wells in batch {}, skipping'.format(b))
            continue
        veh = df[vw]
        # get median value across vehicle populations
        med = veh.median(axis=1)
        # median absolute deviation per gene, with the small-value floor
        mad = veh.sub(med, axis=0).abs().median(axis=1)
        mad = mad.mask((mad > 0) & (mad < 0.1), 0.1)

        # using the above progress though test and poscon wells
        # to calculate sample zscores
        in_batch = h['batch'] == b
        tw = list(h[in_batch & (h['type'] == 'test')].index.values)
        pw = list(h[in_batch & (h['type'] == 'poscon')].index.values)
        wells = tw + pw

        scale = mad
        if my_mad is not None:
            scale = mad.mask(mad < my_mad, my_mad)
        # genes with zero deviation are set to 0 instead of dividing by 0
        flat = scale == 0
        zs = df[wells].sub(med, axis=0).div(
            scale.where(~flat) * 1.486, axis=0)
        zs.loc[flat] = 0.0
        frames.append(zs.apply(lambda col: col.map('{0:.3f}'.format)))

    # assemble into one dataframe, null nonsense
    if frames:
        zsdf = pd.concat(frames, axis=1)
        zsdf = zsdf.rename_axis(index=None, columns=None)
    else:
        zsdf = pd.DataFrame()
    hs = h.loc[zsdf.columns]
    zsdf = zsdf.replace(['inf', '-inf'], np.nan).fillna('nan')
    if save is True:
        outpath = '{}_ZS.gct'.format(fpath.split('_', 1)[0])
        gct.save_headergct(zsdf, hs, outpath)
    else:
        return zsdf, hs
def breakdown(df, h, cats, dic=True, genes=None):
    """ takes a dataframe and header and the categories to break down by
    'b' batch, 'c' cell, 'n' name, 'd' dose, 'p' plate. 'w' is also
    supported as breakdown by well - useful for many plates with identical
    layout. any number of categories can be combined, they are nested in
    the order given and the values of each level are sorted

    if dic is True a dictionary is returned, with a key title and
    dataframe value (from gt.dosub)
    if dic is False then list is returned, of tuples with dataframe and
    header (from gt.dsub)
    the key title / name is the chosen values joined by '-'

    genes optionally restricts the rows of df first (via gt.get_genes)

    raises ValueError for empty cats or a 'd' breakdown without a dose or
    dilution column, KeyError for an unknown category letter """
    if genes is not None:
        genes = gt.get_genes(genes)
        df = df.loc[genes]

    dose_col = None
    if 'd' in cats:
        dose_col = next(
            (x for x in h.columns if 'dose' in x or 'dilution' in x), None)
        if dose_col is None:
            raise ValueError('dose column error')

    cd = {
        'c': 'cell',
        'b': 'batch',
        'd': dose_col,
        'n': 'name',
        'w': 'well',
        'p': 'plate'
    }
    if not cats:
        raise ValueError('no breakdown categories given')
    clist = []
    for c in cats:
        if c not in cd:
            raise KeyError(f'unknown breakdown category {c!r}')
        clist.append(cd[c])

    def _combos(depth, argdict, labels):
        # yield (argdict, title) for every value combination from this
        # level down; deeper levels only see values present in the rows
        # selected so far
        subset = gt.hsub(h, argdict) if argdict else h
        for val in sorted(subset[clist[depth]].dropna().unique()):
            next_args = {**argdict, clist[depth]: val}
            next_labels = labels + [f'{val}']
            if depth + 1 < len(clist):
                yield from _combos(depth + 1, next_args, next_labels)
            else:
                yield next_args, '-'.join(next_labels)

    vd = cll.OrderedDict()
    subs = []
    for argdict, hdr in _combos(0, {}, []):
        if dic is True:
            vd[hdr] = gt.dosub(df, h, argdict, name=hdr)
        else:
            subs.append(gt.dsub(df, h, argdict, name=hdr))

    if dic is True:
        return vd
    return subs