def check_data(path='dflt'):
    """ a better final map checker: confirms the wells in each final .gct data
    file agree with the corresponding map file in the same folder """
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='finaldata')
    flist = gt.get_flist(path, ext='.gct')
    maplist = gt.get_flist(path, ext='.txt')
    maplist.extend(gt.get_flist(path, ext='.xlsx'))
    for f in flist:
        shn = gt.get_shn(f).split('.')[0]
        try:
            mapfile = [x for x in maplist if shn in x][0]
        except IndexError:
            print(f'error with map file {shn}')
            continue
        g = gct.Gct(f)
        g.get_headers()
        g.get_wells()
        datwells = g.wells
        mymap = gct.extractmap(mapfile)
        mapwells = gt.hsub(mymap, {'type': ['vehicle', 'poscon', 'test']})['well'].values
        res = set(datwells) - set(mapwells)
        if len(res) == 0:
            # 380 appears to be the expected count of sample wells per plate
            print(f'{shn} ok, {380 - len(datwells)} failed wells')
        else:
            print(f'error with map/data {shn}, {len(datwells)}/{len(mapwells)}')

def compare_samename_files(path1, path2, show=False):
    """ from two folders, compare whether all identically named maps are identical """
    fl1 = gt.get_flist(path1)
    for file in fl1:
        shn = gt.get_shn(file)
        other = os.path.join(path2, shn)
        if os.path.isfile(other):
            compare_files(file, other, show=show)

def grab_ranks(path, feat, hilo=1, t=40):
    """ survey a folder of rank .gct files for the wells in which the given
    feature (gene) is top ranked (below threshold t, by sorted z-score), and
    generate an overall summary list. default rank output is in descending
    order, so the highest z-score = rank 1
    hilo: 1 = high, upregulated genes (default rank order)
          0 = low, downregulated genes """
    outpath = os.path.join(path, '_rank_summary.txt')
    flist = gt.get_flist(path, 'ranks.gct')
    # set dummy starting point for tracking the lowest observed rank
    lowest = 500
    # create blank template dataframe
    summary = pd.DataFrame()
    for f in flist:
        d, h = gct.extractgct(f)
        # flip rank order as needed (978 landmark genes total)
        if hilo == 0:
            d = 978 - d
        # get column ids for ranks below threshold
        wells = d.columns[d.loc[feat] < t]
        # extract portion of dataframe
        ranks = pd.DataFrame(d.loc[feat, wells])
        # assign plate column to each well id entry
        ranks['plate'] = gt.get_shn(f).split('-')[0]
        # concat portion to overall dataframe
        summary = pd.concat([summary, ranks])
        # check and store the lowest rank
        newlow = min(d.loc[feat])
        if newlow < lowest:
            lowest = newlow
    # re-shuffle the column order
    summary['well'] = summary.index.values
    summary = summary[['plate', 'well', feat]]
    print('\n', feat, int(lowest))
    summary.to_csv(outpath, sep='\t', index=None)

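# a minimal usage sketch (hypothetical folder and probe id), assuming the folder
# holds '-ranks.gct' files like those written by rank_files below:
#   grab_ranks('/path/to/ranks', '200678_x_at', hilo=1, t=40)
#   # -> writes /path/to/ranks/_rank_summary.txt of wells ranked better than 40
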
def extractgct(pathlist, split=True):
    """ automatically extract and concat dataframe and header files.
    CARE MUST BE TAKEN THAT THE FILES ARE OF THE SAME HEADER/MAP TYPE!
    a single string of run-on gct paths is parsed and separated
    into a list of individual paths """
    pathlist = gt.splitpaths(pathlist, ext='.gct')
    if not isinstance(pathlist, list):
        pathlist = [pathlist]
    if len(pathlist) == 1:
        if os.path.isdir(pathlist[0]):
            print('directory, getting all gcts')
            pathlist = gt.get_flist(pathlist[0], ext='.gct')
        else:
            d, h = builddframegct(pathlist[0])
            return d, h
    dlist, hlist = [], []
    for path in pathlist:
        print(path)
        d, h = builddframegct(path)
        dlist.append(d)
        hlist.append(h)
    d = pd.concat(dlist, axis=1)
    h = pd.concat(hlist, axis=0)
    print('samples (d/h): ', len(d.columns), len(h.index))
    d.name = dlist[0].name + '+'
    return d, h

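# usage sketch (hypothetical paths): a directory pulls in every .gct inside,
# while a run-on string of concatenated paths is first split back apart
#   d, h = extractgct('/data/plates/')
#   d, h = extractgct('/data/MKD031.gct/data/MKD032.gct')
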
def check_final(path='dflt'):
    """ check numbers of rows/columns, number of fails and decimal places of final data files """
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='finaldata')
    f_list = gt.get_flist(path, ext='.gct')
    for file in f_list:
        g = gct.Gct(file)
        g.get_headers()
        # look for the accompanying map file, .txt first then .xlsx
        base = g.file.split('.')[0]
        if os.path.isfile(base + '.txt'):
            txt = base + '.txt'
        else:
            txt = base + '.xlsx'
        try:
            fails, fail_result = sub_check_failed(g, txt)
            result = sub_check_lines(g) and sub_check_columns(g) and fail_result
            dplaces = sub_check_decimal(g)
        except FileNotFoundError:
            result, fails, dplaces = False, 'no map!!', 'na'
        print('{} - {} - {} failed wells - {} dplaces'.format(g.shortname, result, fails, dplaces))

def openmap(path, ext='all'):
    """ bulk map opening, flexible by type, but watch out for mismatched
    dimensions of different maps """
    if isinstance(ext, str) and ext == 'all':
        # exts = ['.gct', '.txt', '.xlsx', '.xls']
        exts = ['.gct', '.txt', '.xlsx']
    pathlist = []
    if os.path.isdir(path):
        for extsn in exts:
            pathlist.extend(gt.get_flist(path, ext=extsn))
    else:
        pathlist = path
        for extsn in exts:
            pathlist = gt.splitpaths(pathlist, ext=extsn)
    if isinstance(pathlist, str):
        print('only one map')
        combined = extractmap(path)
        return combined
    else:
        combined = []
        plates = [os.path.basename(x) for x in pathlist]
        print(plates)
        for file in pathlist:
            combined.append(extractmap(file))
        combined = pd.concat(combined, axis=0, sort=False)
        if 'plate' not in combined.columns:
            combined['plate'] = combined.index
            combined.plate = combined.plate.apply(lambda x: x.split(':')[0])
        return combined

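# usage sketch (hypothetical folder): stacks every map found under the folder
# into one dataframe, deriving a 'plate' column from a 'plate:well' index when absent
#   maps = openmap('/data/maps/')
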
def summary_v0():
    try:
        path = sys.argv[1]
    except IndexError:
        sys.exit('arg1 is code directory to summarize')
    opath = '/Users/WRB/Desktop/codesummary.txt'
    results = []
    flist = gt.get_flist(path, '.py')
    for file in flist:
        fn = file.split('/')[-1]
        if fn[0] == '_':
            continue
        with open(file, 'r') as df:
            results.append(fn)
            # assemble the 'def ' search string in pieces so that this
            # function doesn't match itself when summarizing this file
            a = 'de'
            b = 'f '
            c = a + b
            for line in df:
                if c in line and 'main' not in line:
                    line = line.replace(c, '\t').rstrip().rstrip(':')
                    results.append(line)
    with open(opath, 'w') as of:
        for entry in results:
            print(entry, file=of)

def gather_wells(path, wellfile):
    """ reads from a text file ('MKD031:A03', one or many per line) to assemble a
    dataframe from a collection of wells across a range of plates. path points
    to the gct directory """
    fl = gt.get_flist(path, '.gct')
    data, h = pd.DataFrame(), pd.DataFrame()
    with open(wellfile, 'r') as f:
        for line in f:
            wells = line.strip().split(',')
            plates = list(set([w.split(':')[0] for w in wells]))
            # if only one plate, grab all wells from it
            if len(plates) <= 1:
                file = [s for s in fl if plates[0] in s]
                if len(file) > 1:
                    print('more than one plate match', file)
                d, sh = dfsubset(file[0], wells)
                d = d.sort_index()
                data = pd.concat([data, d], axis=1)
                h = pd.concat([h, sh], axis=0)
            # if multiple plates, just grab each well one at a time
            else:
                for w in wells:
                    p = w.split(':')[0]
                    file = [s for s in fl if p in s]
                    if len(file) > 1:
                        print('more than one plate match', file)
                    d, sh = dfsubset(file[0], w)
                    d = d.sort_index()
                    data = pd.concat([data, d], axis=1)
                    h = pd.concat([h, sh], axis=0)
    return data, h

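# example wellfile contents (hypothetical plate/well ids), one comma-separated
# group per line; a line may mix wells from several plates:
#   MKD031:A03,MKD031:B07
#   MKD031:A03,MKD042:C11
# then: data, h = gather_wells('/data/gcts', 'wells.txt')
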
def convert_to_txt(path):
    """ convert .xlsx maps in a folder to tab-delimited .txt files """
    fl = gt.get_flist(path, '.xlsx')
    opath = '/Users/WRB/Desktop/newmaps/'
    for f in fl:
        m = pd.read_excel(f)
        # use replace, not str.strip, which strips characters rather than the suffix
        shn = gt.get_shn(f).replace('.xlsx', '')
        outpath = os.path.join(opath, shn + '.txt')
        m.to_csv(outpath, sep='\t', index=False)

def run_granularity(path):
    """ run survey_granularity across all .gct files in a folder, accumulating a
    counter and saving any entries counted more than once to counter.xlsx """
    flist = gt.get_flist(path, '.gct')
    c = cll.Counter()
    for f in flist:
        d, h = gct.extractgct(f)
        survey_granularity(d, c)
    c = pd.Series(c, name='count')
    c.sort_values(ascending=False, inplace=True)
    c = c[c > 1]
    c.to_excel(os.path.join(path, 'counter.xlsx'))

def list_fails(path):
    """ print the failed wells listed in each map file in a folder """
    fl = gt.get_flist(path)
    for f in fl:
        if f.endswith('.txt'):
            m = pd.read_table(f, index_col=False)
        elif f.endswith('.xlsx'):
            m = pd.read_excel(f)
        else:
            continue
        fw = m[m['type'] == 'failed']['well'].values
        shn = gt.get_shn(f)
        print(shn, fw)

def rank_files(path):
    """ reads zscore gct files (probably consensus) from a folder and saves a
    file of ranks of each gene within each sample - from highest to lowest values """
    flist = gt.get_flist(path, '.gct')
    for f in flist:
        g = gct.Gct(f)
        print(f)
        d = g.build_dframe()
        ds = d.rank(ascending=False)
        outpath = os.path.join(os.path.split(path)[0], g.shortname + '-ranks.gct')
        gct.save_headergct(ds, outpath, g.fh)

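# rank semantics for reference: with ascending=False the largest value per
# column gets rank 1, e.g. pd.Series([2.5, -1.0, 0.3]).rank(ascending=False)
# yields 1.0, 3.0, 2.0
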
def summarize_csvs(path=None):
    """ provide path containing csv files to generate output summarizing levels
    1 and 10 for the plate as well as the posamp and ref """
    if path is None:
        path = gt.dflt_outpath(fldr_name='csv')
    results = cll.defaultdict(dict)
    f_list = gt.get_flist(path, '.csv')
    for file in f_list:
        try:
            c = Gcsv(file)
            d = c.build_dframe()
            results[c.shortname]['plate-L10'] = d['Analyte 10'].mean(axis=0)
            results[c.shortname]['Pos-L10'] = d.loc['B1']['Analyte 10']
            results[c.shortname]['Ref-L10'] = d.loc[['A2', 'B2']]['Analyte 10'].mean()
            results[c.shortname]['plate-L1'] = d['Analyte 1'].mean(axis=0)
        except Exception:
            print('error with ' + file)
    res = pd.DataFrame(results).T
    outpath = os.path.join(path, 'csv_summary.txt')
    res.to_csv(outpath, sep='\t', float_format='%.0f')

def bulk_open_as_gct(path, drop_inv=False):
    # no current support for repeat wells files
    flist = gt.get_flist(path, '.csv')
    # group csv files by plate shortname
    pdict = cll.defaultdict(list)
    for f in flist:
        shn = gt.get_shn(f)
        pdict[shn].append(f)
    for k, v in pdict.items():
        bsets = [x.split('_')[1] for x in v]
        if 'DP52' in bsets and 'DP53' in bsets and len(bsets) < 3:
            print(k, ' ok')
            df1 = open_as_gct(v[0], log=True)
            df2 = open_as_gct(v[1], log=True)
            df = pd.concat([df1, df2], axis=0)
            df.sort_index(inplace=True)
            if drop_inv is True:
                df = df[~df.index.str.contains('INV')]
            else:
                print('watch out, invariant genes included')
            outpath = os.path.join(path, k + '_pkexp.gct')
            gct.save_simplegct(df, outpath)
        else:
            print('error with: ', k)

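# assumes two csvs per plate carrying the bead set in the second underscore-separated
# field of the filename (hypothetical names): MKD031_DP52_a.csv + MKD031_DP53_a.csv
#   bulk_open_as_gct('/data/csvs', drop_inv=True)
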
def dl_data(sc='q', src='dflt', dest='dflt', search=None, excl=None, ext=None, p=False):
    """ download data from s3. 'sc' is shortcut, can be 'q' for qc, 'g' for gct,
    'z' for zscore, 'e' for enrichment, 'f' for final """
    if dest == 'dflt':
        dest = gt.dflt_outpath(fldr_name='newQC')
    elif dest == 'foo':
        dest = gt.dflt_outpath(fldr_name='foo')
    else:
        if '/' in dest or '\\' in dest:
            try:
                os.mkdir(dest)
            except FileExistsError:
                pass
        else:
            dest = gt.dflt_outpath(fldr_name=dest)
    tempdest = gt.dflt_outpath(fldr_name='temp_copy_transfer')
    s3c = boto3.client('s3')
    pref = '_review/'
    if src == 'dflt':
        # default to the most recent _review folder in the bucket
        items = s3c.list_objects(Bucket='genometry', Prefix=pref, Delimiter='/')
        folds = sorted([list(x.values())[0].replace(pref, '').strip('/')
                        for x in items['CommonPrefixes']])
        if len(folds) == 0:
            print('hm, zero files in list')
        fold = folds[-1]
        print('downloading from ', fold)
        src = 's3://genometry/' + pref + fold
        # grab PCA ppt and latest coordinates txt
        s3c.download_file('genometry', 'PCA_analysis/PCA2.pptx', os.path.join(dest, 'PCA.pptx'))
        coords = s3c.list_objects(Bucket='genometry', Prefix='PCA_analysis/')
        coord = sorted([x['Key'] for x in coords['Contents']])[-1]
        s3c.download_file('genometry', coord, os.path.join(tempdest, 'PCA_coords.txt'))
    else:
        src = 's3://genometry/' + src
    # parse shortcut into (include, exclude) search argument pairs
    search_args = []
    if 'q' in sc:
        search_args.append(('*_qc/*', ''))
    if 'g' in sc:
        search_args.append((['*_fullqnorm_*', '*_QNORM_sorted*', '*_ref_n*', '*_Qctrl_n*'], ''))
        ext = '.gct'
    if 'z' in sc:
        search_args.append(('*_ZSVCQNORM_*', ''))
        ext = '.gct'
    if 'e' in sc:
        search_args.append(('*_escore/*', '*.gct'))
    if 'f' in sc:
        search_args.append(('*_final/*', ''))
        dest = gt.dflt_outpath(fldr_name='finaldata')
        ext = ['.gct', '.txt']
    if 'i' in sc:
        search_args.append(('*_ZSVCINF_*', ''))
        ext = '.gct'
    if search is not None:
        if '*' not in search:
            search = '*' + search + '*'
        search_args.append((search, ''))
    if excl is not None:
        if '*' not in excl:
            excl = '*' + excl + '*'
        search_args.append(('', excl))
    for search, excl in search_args:
        cmd_str = f'aws s3 cp --recursive {src} {tempdest} --exclude "*"'
        if isinstance(search, str):
            search = [search]
        for st in search:
            if st:
                cmd_str += f' --include {st}'
        if excl:
            cmd_str += f' --exclude {excl}'
        print(cmd_str)
        subprocess.run(cmd_str, shell=True)
    if ext is not None:
        if isinstance(ext, str):
            ext = [ext]
        # flatten matching files up out of any subdirectories, then drop the dirs
        for ex in ext:
            fl = gt.globit(tempdest, f'*{ex}')
            for f in fl:
                shutil.move(f, os.path.join(tempdest, os.path.basename(f)))
        subdirs = [x[0] for x in os.walk(tempdest)][1:]
        for sd in subdirs:
            try:
                shutil.rmtree(sd)
            except OSError:
                pass
    # do final copy from temp to main destination, erase temp
    for f in gt.get_flist(tempdest):
        shutil.move(f, os.path.join(dest, os.path.basename(f)))
    shutil.rmtree(tempdest)
    if p is True:
        try:
            process_qc()
        except Exception:
            print('error processing qc')

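# usage sketch: shortcut letters combine, so sc='qz' pulls qc folders plus
# ZSVCQNORM zscore gcts from the latest s3 _review folder, filtered to one plate
#   dl_data(sc='qz', search='MKD031')
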
def check_maps(path, compare=True, img=True, v=True, filt=True):
    """ looks through .txt and .xlsx maps in a directory and summarizes their
    content and relationship with each other, as well as generating plate
    visualizations of type and batch for each plate. v for verbose, lists
    names and doses per plate.
    if filt is True, only consider files with 6-character plate names,
    otherwise consider all files """
    # plot_fields = ['type', 'batch']
    plot_fields = ['type']
    # add checks to add batch and dose if present
    pert_dict, wellpert_dict = {}, {}
    flist = gt.get_flist(path, ext='.xlsx')
    if filt is True:
        # 6-char name + '.xlsx' = 11 chars; the .txt fallback is 6 + '.txt' = 10
        flist = [x for x in flist if len(os.path.split(x)[-1]) == 11]
    if len(flist) == 0:
        flist = gt.get_flist(path, ext='.txt')
        flist = [x for x in flist if len(os.path.split(x)[-1]) == 10]
    if v is True:
        print('flist = ', flist)
    awells = gt.get_awells()
    composition = pd.DataFrame(columns=['wells #', 'test #', 'doses', 'dose/trt', '# names',
                                        'vehicle #', 'poscon #', 'poscons'])
    for f in flist:
        pname = gt.get_shn(f).split('.')[0]
        print(pname)
        if f.endswith('.xlsx'):
            m = pd.read_excel(f)
        elif f.endswith('.txt'):
            m = pd.read_table(f, index_col=False)
        m.sort_index(inplace=True)
        batches = m['batch'].dropna().unique()
        if any([('dose' in x) or ('dilution' in x) for x in m.columns]):
            dose_field = [x for x in m.columns if ('dose' in x) or ('dilution' in x)][0]
        else:
            dose_field = None
        headers = {'wells #': lambda x: len(x.index),
                   'test #': lambda x: ctup(x, {'type': 'test'}, 'well')}
        if dose_field is not None:
            headers.update({
                'doses': lambda x: ctup(x, {'type': 'test'}, dose_field),
                'dose/trt': lambda x: gt.hsub(m, {'type': 'test'})[dose_field].groupby(
                    m['name']).unique().apply(lambda y: len(y)).mean()})
        else:
            headers.update({'doses': 'na', 'dose/trt': 'na'})
        headers.update({
            '# names': lambda x: ctup(x, {'type': 'test'}, 'name'),
            'vehicle #': lambda x: ctup(x, {'type': 'vehicle'}, 'well'),
            'poscon #': lambda x: ctup(x, {'type': 'poscon'}, 'well'),
            'poscons': lambda x: ctup(x, {'type': 'poscon'}, 'name', lst=True)})
        summary = pd.DataFrame(columns=headers)
        # check wells for full plate
        well_result = set(awells) - set(m['well'].values)
        if len(well_result) != 0:
            print('{} wells error, {} entries'.format(pname, len(m.index)))
        if v is True:
            print(gt.hsub(m, {'type': 'test'})['name'].dropna().unique())
            try:
                doselist = gt.hsub(m, {'type': 'test'})[dose_field].dropna().unique()
                print(doselist)
            except (KeyError, TypeError):
                print('error with dose col, ', dose_field)
        # summarize the header info per batch, and assemble pert-lists
        # for the overlap comparisons
        for b in batches:
            entry = pname + '-' + b
            ms = gt.hsub(m, {'batch': b}).copy()
            # gather pert names for overlap comparison
            pert_dict[entry] = ctup(m, {'batch': b, 'type': 'test'}, 'name', lst=True)
            # get the well-pert identities for same plate comparison
            ms.loc[:, 'addr'] = ms['well'] + '-' + ms['name'].apply(lambda x: str(x))
            wellpert_dict[entry] = ms['addr'].values
            for k in headers.keys():
                try:
                    summary.loc[entry, k] = headers[k](ms)
                except (KeyError, TypeError):
                    summary.loc[entry, k] = 'na'
        composition = pd.concat([composition, summary])
        if img is True:
            for pf in plot_fields:
                plot_series = m[pf]
                if len(plot_series.dropna().unique()) > 1:
                    plot_series.name = pname + ' ' + pf
                    plate_map_vis(plot_series, path=path)
    composition.to_excel(os.path.join(path, 'batch_composition.xlsx'))
    if compare is True:
        same_plates = gt.overlap_matrix(wellpert_dict.values(), wellpert_dict.keys())
        name_overlap = gt.overlap_matrix(pert_dict.values(), pert_dict.keys())
        name_overlap.to_excel(os.path.join(path, 'name_overlaps.xlsx'))
        same_plates.to_excel(os.path.join(path, 'well-name_overlaps.xlsx'))
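
# usage sketch (hypothetical map folder): writes batch_composition.xlsx plus the
# two overlap matrices, and saves a plate visualization of 'type' for each map
#   check_maps('/data/maps', compare=True, img=True, v=False)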