import errno
import os
import pickle
import shutil
import subprocess
import sys

import boto3
import pandas as pd

# local helper modules for globbing/gct file handling (import style assumed)
import gct
import gt


def delete_inf(path):
    """ remove *INF* files and '_final' folders (except 'finalqc') under the given path """
    filelist = gt.globit(path, '*INF*')
    folderlist = gt.globit(path, '*_final*')
    folderlist = [x for x in folderlist if 'finalqc' not in x]
    for f in filelist:
        print(f)
        os.remove(f)
    for f in folderlist:
        print(f)
        shutil.rmtree(f)
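
# A minimal usage sketch (hypothetical path). delete_inf() removes files and
# folders permanently, so double-check the target directory first:
#   delete_inf('/path/to/project')   # drops *INF* files and non-finalqc '_final' folders
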

def predict_cells(input, save=False):
    """ can accept a directory (looping through files) or one dataframe/series at a time;
    uses v1.0 of the SVM classifier to consolidate reps to a consensus and return a
    prediction. when save is True a dataframe will be saved """
    with open('/Users/WRB/Dropbox/bin/python/celllineclassifier.p', 'rb') as file:
        clf = pickle.load(file)
    if isinstance(input, str):
        if os.path.isdir(input):
            vlist = gt.globit(input, '*_Qctrl_n*')
            if len(vlist) == 0:
                vlist = gt.globit(input, '*QNORM*')
        else:
            vlist = [input]
    elif isinstance(input, pd.Series):
        try:
            res = clf.predict([input])[0]
        except Exception:
            print('error with series prediction')
            res = None
        return res
    else:
        vlist = input
    res_table = pd.DataFrame()
    for f in vlist:
        try:
            d, h = gct.extractgct(f)
        except Exception:
            # assumes a pre-extracted [data, header] pair was passed instead of file paths
            d, h = vlist[0], vlist[1]
        ds, hs = gt.dsub(d, h, {'type': 'vehicle'})
        if len(ds) == 0:
            print('error, maybe using ZS file? use QNORM instead')
            return None
        for b in hs.batch.unique():
            dsb, hsb = gt.dsub(ds, hs, {'batch': b})
            med = dsb.median(axis=1).values
            shn = gt.get_shn(f) + '-' + b
            res = clf.predict([med])[0]
            res_table.loc[shn, 'cell'] = res
            print(f'{shn} - {res}')
    if save is True:
        res_table.to_csv(gt.dflt_outpath(fn='cell_predictions.csv'), sep='\t')
    return res_table
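
# Usage sketch (hypothetical paths; assumes the classifier pickle above exists):
#   res_table = predict_cells('/path/to/newQC', save=True)   # directory of Qctrl/QNORM files
#   res = predict_cells(vehicle_median_series)               # single pd.Series of vehicle medians
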

def get_vehicle_matrix(path='dflt', batch='all', delim=':', getcells=False):
    """ for the path, load all files and collapse vehicles, plot matrix.
    batch can be 'all' or 'A' to just take the first batch. getcells will re-predict cells """
    path = gt.check_dfltarg(path, os.path.join(gt.check_desktop(), 'newQC'))
    flv = gt.globit(path, '*Qctrl*')
    if len(flv) == 0:
        flv = gt.globit(path, '*_QNORM_*')
    # should put in a check to extract from regular qnorms
    dlist, hlist = [], []
    for f in flv:
        d, h = gct.extractgct(f)
        h['plate'] = h['plate'].apply(lambda x: x[:6])
        d, h = gt.dsub(d, h, {'type': 'vehicle'})
        if batch == 'all':
            for b in h.batch.unique():
                ds, hs = gt.dsub(d, h, {'batch': b})
                med = ds.median(axis=1)
                hs = gt.gen_label(hs, 'pb', delim=delim)
                dlist.append(med)
                hlist.append(hs.iloc[0])
        elif batch == 'A':
            ds, hs = gt.dsub(d, h, {'batch': 'A'})
            med = ds.median(axis=1)
            hs = gt.gen_label(hs, 'pb', delim=delim)
            dlist.append(med)
            hlist.append(hs.iloc[0])
        else:
            med = d.median(axis=1)
            hs = gt.gen_label(h, 'p', delim=delim)
            dlist.append(med)
            hlist.append(hs.iloc[0])
    vdf = pd.concat(dlist, axis=1)
    vh = pd.DataFrame(hlist)
    vdf.columns = vh.label
    if getcells is True:
        vh['cell2'] = vh.label.apply(lambda x: predict_cells(vdf[x]))
        vh['label'] = vh.label + delim + vh.cell2
        vdf.columns = vh.label
    return vdf, vh
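
# Usage sketch (assumes Qctrl/QNORM gct files sit under ~/Desktop/newQC):
#   vdf, vh = get_vehicle_matrix(batch='A', getcells=False)
#   # vdf holds one median vehicle profile per plate/batch, columns labeled via gen_label
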

def assemble_ref_dat(path):
    """ gather together all reference RNA wells within the given path """
    fl = gt.globit(path, '*_ref_n*')
    dl, hl = [], []
    for f in fl:
        dr, hr = gct.extractgct(f)
        dr, hr = gt.dsub(dr, hr, {'well': ['A02', 'B02']})
        dr = round(dr, 2)
        dl.append(dr)
        hl.append(hr)  # headers are collected but only the data matrix is returned
    alldata = pd.concat(dl, axis=1)
    return alldata
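
# Usage sketch (hypothetical path; expects '*_ref_n*' gct files with A02/B02 reference wells):
#   refs = assemble_ref_dat('/path/to/project')
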

def run_plate_analysis(mode='ind', cats='nd', path='dflt'):
    """ runs standard analysis on each plate individually ('ind') or all together ('comb').
    most useful for plates with doses. the default path is newQC on the desktop """
    path = gt.check_dfltarg(path, os.path.join(gt.check_desktop(), 'newQC'))
    fl = gt.globit(path, '*ZSVCQNORM*')
    print(fl)
    if mode == 'comb':
        dl, hl = [], []
        for i, f in enumerate(fl):
            d, h = gct.extractgct(f)
            if i == 0:
                try:
                    pname = d.name + '+'
                except AttributeError:
                    pname = h.addr[0].split(':')[0] + '+'
            if len(h.batch.unique()) > 1:
                # fix sample labels for plate/batch
                h.plate = h.plate + h.batch
            # define labels (should I add plate?)
            h = gt.gen_label(h, cats)
            dl.append(d)
            hl.append(h)
        try:
            d = pd.concat(dl, axis=1)
            d.name = pname
        except ValueError:
            sys.exit('no gct file plates to analyze')
        h = pd.concat(hl, axis=0)
        analyze_plate(d, h, cats)
    elif mode == 'ind':
        for f in fl:
            d, h = gct.extractgct(f)
            # define labels (should I add plate?)
            h = gt.gen_label(h, cats)
            analyze_plate(d, h, cats)
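
# Usage sketch: 'ind' analyzes each ZSVCQNORM file alone, 'comb' concatenates them first.
#   run_plate_analysis(mode='comb', cats='nd')   # cats is the gen_label category string
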

def distribute_qc(path='dflt'):
    """ copy key QC plot images from newQC into sorted QCprocessing subfolders """
    if path == 'dflt':
        inpath = gt.dflt_outpath(fldr_name='newQC')
    else:
        inpath = path
    outpath = gt.dflt_outpath(fldr_name='QCprocessing')
    folders = ['calibs', 'flogps', 'escore', 'cellid-nolabel', 'cellid-label', 'euclidean']
    folders = [os.path.join(outpath, x) for x in folders]
    srch_terms = ['finalqc/*calibplot', 'finalqc/*FLOGP', 'escore_summary*/',
                  '*cell_line/*cellid_nolabel/*-*cellid_circle',
                  '*cell_line/*-*cellid_circle', '-*euclidean']
    for term, fold in zip(srch_terms, folders):
        try:
            os.makedirs(fold)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        srch = '*'.join(['', term, '']) + '.png'
        for file in gt.globit(inpath, srch):
            shutil.copy(file, fold)
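
# Usage sketch (assumes newQC holds finalqc/escore/cell_line subfolders of .png output):
#   distribute_qc()   # sorts plots into QCprocessing/{calibs, flogps, escore, ...}
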

def combine_fails(path='dflt', ret=False, summ=False, sep=False, thresh=1):
    """ concatenate individual QC_fail files under path into one summary table """
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='newQC')
    fl = gt.globit(path, '*QC_fail*')
    files = ' '.join(fl)
    # cmd_str = 'cat ' + files + ' > ' + os.path.join(path, 'QC_fail.txt')
    # subprocess.run(cmd_str, shell=True)
    datlist = []
    for f in fl:
        dat = pd.read_csv(f, sep='\t', skiprows=1)
        dropcols = [x for x in dat.columns if 'Unnamed' in x]
        dat = dat.drop(dropcols, axis=1)
        dat.dropna(inplace=True)
        try:
            dat = dat[dat['Batch'] != ' ']
        except KeyError:
            pass
        if sep is False:
            try:
                dat = dat[dat['Batch'] != 'Batch']
            except KeyError:
                pass
        datlist.append(dat)
    data = pd.concat(datlist, axis=0)
    data.to_csv(os.path.join(path, 'QCfail_summary.txt'), sep='\t')
    if summ is True:
        gbname = data.groupby('PERT_DESC').size()
        print(gbname[gbname > thresh])
        gbbatch = data.groupby('Batch').size()
        print(gbbatch[gbbatch > thresh])
    # this subsets down to show how many doses totally fail (3 reps each) per name
    # g = f.groupby(['PERT_DESC', 'DOSE']).size()
    # res = g[g > 2].groupby('PERT_DESC').size().sort_values(ascending=False)
    if ret is True:
        return data
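
# Usage sketch:
#   fails = combine_fails(summ=True, ret=True, thresh=2)  # print perts/batches failing > 2x
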

def dl_data(sc='q', src='dflt', dest='dflt', search=None, excl=None, ext=None, p=False):
    """ download data from s3. 'sc' is a shortcut string: 'q' for qc, 'g' for gct,
    'z' for zscore, 'e' for enrichment, 'f' for final """
    if dest == 'dflt':
        dest = gt.dflt_outpath(fldr_name='newQC')
    elif dest == 'foo':
        dest = gt.dflt_outpath(fldr_name='foo')
    else:
        if '/' in dest or '\\' in dest:
            try:
                os.mkdir(dest)
            except OSError:
                pass
        else:
            dest = gt.dflt_outpath(fldr_name=dest)
    tempdest = gt.dflt_outpath(fldr_name='temp_copy_transfer')
    s3c = boto3.client('s3')
    pref = '_review/'
    if src == 'dflt':
        items = s3c.list_objects(Bucket='genometry', Prefix=pref, Delimiter='/')
        folds = sorted([list(x.values())[0].replace(pref, '').strip('/') for x in items['CommonPrefixes']])
        if len(folds) == 0:
            print('hm, zero files in list')
            return
        # take the most recent review folder
        fold = folds[-1]
        print('downloading from ', fold)
        src = 's3://genometry/' + pref + fold
        # grab PCA ppt and latest coordinates txt
        s3c.download_file('genometry', 'PCA_analysis/PCA2.pptx', os.path.join(dest, 'PCA.pptx'))
        coords = s3c.list_objects(Bucket='genometry', Prefix='PCA_analysis/')
        coord = sorted([x['Key'] for x in coords['Contents']])[-1]
        s3c.download_file('genometry', coord, os.path.join(tempdest, 'PCA_coords.txt'))
    else:
        src = 's3://genometry/' + src
    search_args = []
    # parse shortcut
    if 'q' in sc:
        search_args.append(('*_qc/*', ''))
    if 'g' in sc:
        search_args.append((['*_fullqnorm_*', '*_QNORM_sorted*', '*_ref_n*', '*_Qctrl_n*'], ''))
        ext = '.gct'
    if 'z' in sc:
        search_args.append(('*_ZSVCQNORM_*', ''))
        ext = '.gct'
    if 'e' in sc:
        search_args.append(('*_escore/*', '*.gct'))
    if 'f' in sc:
        search_args.append(('*_final/*', ''))
        dest = gt.dflt_outpath(fldr_name='finaldata')
        ext = ['.gct', '.txt']
    if 'i' in sc:
        search_args.append(('*_ZSVCINF_*', ''))
        ext = '.gct'
    if search is not None:
        if '*' not in search:
            search = '*' + search + '*'
        search_args.append((search, ''))
    if excl is not None:
        if '*' not in excl:
            excl = '*' + excl + '*'
        search_args.append(('', excl))
    for search, excl in search_args:
        cmd_str = f'aws s3 cp --recursive {src} {tempdest} --exclude "*"'
        if isinstance(search, str):
            search = [search]
        for st in search:
            cmd_str += f' --include "{st}"'
        if excl != '':
            cmd_str += f' --exclude "{excl}"'
        print(cmd_str)
        subprocess.run(cmd_str, shell=True)
    if ext is not None:
        # pull matching files up out of subfolders, then clear the emptied subdirs
        if isinstance(ext, str):
            ext = [ext]
        for ex in ext:
            fl = gt.globit(tempdest, f'*{ex}')
            for f in fl:
                file_dest = os.path.join(tempdest, os.path.basename(f))
                shutil.move(f, file_dest)
        subdirs = [x[0] for x in os.walk(tempdest)][1:]
        for sd in subdirs:
            try:
                shutil.rmtree(sd)
            except OSError:
                pass
    # do final copy from temp to main destination, erase temp
    for f in gt.get_flist(tempdest):
        file_dest = os.path.join(dest, os.path.basename(f))
        shutil.move(f, file_dest)
    shutil.rmtree(tempdest)
    if p is True:
        try:
            process_qc()
        except Exception:
            print('error processing qc')
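
# Usage sketch (requires a configured aws CLI plus boto3 credentials for the
# 'genometry' bucket; the search string is hypothetical):
#   dl_data(sc='qz', search='PROJ01')   # qc + zscore files matching *PROJ01*
#   dl_data(sc='f')                     # final data into the 'finaldata' folder
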