def analyze_query(args, work_dir):
    '''
    Analyze the output from query_tool - find self-connections and create graphs

    At present the function only loads the input dataset named by
    ``args.res`` and reports which query-result source would be used; the
    reads of the result gctx itself are still commented out.

    Parameters
    ----------
    args : object
        must expose ``res`` and ``result``; ``resultDir`` is read only when
        ``result`` is falsy
    work_dir : str
        directory searched for a ``*COMBINED*.gctx`` file
    '''
    # load the input dataset
    dataset = gct.GCT()
    dataset.read(args.res)
    # container for the query result (gctx file) - not populated yet
    rslt = gct.GCT()
    use_explicit_dir = not args.result
    if use_explicit_dir:
        print(args.resultDir)
        #rslt.read(args.resultDir)
        print('read gct from explicitly stated result dir')
    else:
        # select combined result gctx in working dir created from build_query step
        combined_pattern = os.path.join(work_dir, '*COMBINED*.gctx')
        outGctx = glob.glob(combined_pattern)
        #rslt.read(outGctx[0])
        print('read gct from working dir')
        print(args.result)
def _read_summly_columns(mtrxSummly, col_inds):
    "read the given columns of the summly matrix and label them (pert_id, sig_id)"
    gt = gct.GCT(mtrxSummly)
    gt.read(col_inds=list(col_inds))
    frm = gt.frame
    pert_ids = gt.get_column_meta('pert_id')
    sig_ids = gt.get_column_meta('sig_id')
    # hierarchical index - pert_id and sig_id; materialise the pairs so the
    # call does not depend on zip() returning a list
    frm.columns = pd.MultiIndex.from_tuples(list(zip(pert_ids, sig_ids)),
                                            names=['pert_id', 'sig_id'])
    return frm


def load_summly_independent(iGold, mtrxSummly):
    '''
    load dos compounds that have independent mode results - return dataframe

    Parameters
    ----------
    iGold : pandas.Series
        values are the column indices (in the summly matrix) of the DOS
        compounds
    mtrxSummly : str
        path of the summly matrix gctx file

    Returns
    -------
    (inSum, outSum) : tuple of pandas.DataFrame
        ``inSum`` holds the columns listed in ``iGold``; ``outSum`` holds
        every other column. Both have (pert_id, sig_id) column MultiIndexes.
    '''
    gold_inds = iGold.values
    inSum = _read_summly_columns(mtrxSummly, gold_inds)
    # read the metadata only, to learn how many columns the matrix has
    gt = gct.GCT()
    gt.read_gctx_col_meta(mtrxSummly)
    gt.read_gctx_row_meta(mtrxSummly)
    indSummSigs = gt.get_column_meta('sig_id')
    # every column index that is not a gold (DOS) index
    iNonDos = np.delete(np.arange(len(indSummSigs)), gold_inds)
    # read in non-dos results
    outSum = _read_summly_columns(mtrxSummly, iNonDos)
    return inSum, outSum  # dataframes of rankpt values
def get_summly_ind_compounds(dosGold, mtrxSummly):
    '''
    return non-DOS sig_ids of compounds that are in the summly matrix

    Only columns whose ``pert_type`` is ``'trt_cp'`` are considered, and any
    sig_id listed in ``dosGold['sig_id']`` is excluded.

    Returns:
    pandas series - index = sig_ids, values = indices in summly matrix
    '''
    meta = gct.GCT()
    meta.read_gctx_col_meta(mtrxSummly)
    meta.read_gctx_row_meta(mtrxSummly)
    sig_ids = meta.get_column_meta('sig_id')
    pert_types = meta.get_column_meta('pert_type')
    pert_inames = meta.get_column_meta('pert_iname')
    # compound signatures present in the summly matrix
    type_by_sig = pd.Series(index=sig_ids, data=pert_types)
    compound_sigs = set(type_by_sig[type_by_sig == 'trt_cp'].index)
    # drop the ones that belong to the DOS gold set
    wanted = compound_sigs.difference(set(dosGold['sig_id'].values))
    # map sig_id -> position in the summly matrix
    positional = pd.Series(sig_ids)
    index_by_sig = pd.Series(index=positional.values, data=positional.index)
    keep_mask = index_by_sig.index.isin(wanted)
    return index_by_sig[keep_mask]
def _read_first_column(path):
    '''return the first tab-separated field of every line in the file at path'''
    with open(path) as infile:
        return [line.strip('\n').split('\t')[0] for line in infile]


def main():
    '''
    Extract the landmark and target gene rows from the GTEx gctx file and
    save them, in that order, as a float64 array in ``GTEx_float64.npy``.

    Reads the module-level paths ``GTEx_GCTX``, ``BGEDV2_LM_ID`` and
    ``BGEDV2_TG_ID``. Raises ValueError when a listed gene id is not among
    the GTEx row ids.
    '''
    GTEx_gctobj = gct.GCT(GTEx_GCTX)
    GTEx_gctobj.read()
    # keep only the part of each row id that precedes the first '.'
    GTEx_genes = [rid.split('.')[0] for rid in GTEx_gctobj.get_rids()]

    lm_id = _read_first_column(BGEDV2_LM_ID)
    tg_id = _read_first_column(BGEDV2_TG_ID)

    # row position (first occurrence) of every requested gene
    lm_idx = [GTEx_genes.index(gene) for gene in lm_id]
    tg_idx = [GTEx_genes.index(gene) for gene in tg_id]
    genes_idx = lm_idx + tg_idx

    data = GTEx_gctobj.matrix[genes_idx, :].astype('float64')
    np.save('GTEx_float64.npy', data)
def _sort_images_by_dose(image_paths, unit_suffix=''):
    '''order image paths by the dose held in the second '_'-separated field of the file name'''
    doses = [float(os.path.basename(path).split('_')[1].rstrip(unit_suffix))
             for path in image_paths]
    # an empty input simply yields an empty list
    return [path for _, path in sorted(zip(doses, image_paths))]


def build_html(work_dir):
    '''
    builds summary html files from templates

    Writes ``index.html`` plus one ``<pert_desc>_detail.html`` page per
    unique pert into ``work_dir``. The dataset is located through the first
    ``*.rpt`` file found in ``work_dir``; an IndexError is raised when there
    is none.
    '''
    # grab the cids from the file. Find all of the unique perts
    rpt_dict = tool_ops.parse_rpt(glob.glob(work_dir + '/*.rpt')[0])
    gcto = gct.GCT(rpt_dict['res'])
    gcto.read()
    cids = gcto.get_gctx_cid()
    pert_descs = gcto.get_column_meta('pert_desc')
    perts = [x.split(':')[1] for x in cids]
    pert_desc_dict = dict(zip(perts, pert_descs))
    unique_perts = sorted(set(perts))

    # build an environment for jinja2
    cmap_base_dir = '/'.join(os.path.dirname(cmap.__file__).split('/')[0:-1])
    env = jinja2.Environment(loader=jinja2.FileSystemLoader(cmap_base_dir + '/templates'))

    # build an index page
    index_page_template = env.get_template('Link_List_Template.html')
    index_links = [pert_desc_dict[x] + '_detail.html' for x in unique_perts]
    with open(os.path.join(work_dir, 'index.html'), 'w') as f:
        f.write(index_page_template.render(title='Dose Analysis Results',
                                           links=index_links,
                                           labels=unique_perts))

    # for each unique_pert, make a detail page
    dose_response_compound_summary_template = env.get_template(
        'Dose_Response_Compound_Summary_Template.html')
    for unique_pert in unique_perts:
        pert_desc = pert_desc_dict[unique_pert]
        query_images = _sort_images_by_dose(
            glob.glob(os.path.join(work_dir, pert_desc + '*query_rank.png')))
        if unique_pert != 'DMSO':
            # qq image names carry a trailing 'um' on the dose field
            qq_images = _sort_images_by_dose(
                glob.glob(os.path.join(work_dir, pert_desc + '*um_internal-external_qq.png')),
                unit_suffix='um')
        else:
            # DMSO pages carry no qq plots
            qq_images = []
        with open(os.path.join(work_dir, pert_desc + '_detail.html'), 'w') as f:
            f.write(dose_response_compound_summary_template.render(
                title=pert_desc,
                query_images=query_images,
                qq_images=qq_images))
def gct2gctx(filepath):
    '''
    convert a .gct file to .gctx, written next to the input

    The output name is ``filepath`` with every '.gct' replaced by '.gctx'.
    Failures while reading or writing are reported on stdout instead of
    being raised, so a batch of files can keep going.
    '''
    g = gct.GCT()
    try:
        print("Reading...")
        g._read_gct(filepath)
        print("Writing...")
        g.write(filepath.replace('.gct', '.gctx'), mode='gctx')
    except Exception as err:
        # catch Exception rather than everything so Ctrl-C / SystemExit
        # still stop the program; include the cause in the report
        print("ERROR: could not process %s (%s)" % (filepath, err))
def main():
    '''
    Dump the matrix of a gct/gctx file to disk with ``np.save``.

    ``sys.argv[1]`` is the input dataset and ``sys.argv[2]`` is the output
    path; the matrix is converted to float64 before saving.
    '''
    src_path = sys.argv[1]
    dst_path = sys.argv[2]
    dataset = gct.GCT(src_path)
    dataset.read()
    as_float64 = dataset.matrix[:, :].astype('float64')
    np.save(dst_path, as_float64)
def get_summly_dos_indeces(dosBrds, mtrxSummly):
    '''
    return dos compounds that are in summly matched space

    Reads the whole summly matrix, pairs each row label with the column
    label at the same position, and keeps the entries whose row label is in
    ``dosBrds``.

    Returns:
    pandas series - index = row labels found in dosBrds, values = column labels
    '''
    summly = gct.GCT()
    summly.read(mtrxSummly)
    frame = summly.frame
    col_by_row = pd.Series(index=frame.index, data=frame.columns)
    in_dos = col_by_row.index.isin(dosBrds)
    return col_by_row[in_dos]
def load_file(filename):
    '''
    load the gct file using cmap/Zichen python script

    Returns the ``gct.GCT`` object after a non-verbose ``read()``.
    '''
    # imported here so the cmap package is needed only when this is called
    import cmap.io.gct as gct
    GCTObject = gct.GCT(filename)
    GCTObject.read(verbose=False)
    return GCTObject
def build_query(args, work_dir):
    '''
    build query results

    For every column of the dataset ``args.res`` the 50 highest and 50
    lowest scoring probes are written as one line of ``up_list.gmt`` /
    ``dn_list.gmt`` in ``work_dir`` (existing files are overwritten), then
    ``query_tool`` is launched on those two files.

    Side effects: changes the process working directory to ``work_dir`` and
    runs an external command through ``os.system``.
    '''
    fup = os.path.join(work_dir, 'up_list.gmt')
    fdn = os.path.join(work_dir, 'dn_list.gmt')
    n_edge = 50
    db = gct.GCT()
    db.read(args.res)
    cids = db.get_cids()
    doses = [float(x.split(':')[2]) for x in cids]
    perts = db.get_column_meta('pert_desc')
    probes = db.get_rids()
    cellLs = db.get_column_meta('cell_id')
    timePs = db.get_column_meta('pert_time')
    mtrx = db.matrix  # matrix of data from gct file

    # make signature for each dose; both files are opened once and closed
    # before the query tool runs
    with open(fup, 'w') as f_up, open(fdn, 'w') as f_dn:
        # loop through each column of data
        for i, dose in enumerate(doses):
            # row indices ordered from highest to lowest score
            iprofile = mtrx[:, i].argsort()[::-1]
            ptop = [probes[it] for it in iprofile[:n_edge]]
            pbot = [probes[ib] for ib in iprofile[-n_edge:]]
            col_name = perts[i] + '_' + str(dose) + 'um_' + cellLs[i] + '_' + timePs[i]
            # gmt line: name, description, then tab-terminated probe ids
            prefix = col_name + '\t' + col_name + '\t'
            f_up.write(prefix + ''.join(pt + '\t' for pt in ptop) + '\n')
            f_dn.write(prefix + ''.join(pb + '\t' for pb in pbot) + '\n')

    # python system call
    os.chdir(work_dir)
    #cmd = 'rum -q local query_tool --uptag ' + fup + ' --dntag ' + fdn + ' --metric eslm'
    cmd = 'rum -q local query_tool --uptag ' + fup + ' --dntag ' + fdn + ' --metric wteslm --mkdir false'
    os.system(cmd)
def build_probe_curves(args,work_dir):
    '''
    builds dose response curves for the specified probe

    For every unique pert found in the column ids of ``args.res`` a
    dose/z-score plot is saved in ``work_dir``, and one line per pert is
    written to ``<probe>_summary.txt`` with the base dose and the dose whose
    z-score lies furthest below the base dose's z-score.
    '''
    # instantiate a progress object
    prog = progress.DeterminateProgressBar('Dose Analysis')

    # read the specified probe from the input gctx file
    gcto = gct.GCT()
    probe_ind = gcto.get_gctx_rid_inds(args.res,match_list=args.probe,exact=True)
    gcto.read_gctx_matrix(args.res,row_inds=probe_ind)

    # grab the cids from the file and mine dose information from them.
    # Find all of the unique perts
    cids = gcto.get_gctx_cid(args.res)
    doses = [float(x.split(':')[2]) for x in cids]
    perts = [x.split(':')[1] for x in cids]
    unique_perts = list(set(perts))
    num_perts = len(unique_perts)

    CM = mu.CMapMongo()
    with open(os.path.join(work_dir,args.probe + '_summary.txt'),'w') as f:
        headers = ['pert_id','pert_desc','base_dose','base_z_score',
                   'best_dose','best_z_score', 'best_z_score_delta']
        f.write('\t'.join(headers) + '\n')
        for i,unique_pert in enumerate(unique_perts):
            prog.update('analyzing {0}'.format(args.probe),i,num_perts)

            # grab the z-scores and doses for the current pert and sort the
            # pairs by dose
            cid_inds = [k for k,cid in enumerate(cids) if unique_pert in cid]
            pert_scores = gcto.matrix[0,cid_inds]
            pert_doses = [doses[k] for k in cid_inds]
            pert_doses,pert_scores = zip(*sorted(zip(pert_doses,pert_scores)))

            # build the dose response plot for the current pert and save it
            plt.plot(pert_doses,pert_scores)
            plt.title('::'.join([unique_pert,args.probe]))
            plt.xlabel('dose')
            plt.ylabel('z-score')
            plt.savefig(os.path.join(work_dir,'_'.join([unique_pert.replace(':','_'),args.probe,'dose_curve.png'])))
            plt.close()

            # grab the pert_desc from mongo; '-666' marks a missing entry
            pert_desc = CM.find({'pert_id':unique_pert},{'pert_desc':True},limit=1)
            if not pert_desc:
                pert_desc = ['-666']
            pert_desc = pert_desc[0]

            # the lowest dose is the reference; "best" is the most negative
            # deviation from its z-score
            base_dose = pert_doses[0]
            base_z_score = pert_scores[0]
            z_delta = (numpy.array(pert_scores) + 10) - (base_z_score + 10)
            z_delta = z_delta.tolist()
            best_ind = z_delta.index(numpy.min(z_delta))
            best_dose = pert_doses[best_ind]
            best_z_score = pert_scores[best_ind]
            best_z_score_delta = z_delta[best_ind]
            data = [unique_pert,pert_desc,str(base_dose),str(base_z_score),
                    str(best_dose),str(best_z_score),str(best_z_score_delta)]
            f.write('\t'.join(data) + '\n')
    prog.clear()
def get_summly_dos_indeces(dosGold,mtrxSummly):
    '''
    obtain indices for all DOS compounds in the pre computed summly matrix

    Returns:
    pandas series - index = sig_ids present in both ``dosGold['sig_id']``
    and the summly matrix, values = column indices in the summly matrix
    '''
    meta = gct.GCT()
    meta.read_gctx_col_meta(mtrxSummly)
    meta.read_gctx_row_meta(mtrxSummly)
    sig_ids = meta.get_column_meta('sig_id')
    inames = meta.get_column_meta('pert_iname')
    iname_by_sig = pd.Series(index=sig_ids, data=inames)
    # which dos cps are in the summly independent matrix
    in_summly = set(iname_by_sig.index)
    gold_sigs = set(dosGold['sig_id'].values)
    shared = in_summly.intersection(gold_sigs)
    # get indices of gold DOS: map sig_id -> position in the matrix
    positional = pd.Series(sig_ids)
    index_by_sig = pd.Series(index=positional.values,data=positional.index)
    return index_by_sig.reindex(list(shared))
def write_pairwise_mtrx(self, inames_zip, mtrx, out):
    '''
    Write a square pairwise matrix to disk, as text and through gct

    The same (brd, iname) labels are used for rows and columns. A
    tab-separated copy goes to ``out + '.txt'``; the frame is also handed to
    a ``gct.GCT`` object and written to ``out``.

    Parameters
    ----------
    inames_zip : list of tuples
        brds paired with inames
    mtrx : numpy.ndarray
        matrix of data
    out : str
        output path - no file extension
    '''
    labels = pd.MultiIndex.from_tuples(inames_zip, names=['brd', 'iname'])
    score_frame = pd.DataFrame(mtrx, index=labels, columns=labels)
    text_path = out + '.txt'
    score_frame.to_csv(text_path, sep='\t')
    writer = gct.GCT()
    writer.build_from_DataFrame(score_frame)
    writer.write(out)
def add_from_gct(self,src,ss_column_name='distil_ss', cc_column_name='distil_cc_q75'):
    '''
    reads the meta data of the given gct or gctx file and appends it to
    the data already held by this object

    Parameters
    ----------
    src : str
        path of the gct/gctx file; also stored as ``self.src``
    ss_column_name : str
        column metadata field appended (as floats) to ``self.s``
    cc_column_name : str
        column metadata field appended (as floats) to ``self.c``
    '''
    #set the src for the SC object
    self.src = src

    #read in the gct data
    gct_obj = gct.GCT(src=src)
    gct_obj.read()

    #grab the ss and cc data plus the annotations kept alongside them
    s = gct_obj.get_column_meta(ss_column_name)
    c = gct_obj.get_column_meta(cc_column_name)
    pert_descs = gct_obj.get_column_meta('pert_desc')
    pert_ids = gct_obj.get_column_meta('pert_id')
    doses = gct_obj.get_column_meta('pert_dose')
    id_list = gct_obj.get_column_meta('id')
    #pid is '<column id>::<pert_desc>'
    pid = [col_id + '::' + desc for col_id, desc in zip(id_list, pert_descs)]

    #ensure that s and c are lists not numpy arrays
    self.c = list(self.c)
    self.s = list(self.s)

    #convert ss and cc into float values
    s = [float(x) for x in s]
    c = [float(x) for x in c]

    #add pid, ss, and cc to the existing data
    self.pid.extend(pid)
    self.s.extend(s)
    self.c.extend(c)
    self.pert_ids.extend(pert_ids)
    self.pert_descs.extend(pert_descs)
    self.doses.extend(doses)
# Working directory for this run; the alternatives are kept for reference.
# wkdir = '/xchip/cogs/projects/NMF/TA_lung_OE_May_2014/TA_OE_qnorm'
wkdir = '/xchip/cogs/projects/NMF/TA_lung_OE_June_2014/TA_OE_ZSPCINF'
# wkdir = '/xchip/cogs/projects/NMF/TA_lung_OE_May_2014/TA_OE_ZSPC_LM'
wkdir_missing = not os.path.exists(wkdir)
if wkdir_missing:
    os.mkdir(wkdir)

################
### load data ##
################
file_modz = '/xchip/cogs/web/icmap/custom/TA/tnwork/datasets/for_jun10/TA_JUN10_COMPZ.MODZ_SCORE_n13974x22268.gctx'
file_qnorm = '/xchip/cogs/web/icmap/custom/TA/tnwork/datasets/for_jun10/TA_JUN10_QNORM_n38534x978.gctx'
file_zspcinf = '/xchip/cogs/web/icmap/custom/TA/tnwork/datasets/for_jun10/TA_JUN10_ZSPCINF_n38534x22268.gctx'
# only the ZSPCINF dataset is actually loaded
gt = gct.GCT(src=file_zspcinf)
gt.read()
ds = gt.frame

# signature subset: keep the columns listed in the lung grp file
# file_lung_grp = '/cga/meyerson/brooks/TA/all_TA_for_jun10/all_TA_Lung_sig_ids.grp'
file_lung_grp = '/xchip/cga_home/brooks/TA/all_TA_for_jun10/all_TA_Lung_distil_ids.grp'
lungSigs = pd.read_csv(file_lung_grp, header=None, names=['sig_id'])
ds_lung = ds.reindex(columns=lungSigs['sig_id'].values)

# signature annotations, indexed by distil_id
sFile = '/xchip/cogs/web/icmap/custom/TA/tnwork/datasets/for_jun10/inst.info'
sigInfo = pd.read_csv(sFile, sep='\t')
sigInfo.index = sigInfo['distil_id']
#####################################
def main():
    '''
    Command-line entry point: parse the options, check the required ones
    and hand everything to ``eVIP_run_main``, which does the plotting.
    Exits with status 0 when done.
    '''
    opt_parser = OptionParser()
    # Add Options. Required options should have default=None
    opt_parser.add_option("--pred_file",
                          dest="pred_file",
                          type="string",
                          help="""File containing the mutation impact predictions""",
                          default=None)
    # The prediction column is not selectable from the command line;
    # eVIP_run_main always uses DEF_PRED_COL.
    opt_parser.add_option("--sig_info",
                          dest="sig_info",
                          type="string",
                          help="""sig info file with gene information and distil information""",
                          default=None)
    opt_parser.add_option("--gctx",
                          dest="gctx",
                          type="string",
                          help="GCTX file with correlations",
                          default=None)
    opt_parser.add_option("--sig_gctx",
                          dest="sig_gctx",
                          type="string",
                          help="""GCTX containing signature data. For L1000, this would the Z-score data""",
                          default=None)
    opt_parser.add_option("--ref_allele_mode",
                          dest="ref_allele_mode",
                          action="store_true",
                          help="""Instead of organizing plots by gene, will use the wt column to determine what are the reference alleles.""",
                          default=False)
    opt_parser.add_option("--null_conn",
                          dest="null_conn",
                          type="string",
                          help="""File of null connectivity values. This file is given as output from eVIP_compare.py. The file ends with conn_null.txt""",
                          default=None)
    opt_parser.add_option("--out_dir",
                          dest="out_dir",
                          type="string",
                          help="Output directory to put figures",
                          default=None)
    opt_parser.add_option("--ymin",
                          dest="ymin",
                          type="int",
                          help="Minimum y-value of rep value. DEF=%d" % DEF_YMIN,
                          default=DEF_YMIN)
    opt_parser.add_option("--ymax",
                          dest="ymax",
                          type="int",
                          help="Maximum y-value of rep value. DEF=%d" % DEF_YMAX,
                          default=DEF_YMAX)
    opt_parser.add_option("--corr_val_str",
                          dest="corr_val_str",
                          type="string",
                          help="String used to label the correlation value. DEF=\"%s\"" % DEF_CORR_VAL_STR,
                          default=DEF_CORR_VAL_STR)
    opt_parser.add_option("--allele_col",
                          dest="allele_col",
                          type="string",
                          help="""Column name that indicates the allele names. DEF=%s""" % DEF_ALLELE_COL,
                          default=DEF_ALLELE_COL)
    opt_parser.add_option("--use_c_pval",
                          dest="use_c_pval",
                          action="store_true",
                          help="Use corrected p-val instead of raw pval",
                          default=False)
    opt_parser.add_option("--pdf",
                          dest="pdf",
                          action="store_true",
                          help="Makes figures in pdf format instead of png",
                          default=False)
    opt_parser.add_option("--cell_id",
                          dest="cell_id",
                          type="string",
                          help="""Indicates which cell line. Helps for filtering sig_info file""",
                          default=None)
    opt_parser.add_option("--plate_id",
                          dest="plate_id",
                          type="string",
                          help="""Indicates which plate. Helps for filtering sig_info file""",
                          default=None)

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    opt_parser.check_required("--pred_file")
    opt_parser.check_required("--sig_info")
    opt_parser.check_required("--gctx")
    opt_parser.check_required("--null_conn")
    opt_parser.check_required("--out_dir")

    # eVIP_run_main performs the same steps this function used to spell out
    # inline: open inputs, create out_dir, read the gctx files, plot per gene
    eVIP_run_main(pred_file=options.pred_file,
                  sig_info=options.sig_info,
                  gctx=options.gctx,
                  sig_gctx=options.sig_gctx,
                  ref_allele_mode=options.ref_allele_mode,
                  null_conn=options.null_conn,
                  out_dir=options.out_dir,
                  ymin=options.ymin,
                  ymax=options.ymax,
                  allele_col=options.allele_col,
                  use_c_pval=options.use_c_pval,
                  pdf=options.pdf,
                  cell_id=options.cell_id,
                  plate_id=options.plate_id,
                  corr_val_str=options.corr_val_str)

    sys.exit(0)
def eVIP_run_main(pred_file=None,
                  sig_info=None,
                  gctx=None,
                  sig_gctx=None,
                  ref_allele_mode=None,
                  null_conn=None,
                  out_dir=None,
                  ymin=None,
                  ymax=None,
                  allele_col=None,
                  use_c_pval=None,
                  pdf=None,
                  cell_id=None,
                  plate_id=None,
                  corr_val_str=None):
    '''
    Draw one figure per gene comparing each allele with its wild type.

    Each figure has one column per allele (column 0 is the wild type) and
    four rows: replicate heatmap, wt-vs-allele heatmap, jitter plot of row
    medians, and a comparison with the null connectivity values. Figures are
    saved in ``out_dir`` as ``<gene>_impact_pred_plots.png`` (``.pdf`` when
    ``pdf`` is true); ``out_dir`` is created when it does not exist.

    ``pred_file``, ``sig_info``, ``null_conn``, ``gctx`` and ``sig_gctx``
    are file paths. ``ymin``/``ymax`` default to -1 and 1 when None.
    '''
    # setting default values
    ymin = int(ymin) if ymin is not None else -1
    ymax = int(ymax) if ymax is not None else 1

    pred_col = DEF_PRED_COL

    # Process predictions. The handle is closed as soon as parsing is done.
    # allele2pvals = {allele: [mut vs wt pval,
    #                          wt vs mut-wt pval,
    #                          mut-wt conn pval]}
    with open(pred_file) as pred_handle:
        (gene2wt, gene2allele_call, gene2num_alleles,
         allele2pvals) = parse_pred_file(pred_handle, pred_col, use_c_pval,
                                         ref_allele_mode)

    if os.path.exists(out_dir):
        out_dir = os.path.abspath(out_dir)
    else:
        os.mkdir(out_dir)
        out_dir = os.path.abspath(out_dir)
        print("Creating output directory: %s" % out_dir)

    with open(sig_info) as sig_handle:
        allele2distil_ids = parse_sig_info(sig_handle, allele_col, cell_id,
                                           plate_id)

    null_conn = getNullConnDist(null_conn)

    this_gctx = gct.GCT(gctx)
    this_gctx.read()

    sig_gctx = gct.GCT(sig_gctx)
    sig_gctx.read()

    for gene in gene2wt:
        this_fig = plt.figure()
        n_cols = gene2num_alleles[gene] + 1
        this_fig.set_size_inches(n_cols * 4, 4 * 3)
        grid_size = (4, n_cols)

        wt = gene2wt[gene]
        wt_ids = allele2distil_ids[wt]

        wt_heatmap_ax = plt.subplot2grid(grid_size, (0, 0))
        wt_im = plot_rep_heatmap(wt_heatmap_ax, this_gctx.frame, wt_ids,
                                 wt_ids, wt, ymin, ymax)

        # WT self connectivity
        _, wt_self_row_medians = getSelfConnectivity(this_gctx, wt_ids,
                                                     len(wt_ids))

        # Create consistent x values for the wt reps when plotting
        wt_x_vals = [random.randint(WT_RANGE[0], WT_RANGE[1])
                     for _ in wt_self_row_medians]

        # Plot color bar on this axis
        plt.colorbar(wt_im, ax=wt_heatmap_ax, shrink=0.7)

        # Plot allele data
        col_counter = 1
        for pred_type in PRED_TYPE:
            for allele in gene2allele_call[gene][pred_type]:
                allele_ids = allele2distil_ids[allele]

                # CREATE SCATTERPLOT FIGURE
                plot_signatures(pdf, out_dir, sig_gctx.frame, wt, allele,
                                wt_ids, allele_ids)

                # PLOT HEATMAP
                this_hm_ax = plt.subplot2grid(grid_size, (0, col_counter))
                plot_rep_heatmap(this_hm_ax, this_gctx.frame, allele_ids,
                                 allele_ids, pred_type + " - " + allele,
                                 ymin, ymax)

                # PLOT WT MUT heatmap
                this_wt_mut_ax = plt.subplot2grid(grid_size, (1, col_counter))
                plot_rep_heatmap(this_wt_mut_ax, this_gctx.frame, wt_ids,
                                 allele_ids, wt + " vs " + allele, ymin, ymax)

                # PLOT RANKPOINT ROWS
                this_jitter_ax = plt.subplot2grid(grid_size, (2, col_counter))

                _, mt_self_row_medians = getSelfConnectivity(
                    this_gctx, allele_ids, len(allele_ids))

                _, wt_mut_row_medians = getConnectivity(
                    this_gctx, wt_ids, allele_ids, len(allele_ids))

                plot_jitter(this_jitter_ax, col_counter, wt_x_vals,
                            wt_self_row_medians, mt_self_row_medians,
                            wt_mut_row_medians,
                            allele2pvals[allele][0], allele2pvals[allele][1],
                            use_c_pval, ymin, ymax, corr_val_str)

                # Compared to random connectivity
                conn_ax = plt.subplot2grid(grid_size, (3, col_counter))
                plot_conn(conn_ax, col_counter, null_conn,
                          wt_mut_row_medians, allele2pvals[allele][2],
                          use_c_pval, corr_val_str)

                col_counter += 1

        if pdf:
            this_fig.savefig("%s/%s_impact_pred_plots.pdf" % (out_dir, gene),
                             format="pdf")
        else:
            this_fig.savefig("%s/%s_impact_pred_plots.png" % (out_dir, gene))

        plt.close(this_fig)
os.mkdir(wkdir)

# load cliques: one row per clique, members listed in the 'sig' column
classGMT = '/xchip/cogs/projects/pharm_class/cp_cliques_current.gmt'
gmtDict = gmt.read(classGMT)
cliqueLabels = pd.DataFrame(gmtDict)
# create set of all clique members
cList = [item for sublist in cliqueLabels['sig'] for item in sublist]
cSet = set(cList)

# load observed score data
# thresholded
# rFile = '/xchip/cogs/projects/connectivity/null/clique_analysis/dmso_q_thresholded_asym_lass_matrix/jan28/my_analysis.sig_cliqueselect_tool.2014012814320559/summly/self_rankpt_n379x379.gctx'
# non-thresholded asym
rFile = '/xchip/cogs/projects/connectivity/null/clique_analysis/baseline_lass_asym_matrix/jan28/my_analysis.sig_cliqueselect_tool.2014012814364180/summly/self_rankpt_n379x379.gctx'
gt1 = gct.GCT()
gt1.read(rFile)
sFrm = gt1.frame
sFrm.columns = gt1.get_column_meta('pert_id')
# check that all clique members are in the observed matrix
all_rows_in_cliques = sFrm.index.isin(cSet).all()
if not all_rows_in_cliques:
    print("not all clique data loaded")

# load null: DMSO matrix restricted to clique members, then per-row median
dFile = '/xchip/cogs/projects/connectivity/null/dmso/lass_n1000x7147.gctx'
gt = gct.GCT(dFile)
gt.read()
dmsoFrm = gt.frame
dmsoFrm.columns = gt.get_column_meta('id')
dmsoCM = dmsoFrm[dmsoFrm.index.isin(cSet)]
rowMedian = dmsoCM.median(axis=1)
'''
Examples of reading .gctx files in Python: the whole file, a block of
rows/columns selected by position, and a set of columns selected by id.
'''
import cmap.io.gct as gct
import cmap.io.plategrp as grp

# input file used by every example below
path_to_gctx_file = '/cmap/tools/l1ktools/data/modzs_n272x978.gctx'

# example 1: read the full data file
GCTObject = gct.GCT(path_to_gctx_file)
GCTObject.read()
print(GCTObject.matrix)

# example 2: read the first 100 rows and 10 columns of the data
GCTObject = gct.GCT(path_to_gctx_file)
GCTObject.read(row_inds=range(100), col_inds=range(10))
print(GCTObject.matrix)

# example 3: read only the columns whose ids are listed in a grp file
path_to_column_ids = '/cmap/tools/l1ktools/data/cids_n10.grp'
# the grp file is read into a list of column ids
column_ids = grp.read_grp(path_to_column_ids)
GCTObject = gct.GCT(path_to_gctx_file)
# extract only the specified columns from the matrix
GCTObject.read(cid=column_ids)
print(GCTObject.matrix)

# get the available meta data headers for data columns and row
# keep only n instances of each compound (DMSO is kept in full)
for brd in grpedBRD.groups:
    sigs = grpedBRD.groups[brd]
    keepList.extend(sigs if brd == 'DMSO' else sigs[:nKeep])
reducedSigFrm = goldQuery.reindex(index=keepList)
outF = wkdir + '/' + cellLine + '_top_intra_connecting_compound_classes.v2.txt'
reducedSigFrm.to_csv(outF, sep='\t', header=False)

### read in signatures ###
### write to file ####
sigList = reducedSigFrm['sig_id'].values
### load in expression data for the selected signatures
afPath = cmap.score_path
gt = gct.GCT()
gt.read(src=afPath, cid=sigList, rid='lm_epsilon')
outGCT = wkdir + '/' + cellLine + '_top_intra_connecting_compound_classes'
gt.write(outGCT, mode='gctx')
zFrm = gt.frame
# zFrm = zFrm.T
# probeIDs = zFrm.columns
# ## merge data with
# zFrm = pd.concat([zFrm,droppedQ],axis=1)

# convert gctx to gct
#use java-1.7
# convert gctx to gct so it can be read by R
# "convert-dataset -i MCF7_top_intra_connecting_compound_classes_n130x978.gctx"
cmd1 = 'use Java-1.7'
os.system(cmd1)
# locate the gctx file(s) written above
globRes = glob.glob(outGCT + '*.gctx')
def group_probe_frq_plot(self,
                         make_heatmaps=True,
                         sum_score_metric='sum_score_4',
                         rankpt_metric='mean_rankpt_4'):
    '''
    test relative occurance of up/dn regulation of probes for a specific group

    Probes are ordered by the modz signature of a fixed reference compound
    (BRD-K02130563). For each probe, two markers are drawn whose sizes are
    the percentage of 'HDAC-inhibitor' group signatures that list it among
    their up (red) or down (blue) 50 landmark probes.

    ``make_heatmaps``, ``sum_score_metric`` and ``rankpt_metric`` are
    accepted for interface compatibility and are not used here.
    '''
    brd = 'BRD-K02130563'
    # use this object's own signature table rather than a module-level instance
    sigs = self.sigIDdict[brd]
    afPath = cmap.score_path
    gt = gct.GCT()
    gt.read(src=afPath, cid=sigs, rid='lm_epsilon')
    zFrm = gt.frame
    # take modz of signature group
    modZed = modzsig.modzsig(zFrm)
    modZed = modZed.order()

    #pick a group
    # grpName = 'tubulin'
    grpName = 'HDAC-inhibitor'
    #get all sig_ids for that group
    grpSigList = []
    for grpBrd in self.pclResultDict[grpName]:
        grpSigList.extend(self.sigIDdict[grpBrd])

    #query for up/dn probes
    cm = mu.CMapMongo()
    regFrm = cm.find({'sig_id': {'$in': list(grpSigList)}},
                     {'sig_id': True,
                      'pert_id': True,
                      'pert_iname': True,
                      'up50_lm': True,
                      'dn50_lm': True},
                     toDataFrame=True)
    # float so the relative frequencies below are never floor-divided
    nInstances = float(regFrm.shape[0])

    # count dn probe freq
    dnNested = regFrm['dn50_lm'].values
    dnArray = [item for sublist in dnNested for item in sublist]
    dnCounts = pd.Series(dnArray).value_counts()
    zDnCounts = dnCounts.reindex_like(modZed)

    # count up probe freq
    upNested = regFrm['up50_lm'].values
    upArray = [item for sublist in upNested for item in sublist]
    upCounts = pd.Series(upArray).value_counts()
    zUpCounts = upCounts.reindex_like(modZed)

    # adjust marker size: relative frequency as a percentage; probes that
    # never occur come out of the reindex as NaN and get size 0
    upMkrs = np.multiply(np.divide(zUpCounts, nInstances), 100)
    dnMkrs = np.multiply(np.divide(zDnCounts, nInstances), 100)
    upMkrs = upMkrs.replace(np.nan, 0)
    dnMkrs = dnMkrs.replace(np.nan, 0)

    # make plot
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # ax.plot(s,s,'b')
    for j, (upSize, dnSize) in enumerate(zip(upMkrs.values, dnMkrs.values)):
        ax.plot(j, 1, 'r.', markersize=upSize, alpha=.25)
        ax.plot(j, 1, 'b.', markersize=dnSize, alpha=.25)
import cmap.analytics.signature_strength as ss
import numpy
import scipy
import cmap.io.gct as gct
#import ljh_dose_analysis_tool as dose
#plot tool
import pylab as pl
import matplotlib.pyplot as plt

cellLine = 'MCF7'
timeP = '24H'
gctfile = '/xchip/obelix/pod/brew/pc/ASG001_%s_%s/by_pert_id_pert_dose/ASG001_%s_%s_COMPZ.MODZ_SCORE_LM_n85x978.gctx' % (cellLine,timeP,cellLine,timeP)

#make a gct object
db = gct.GCT()
db.read(gctfile)

#make sc object for signature strength
# NOTE(review): `sc` is not imported in the part of the file visible here -
# confirm the intended module before relying on this line.
sco = sc.SC()
sco.add_sc_from_gctx_meta(gctfile)
# NOTE(review): this rebinds `ss`, hiding the signature_strength module
# imported above under the same name.
ss = sco.s

#make signature for each dose
work_dir = '/xchip/cogs/hogstrom/analysis/scratch'
#work_dir = os.getcwd() #set work_dir var as pwd
fup = '/xchip/cogs/hogstrom/analysis/scratch/tmp_up_list.gmt'
fdn = '/xchip/cogs/hogstrom/analysis/scratch/tmp_dn_list.gmt'
# truncate any existing gmt files; close the handles straight away so the
# files are not left open
open(fup,'w').close()
open(fdn, 'w').close()
n_edge = 50
# every sub-directory of work_dir is treated as one cell line
cellDirs = [
    f for f in os.listdir(work_dir) if os.path.isdir(work_dir + '/' + f)
]
prog = progress.DeterminateProgressBar('Drug-target')
df = pd.DataFrame()
dfRank = pd.DataFrame()
#loop through each cell line add to df
# for icell, cell1 in enumerate(cgsCells):
for icell, cell1 in enumerate(cellDirs):
    #define directories and load in outputs
    outdir = os.path.join(work_dir, cell1, 'sig_query_out')
    # search once and reuse the result
    rsltFiles = glob.glob(outdir + '/result_WTCS.LM.COMBINED_n*.gctx')
    if not rsltFiles:
        print(cell1 + 'no query result file')
        continue  #if no results file, skip loop
    rsltFile = rsltFiles[0]
    rslt = gct.GCT()
    rslt.read(rsltFile)
    # fill in the placeholder so the progress message names the cell line
    prog.update('analyzing {0}'.format(cell1), icell, len(cellDirs))
    rsltF = rslt.frame
    rsltF = rsltF.T
    indVals = rsltF.index.values
    # second ':'-separated field of the row label, cut to 13 characters
    pertVals = [ind.split(':')[1][:13] for ind in indVals]
    #make the column name gene and pert time
    geneVals = []
    for ind in rsltF.columns:
        gene = ind.split(':')[1]
        tp = ind.split(':')[0].split('_')[-1]
        gname = '_'.join([gene, tp])
        geneVals.append(gname)
    if len(geneVals) > len(set(geneVals)):
        print('duplicate CGS for this celline')
def build_probe_curves_and_summary(args,work_dir):
    '''
    builds dose response curves for each compound for the specified probe

    For every unique pert parsed from the column ids of args.res, plots the
    probe's z-score against dose, saves the plot in work_dir and writes one
    row to <args.probe>_summary.txt holding the base dose, the "best" dose
    and the correlation of the dose curve with four prototype curves.

    Parameters:
    args -- namespace providing res (the input gctx file) and probe (id of
        the probe to analyze)
    work_dir -- directory that receives the plots and the summary file
    '''
    # instantiate a progress object
    prog = progress.DeterminateProgressBar('Dose Analysis')

    # read the specified probe from the input gctx file
    gcto = gct.GCT()
    probe_ind = gcto.get_gctx_rid_inds(args.res,match_list=args.probe,exact=True)
    gcto.read_gctx_matrix(args.res,row_inds=probe_ind)

    # grab the cids from the file and mine dose information from them.  Find all of
    # the unique perts (cids are ':' separated: field 1 is the pert, field 2 the dose)
    cids = gcto.get_gctx_cid(args.res)
    doses = [float(x.split(':')[2]) for x in cids]
    perts = [x.split(':')[1] for x in cids]
    unique_perts = list(set(perts))

    # for each unique pert_id, find the "best" dose relative to the base dose.
    # Do template matching to prototype curves.  Output a report
    num_perts = len(unique_perts)
    CM = mu.CMapMongo()
    curve_names = ['linear','log','half-log','quarter-log']
    with open(os.path.join(work_dir,args.probe + '_summary.txt'),'w') as f:
        headers = ['pert_id','pert_desc','base_dose','base_z_score',
                   'best_dose','best_z_score', 'best_z_score_delta',
                   'linear','log','half-log','quarter-log','called shape']
        f.write('\t'.join(headers) + '\n')
        for i,unique_pert in enumerate(unique_perts):
            prog.update('analyzing {0}'.format(args.probe),i,num_perts)

            # grab the z-scores and doses for the current pert and sort the pairs
            # by dose.  Match on the parsed pert field exactly: a substring test
            # against the whole cid can also pick up columns of other perts
            cid_inds = [j for j,pert in enumerate(perts) if pert == unique_pert]
            pert_scores = gcto.matrix[0,cid_inds]
            pert_doses = [doses[j] for j in cid_inds]
            pert_doses,pert_scores = zip(*sorted(zip(pert_doses,pert_scores)))
            num_doses = len(pert_doses)

            # build the dose response plot for the current pert and save it to disk
            plt.plot(pert_doses,pert_scores)
            plt.title('::'.join([unique_pert,args.probe]))
            plt.xlabel('dose')
            plt.ylabel('z-score')
            plt.savefig(os.path.join(work_dir,'_'.join([unique_pert.replace(':','_'),args.probe,'dose_curve.png'])))
            plt.close()

            # grab the pert_desc from mongo; '-666' stands in when nothing is found
            pert_desc = CM.find({'pert_id':unique_pert},{'pert_desc':True},limit=1)
            if not pert_desc:
                pert_desc = ['-666']
            pert_desc = pert_desc[0]

            # find the best dose
            base_dose = pert_doses[0]
            base_z_score = pert_scores[0]
            z_delta = ((numpy.array(pert_scores) + 10) - (base_z_score + 10)).tolist()
            # NOTE(review): "best" is the most negative delta from the base
            # dose, not the largest absolute one (an unused abs_z_delta was
            # computed here before) - confirm that this is intended
            best_ind = int(numpy.argmin(z_delta))
            best_dose = pert_doses[best_ind]
            best_z_score = pert_scores[best_ind]
            best_z_score_delta = z_delta[best_ind]

            if num_doses > 1:
                # build prototype curves if there is more than one dose
                linear = numpy.linspace(1,10,num_doses)
                log_curves = []
                for step in (1, .5, .25):
                    log_gen = _log_gen(step)
                    log_curves.append([next(log_gen) for _ in range(num_doses)])
                curves = numpy.array([linear] + log_curves)

                # get the correlation coeficient for each of the curves and the
                # current pert dose curve; row 0 is the dose curve, columns 1-4
                # are the prototypes in curve_names order
                corrs = numpy.corrcoef(pert_scores,curves)
                linear_corr, log_corr, half_log_corr, quarter_log_corr = corrs[0][1:5]

                #report the best shape by finding the best absolute correlation
                abs_corr = numpy.abs(corrs[0][1:])
                if (abs_corr > .8).any():
                    max_curve_name = curve_names[int(numpy.argmax(abs_corr))]
                else:
                    max_curve_name = 'none'
            else:
                # if there is only one dose, set all corrs to 'nan'
                linear_corr = 'nan'
                log_corr = 'nan'
                half_log_corr = 'nan'
                quarter_log_corr = 'nan'
                max_curve_name = 'none'

            # write the dose data to the summary file
            data = [unique_pert,pert_desc,str(base_dose),str(base_z_score),
                    str(best_dose),str(best_z_score),str(best_z_score_delta),
                    str(linear_corr),str(log_corr),str(half_log_corr),
                    str(quarter_log_corr),max_curve_name]
            f.write('\t'.join(data) + '\n')
    prog.clear()
# --- example 1: load the complete matrix of a gctx file ---
import cmap.io.gct as gct
GCTObject = gct.GCT('path_to_gctx_file')
GCTObject.read()
print(GCTObject.matrix)

# --- example 2: load only the first 100 rows and the first 10 columns ---
import cmap.io.gct as gct
GCTObject = gct.GCT('path_to_gctx_file')
GCTObject.read(row_inds=range(100), col_inds=range(10))
print(GCTObject.matrix)

# list which meta data headers exist for the columns and for the rows
column_headers = GCTObject.get_chd()
row_headers = GCTObject.get_rhd()

# pull the perturbagen description field out of the column meta data
descs = GCTObject.get_column_meta('pert_desc')

# pull the gene symbol field out of the row meta data
symbols = GCTObject.get_row_meta('pr_gene_symbol')
plt.xlabel('median summly connection overlap (out of 50)')
plt.ylabel('freq')
plt.title('connection consistency across signatures')
outF = os.path.join(wkdir, 'median_summly_connection_consistency.png')
plt.savefig(outF, bbox_inches=0)
# actually call close(): a bare `plt.close` is a no-op expression and leaves
# the figure open
plt.close()


def get_medians(x):
    ''' return the median of x (thin wrapper around np.median) '''
    return np.median(x)


## which DOS compounds are in summly space?
# mtrxSummly = '/xchip/cogs/projects/connectivity/summly/matrices/matched_lass_sym_n7322x7322.gctx'
mtrxSummly = '/xchip/cogs/projects/connectivity/summly/matrices/matched_lass_n7147x7147.gctx'
# mtrxSummly = '/xchip/cogs/projects/connectivity/summly/matrices/indep_lass_n39560x7147.gctx'
gt = gct.GCT()
# gt.read_gctx_col_meta(mtrxSummly)
# gt.read_gctx_row_meta(mtrxSummly)
gt.read(mtrxSummly)
# relabel the columns of the matrix by pert_id
columnPerts = gt.get_column_meta('pert_id')
summFrm = gt.frame
summFrm.columns = columnPerts
# find dos cps in summly space
# NOTE(review): the overlap is taken against the row index of the frame, not
# the relabelled columns - confirm the row ids are pert_ids as well
summBrds = summFrm.index.values
summSet = set(summBrds)
dosSet = set(countSer.index)
overlapSet = dosSet.intersection(summSet)
#plot hist of cell counts - dos in summly
overlapSer = countSerGold.reindex(list(overlapSet))
overlapCount = len(overlapSer)
## plot
def template_heatmap(args,work_dir):
    '''
    uses template matching to find the most dose responsive probesets for each
    compound in the dataset and generates a list of the top 50 and bottom 50 most
    dose responsive probes.  heatmaps across all of the doses are made using these
    probesets

    For every pert with more than one dose, each probe's dose profile is
    correlated with four prototype curves (linear, log, half-log,
    quarter-log), the correlations are scored against 1000 randomly assembled
    profiles, a per-pert summary table is written and one heatmap per
    prototype curve is saved.  Perts with a single dose are skipped.

    Parameters:
    args -- namespace providing res (the input gctx file)
    work_dir -- directory that receives the summary tables and heatmaps
    '''
    # instantiate a progress object
    prog = progress.DeterminateProgressBar('Template Heatmaps')

    # read the data
    gcto = gct.GCT(args.res)
    gcto.read()

    # grab the cids from the file and mine dose information from them.  Find all of
    # the unique perts (cids are ':' separated: field 1 is the pert, field 2 the dose)
    cids = gcto.get_gctx_cid(args.res)
    pert_descs = gcto.get_column_meta('pert_desc')
    doses = [float(x.split(':')[2]) for x in cids]
    perts = [x.split(':')[1] for x in cids]
    unique_perts = list(set(perts))

    # grab the rid for use below
    rids = gcto.get_gctx_rid(args.res)

    num_perts = len(unique_perts)
    for i,unique_pert in enumerate(unique_perts):
        prog.update('analyzing {0}'.format(unique_pert),i,num_perts)

        # grab the doses for the current pert and put the cid_inds in the same
        # dose-sorted order.  Match on the parsed pert field exactly: a substring
        # test against the whole cid can also pick up columns of other perts
        cid_inds = [j for j,pert in enumerate(perts) if pert == unique_pert]
        pert_desc = pert_descs[cid_inds[0]] #set pert desc to the first dose
        pert_doses = [doses[j] for j in cid_inds]
        pert_doses,cid_inds = zip(*sorted(zip(pert_doses,cid_inds)))
        num_doses = len(pert_doses)

        if num_doses <= 1:
            # prototype curves need more than one dose
            continue

        # build prototype curves
        linear = numpy.linspace(1,10,num_doses)
        log_curves = []
        for step in (1, .5, .25):
            log_gen = _log_gen(step)
            log_curves.append([next(log_gen) for _ in range(num_doses)])
        curves = numpy.array([linear] + log_curves)

        # correlate all of the probes in the data to the prototype curves
        pert_data = gcto.matrix[:,list(cid_inds)]
        num_probes = pert_data.shape[0]
        cc = numpy.corrcoef(pert_data,curves)

        # grab the correlation values for all the probes against prototype
        # curves; the four curves follow the probes in the correlation matrix,
        # in the order linear, log, half-log, quarter-log
        probe_corrs = [cc[0:num_probes,num_probes + k] for k in range(4)]

        # compute the random correlation profile for this pert: each random
        # profile takes the value of a randomly drawn probe at every dose
        probe_inds = list(range(num_probes))
        perm_cc = [[], [], [], []]
        for _ in range(1000):
            perm_curve_inds = [random.sample(probe_inds,1)[0] for x in range(num_doses)]
            perm_curve = [pert_data[perm_curve_inds[x],x] for x in range(num_doses)]
            perm_covar = numpy.corrcoef(perm_curve,curves)
            for k in range(4):
                perm_cc[k].append(perm_covar[0][k + 1])

        # compute the nominal p values for all correlation values
        probe_corrs_p = [numpy.array([stats.percentileofscore(null_cc,x) for x in corrs])
                         for corrs,null_cc in zip(probe_corrs,perm_cc)]

        # write the p values and correlations out to file
        with open(os.path.join(work_dir,unique_pert + '_template_match_summary.txt'),'w') as f:
            f.write('\t'.join(['probeset','linear corr', 'linear p','log corr', 'log p',
                               'half-log corr', 'half-log p','quarter-log corr', 'quarter-log p']) + '\n')
            for j in range(num_probes):
                fields = [rids[j]]
                for corrs,p_vals in zip(probe_corrs,probe_corrs_p):
                    fields.extend([str(corrs[j]),str(p_vals[j])])
                f.write('\t'.join(fields) + '\n')

        # build the linear, log, half log and quarter log heatmaps
        # NOTE(review): the quarter-log file is named after pert_desc while the
        # other three use the pert id; kept as-is - confirm which is intended
        heatmap_files = [unique_pert + '_linear_heatmap.png',
                         unique_pert + '_log_heatmap.png',
                         unique_pert + '_half_log_heatmap.png',
                         pert_desc + '_quarter_log_heatmap.png']
        for p_vals,heatmap_file in zip(probe_corrs_p,heatmap_files):
            _save_template_heatmap(pert_data,p_vals,os.path.join(work_dir,heatmap_file))

    # clear that progress bar
    prog.clear()


def _save_template_heatmap(pert_data,probe_p_values,out_path):
    ''' save a row-normalized heatmap of the 50 highest and 50 lowest scoring probes '''
    sort_ind = numpy.argsort(probe_p_values)[::-1]
    top = pert_data[sort_ind[0:50],:]
    bot = pert_data[sort_ind[-50:],:]
    combined = numpy.vstack([top,bot])
    # shift each row by the magnitude of its minimum, then scale it to sum to one
    combined_row_normalized = combined + numpy.abs(numpy.array([numpy.min(combined,1)]).T)
    row_sums = combined_row_normalized.sum(axis=1)
    combined_row_normalized = combined_row_normalized / row_sums[:,numpy.newaxis]
    plt.imshow(combined_row_normalized,interpolation='nearest',cmap='RdBu')
    plt.axis('off')
    plt.savefig(out_path)
    # close the figure so successive heatmaps are not stacked on the same axes
    plt.close()
import cmap.analytics.signature_strength as ss
import cmap.util.mongo_utils as mu
#plot tool
import pylab as pl
import matplotlib.pyplot as plt

# analysis directory and the brew (cell line / time point) it belongs to
#work_dir = '/xchip/cogs/hogstrom/analysis/scratch/Nov27' #MCF7 24h
work_dir = '/xchip/cogs/hogstrom/analysis/scratch/Nov29/dose_analysis_tool.1354211763774' #pc3 6h
cellLine = 'PC3'
timeP = '6H'
gctfile = '/xchip/obelix/pod/brew/pc/ASG001_%s_%s/by_pert_id_pert_dose/ASG001_%s_%s_COMPZ.MODZ_SCORE_LM_n85x978.gctx' % (
    cellLine, timeP, cellLine, timeP)

# load the brew into a gct object
db = gct.GCT()
db.read(gctfile)

### ss calculations ###
# once without and once with the z-score threshold
SS1 = ss.SigStrength()
SS1.sig_strength_from_gct_file(gctfile, do_zthresh=False)
SS2 = ss.SigStrength()
SS2.sig_strength_from_gct_file(gctfile, do_zthresh=True)

# column annotations of the brew
qPert = db.get_column_meta('pert_desc')
qPertID = db.get_column_meta('pert_id')
qDose = db.get_column_meta('pert_dose')

## plot ss orig with dose
SSin = SS1.ss
ssMax = numpy.nanmax(SSin)  # largest value, ignoring NaN entries
import cmap.util.mongo_utils as mu
import cmap.io.gct as gct
import cmap.io.gmt as gmt
import cmap.analytics.NMF_benchmarks as nmfb

### cell line gcts w/ annotation
# gFile = '/xchip/cogs/web/icmap/custom/TA/brew/pc/TA.OE013_A549_96H/TA.OE013_A549_96H_QNORM_n1117x978.gctx'
# gt_plate = gct.GCT(src=gFile)
# gt_plate.read()
# ds_plate = gt_plate.frame

# the three dataset files of the jun10 build; only the modz file is read here
file_modz = '/xchip/cogs/web/icmap/custom/TA/tnwork/datasets/for_jun10/TA_JUN10_COMPZ.MODZ_SCORE_n13974x22268.gctx'
file_qnorm = '/xchip/cogs/web/icmap/custom/TA/tnwork/datasets/for_jun10/TA_JUN10_QNORM_n38534x978.gctx'
file_zspcinf = '/xchip/cogs/web/icmap/custom/TA/tnwork/datasets/for_jun10/TA_JUN10_ZSPCINF_n38534x22268.gctx'

gt = gct.GCT(src=file_modz)
gt.read()
ds = gt.frame

# output directory, created on first use
wkdir = '/xchip/cogs/projects/NMF/TA_lung_OE_June_2014/gctx_files_annotated/MODZ_INF'
if not os.path.exists(wkdir):
    os.mkdir(wkdir)

# # save matrix for each cell line in OE experiments
# is_oe = colFrame.plate.str.match('TA.OE0')
# oe = colFrame[is_oe]
# cell_grped = oe.groupby('cell_line')
# for grpT in cell_grped:
#     cell = grpT[0]
#     cellDir = wkdir + '/' + cell
#     if not os.path.exists(cellDir):
def analyze_query(args,work_dir):
    '''
    Analyze the output from query_tool - find self-connections and create graphs

    For every column of the query gctx (args.res), ranks the rows of the
    result matrix by score, looks up where the signatures of the queried
    compound fall in that ranking and saves a histogram of those ranks to
    work_dir.  Normalized mean and median ranks are collected in avRnk and
    medRnk.

    Parameters:
    args -- namespace providing res (query gctx file), result (truthy: read
        the *COMBINED*.gctx found in work_dir) and resultDir (result gctx to
        read otherwise)
    work_dir -- directory searched for the result file and receiving the plots
    '''
    #make a gct object
    db = gct.GCT()
    db.read(args.res)

    ##load query result - gctx file
    rslt = gct.GCT()
    if args.result:
        #select combined result gctx in working dir created from build_query step
        outGctx = glob.glob(os.path.join(work_dir, '*COMBINED*.gctx'))
        rslt.read(outGctx[0])
    else:
        rslt.read(args.resultDir)

    rsltSigID = rslt.get_rids() #sig IDs from result file
    qPert = db.get_column_meta('pert_desc')
    qPertID = db.get_column_meta('pert_id')
    qDose = db.get_column_meta('pert_dose')
    ESmat = rslt.matrix
    # row order per column, highest score first
    iES = ESmat.argsort(axis=0)[::-1]
    # number of rows; shape[0] also works when there is a single query column
    n_inst = iES.shape[0]

    # one database handle serves every query
    CM = mu.CMapMongo()

    #loop through each of the perts - graph ranks of query
    prog1 = progress.DeterminateProgressBar('creating self-connection graphs')
    avRnk = []
    medRnk = []
    for i, x in enumerate(qPert):
        prog1.update('graphing {0}',i,len(qPert))
        # rank of every sig ID in this column's sorted order (first hit wins)
        rank_of = {}
        for rank, row in enumerate(iES[:,i]):
            rank_of.setdefault(rsltSigID[row], rank)
        qStr = qPertID[i][0:13] #shorten qPertID
        cmpd1 = x
        dose1 = qDose[i]

        #run pymongo query
        #search for the BRD-xxxxxxxxxxx within the pert_id field in the db
        cmpdSigIds = CM.find({'pert_id':{'$regex':qStr}},{'sig_id':True}) or []
        #where instances of the compound of interest sit on the rank list;
        #sig IDs that are absent from the result file are skipped
        i1 = [rank_of[y] for y in cmpdSigIds if y in rank_of]
        if len(i1) < 1:
            print(cmpd1 + ' has no instances in the cmap database')
            continue
        i2 = numpy.array(i1) #convert list to numpy array
        #normalize acording to number of instances in db
        nAv = float(i2.mean())/n_inst #normalized average ES rank
        nMd = float(numpy.median(i2))/n_inst #normalized median ES rank
        avRnk.append(nAv) #store average ES rank
        medRnk.append(nMd)

        #plot
        fname = cmpd1 + '_' + str(dose1) + '_query_rank.png'
        outf = os.path.join(work_dir,fname)
        fig = plt.figure(figsize=(8.0, 2.0))
        ax = fig.add_subplot(111)
        # the histogram of the data
        ax.hist(i2, 30, facecolor='green', alpha=0.75)
        #ax.set_xlim(0, n_inst)
        # NOTE(review): rounding to the nearest 100k can place the limit below
        # n_inst and cut off the highest ranks - confirm this is acceptable
        ax.set_xlim(0, int(round(n_inst,-5))) #round instances to nearest 100k
        ax.set_xlabel('query rank')
        ax.set_ylabel('freq')
        ax.set_title('dose = '+ str(dose1) +'um')
        ax.grid(True)
        plt.savefig(outf, bbox_inches=0)
        # one figure is opened per query column; release it once it is saved
        plt.close(fig)