def get_dos_signatures(dosBrds):
    '''
    Look up every trt_cp signature recorded for a set of DOS compounds.

    Parameters
    ----------
    dosBrds : iterable of str
        pert_ids of the compounds to query (materialised with list()
        before being sent to mongo)

    Returns
    -------
    dosQuery : DataFrame
        result of CMapMongo.find(..., toDataFrame=True), re-indexed by
        sig_id. The fields requested are sig_id, pert_id, cell_id,
        pert_time, is_gold, pert_iname, distil_ss and distil_cc_q75
    countSer : pandas.Series
        number of distinct cell_id values seen for each pert_id
    '''
    CM = mu.CMapMongo()
    dosQuery = CM.find(
        {'pert_id': {'$in': list(dosBrds)}, 'pert_type': 'trt_cp'},
        {'sig_id': True,
         'pert_id': True,
         'cell_id': True,
         'pert_time': True,
         'is_gold': True,
         'pert_iname': True,
         'distil_ss': True,
         'distil_cc_q75': True},
        toDataFrame=True)
    dosQuery.index = dosQuery['sig_id']
    # group on the bare column label (not a one-element list) so that the
    # group keys - and therefore the index of countSer - are plain pert_ids
    countDict = {}
    for pertID, pertFrm in dosQuery.groupby('pert_id'):
        countDict[pertID] = len(set(pertFrm['cell_id']))
    countSer = pd.Series(countDict)
    # the previous max(countSer) was never used and raised ValueError when
    # the query matched nothing, so it has been dropped
    return dosQuery, countSer
def build_probe_curves(args,work_dir):
    '''
    builds dose response curves for the specified probe

    For every perturbagen found in the column ids of args.res a dose vs
    z-score plot is saved to work_dir, and one line per perturbagen is
    written to <args.probe>_summary.txt in work_dir.

    Parameters
    ----------
    args : namespace
        reads args.res (gctx path) and args.probe (probe id, also used in
        the output file names)
    work_dir : str
        directory that receives the PNG files and the summary table
    '''
    # progress reporter; the loop below relies on it
    prog = progress.DeterminateProgressBar('Dose Analysis')

    # read only the row of the requested probe
    gcto = gct.GCT()
    probe_ind = gcto.get_gctx_rid_inds(args.res,match_list=args.probe,exact=True)
    gcto.read_gctx_matrix(args.res,row_inds=probe_ind)

    # column ids are split on ':' - field 1 is taken as the pert, field 2 as
    # the dose
    cids = gcto.get_gctx_cid(args.res)
    doses = [float(x.split(':')[2]) for x in cids]
    unique_perts = list(set([x.split(':')[1] for x in cids]))
    num_perts = len(unique_perts)

    CM = mu.CMapMongo()
    with open(os.path.join(work_dir,args.probe + '_summary.txt'),'w') as f:
        headers = ['pert_id','pert_desc','base_dose','base_z_score',
                   'best_dose','best_z_score', 'best_z_score_delta']
        f.write('\t'.join(headers) + '\n')
        for i,unique_pert in enumerate(unique_perts):
            prog.update('analyzing {0}'.format(args.probe),i,num_perts)

            # columns belonging to this pert (substring match on the cid),
            # ordered by dose
            cid_inds = [ind for ind,cid in enumerate(cids) if unique_pert in cid]
            pert_scores = gcto.matrix[0,cid_inds]
            pert_doses = [doses[ind] for ind in cid_inds]
            pert_doses,pert_scores = zip(*sorted(zip(pert_doses,pert_scores)))

            # dose response plot
            plt.plot(pert_doses,pert_scores)
            plt.title('::'.join([unique_pert,args.probe]))
            plt.xlabel('dose')
            plt.ylabel('z-score')
            plt.savefig(os.path.join(work_dir,'_'.join([unique_pert.replace(':','_'),args.probe,'dose_curve.png'])))
            plt.close()

            # description from mongo, '-666' when nothing is found
            pert_desc = CM.find({'pert_id':unique_pert},{'pert_desc':True},limit=1)
            if not pert_desc:
                pert_desc = ['-666']
            pert_desc = pert_desc[0]

            # the lowest dose is the baseline; "best" is the dose whose
            # z-score lies furthest below the baseline z-score
            base_dose = pert_doses[0]
            base_z_score = pert_scores[0]
            z_delta = (numpy.array(pert_scores) + 10) - (base_z_score + 10)
            z_delta = z_delta.tolist()
            best_ind = z_delta.index(numpy.min(z_delta))
            best_dose = pert_doses[best_ind]
            best_z_score = pert_scores[best_ind]
            best_z_score_delta = z_delta[best_ind]

            data = [unique_pert,pert_desc,str(base_dose),str(base_z_score),
                    str(best_dose),str(best_z_score),str(best_z_score_delta)]
            f.write('\t'.join(data) + '\n')
    prog.clear()
def get_inames(self):
    '''
    store a pert_id -> pert_iname dictionary for the compounds in
    self.cpSet on self.inameDict
    '''
    mongo = mu.CMapMongo()
    nameFrm = mongo.find({'pert_id': {'$in': list(self.cpSet)}},
                         {'pert_id': True, 'pert_iname': True},
                         toDataFrame=True)
    # names keyed by pert_id; a pert_id returned more than once keeps the
    # name of its last row
    nameSer = pd.Series(data=nameFrm['pert_iname'])
    nameSer.index = nameFrm['pert_id']
    self.inameDict = nameSer.to_dict()
def PCL_vs_DMSO(self, max_signatures_per_cp=20, n_test_max=False):
    '''
    -walk through self.test_groups one PCL at a time
    -for each group, query the gold signatures of its compounds and trim
     them to a maximum number per pert_iname

    NOTE(review): the visible body only gathers and trims the PCL
    signatures; no DMSO query is made and the final group sizes are
    computed but not kept

    Parameters
    ----------
    max_signatures_per_cp : int
        -passed to self.cut_signatures as nKeep
    n_test_max : int
        -max number of PCL groups to incorporate into the classifier
        -if set to False, all groups are tested
        -not read by the visible body
    '''
    for pcl_name in self.test_groups:
        member_brds = self.pclDict[pcl_name]
        mongo = mu.CMapMongo()
        # gold signatures of the group members, pert_dose above 1
        criteria = {'is_gold': True,
                    'pert_id': {'$in': member_brds},
                    'pert_dose': {'$gt': 1}}
        wanted = {'sig_id': True,
                  'pert_id': True,
                  'cell_id': True,
                  'pert_time': True,
                  'is_gold': True,
                  'pert_iname': True}
        sigFrm = mongo.find(criteria, wanted, toDataFrame=True)
        sigFrm.index = sigFrm['sig_id']
        sigFrm = self.set_class_labels(sigFrm)
        trimmedFrm = self.cut_signatures(sigFrm,
                                         nKeep=max_signatures_per_cp,
                                         cut_by='pert_iname')
        # signatures kept per compound name (result is discarded)
        trimmedFrm.groupby('pert_iname').size()
work_dir = '/xchip/cogs/projects/HOG/DG_connect' #load in OMIM genes. Which ones have a CGS in > 4 cell lines? which ones are LM? inFile = '/xchip/cogs/hogstrom/analysis/OMIM/OMIM_CGS.txt' omimGeneList = [] with open(inFile, 'rt') as f: for string in f: splt = string.split('\r') for i, line in enumerate(splt): if i == 0: # skip headder continue splt2 = line.split('\t') geneID = splt2[0] #the pert_id listed the line omimGeneList.append(geneID) CM = mutil.CMapMongo() CGSall = CM.find({'pert_type': 'trt_sh.cgs'}, { 'sig_id': True, 'pert_iname': True, 'cell_id': True, 'pert_id': True }) #which drugs to use --> informer set and HOG plate ### which genes have a CGS in > 4 cell lines ominWithContext = [] for geneID in omimGeneList: cellLst = [] sigIDLst = [] for q in CGSall: if q['pert_iname'] == geneID:
### use pert_info collection to get sig_ids in mongo
# cellList = []
# for pert in targetDict.keys()[:10]:
#     pertdb = mutil.CMapMongo(mongo_location = None, collection = 'pert_info')
#     p1 = pertdb.find({'pert_id':'BRD-M79902621'},{'sig_id':True})
#     g = p1[0]
#     gSplit = g.split('\'')
#     sigIDs = [x for x in gSplit if len(x) >= 5]
#     cells = [x.split('_')[1] for x in sigIDs]
#     cellList.extend(cells)
#     print p1
#     type(p1[0])

### which targets have CGS signatures
# get all CGS gene IDs
CM = mu.CMapMongo()
# pert_List = CM.find({'pert_type':{'$regex':pert}},{'sig_id':True,'cell_id':True})
CGSbyCell = CM.find({'pert_type': 'trt_sh.cgs'}, {'pert_iname': True})
CGSgeneSyms = set(CGSbyCell)

# check overlap with DB targets
nestedTargets = targetDict.values()
DBtargets = [item for sublist in nestedTargets for item in sublist]
setDBtargets = set(DBtargets)
DBcgsOverlap = setDBtargets.intersection(CGSgeneSyms)

# for each pert keep only the targets that also have a CGS signature.
# setdefault creates the list on first use - the earlier has_key() guard
# never created an entry, so the dictionary could not be populated
targetDictCGS = {}
for pert in targetDict:
    for gene in targetDict[pert]:
        if gene in DBcgsOverlap:
            targetDictCGS.setdefault(pert, []).append(gene)
def group_probe_frq_plot(self,
                         make_heatmaps=True,
                         sum_score_metric='sum_score_4',
                         rankpt_metric='mean_rankpt_4'):
    '''
    test relative occurance of up/dn regulation of probes for a specific group

    The probes are ordered by the modz signature of one reference compound
    (BRD-K02130563). For the 'HDAC-inhibitor' group, the fraction of group
    signatures that list each probe in up50_lm / dn50_lm is drawn as the
    marker size (red = up, blue = dn) at that probe's position.

    NOTE(review): make_heatmaps, sum_score_metric and rankpt_metric are not
    read by the visible body, and the figure is neither saved nor returned
    here.
    NOTE(review): Series.order() only exists in old pandas releases -
    confirm the pandas version before upgrading.
    '''
    # reference compound: modz of its signatures gives the probe ordering.
    # read from self (as the group loop below does) rather than from a
    # module-level instance
    refBrd = 'BRD-K02130563'
    sigs = self.sigIDdict[refBrd]
    afPath = cmap.score_path
    gt = gct.GCT()
    gt.read(src=afPath, cid=sigs, rid='lm_epsilon')
    zFrm = gt.frame
    # take modz of signature group
    modZed = modzsig.modzsig(zFrm)
    modZed = modZed.order()

    # pick a group
    # grpName = 'tubulin'
    grpName = 'HDAC-inhibitor'
    # get all sig_ids for that group
    grpSigList = []
    for grpBrd in self.pclResultDict[grpName]:
        grpSigList.extend(self.sigIDdict[grpBrd])

    # query for up/dn probes
    cm = mu.CMapMongo()
    regFrm = cm.find({'sig_id': {'$in': list(grpSigList)}},
                     {'sig_id': True,
                      'pert_id': True,
                      'pert_iname': True,
                      'up50_lm': True,
                      'dn50_lm': True},
                     toDataFrame=True)
    nInstances = regFrm.shape[0]

    # count dn probe freq, aligned to the modz probe order
    dnNested = regFrm['dn50_lm'].values
    dnArray = [item for sublist in dnNested for item in sublist]
    dnCounts = pd.Series(dnArray).value_counts()
    zDnCounts = dnCounts.reindex_like(modZed)
    # count up probe freq, aligned to the modz probe order
    upNested = regFrm['up50_lm'].values
    upArray = [item for sublist in upNested for item in sublist]
    upCounts = pd.Series(upArray).value_counts()
    zUpCounts = upCounts.reindex_like(modZed)

    # adjust marker size: divide by total instances to make for relative
    # frequency, scale to 0-100, probes never listed get size 0
    upMkrs = np.multiply(np.divide(zUpCounts, nInstances), 100)
    dnMkrs = np.multiply(np.divide(zDnCounts, nInstances), 100)
    upMkrs = upMkrs.replace(np.nan, 0)
    dnMkrs = dnMkrs.replace(np.nan, 0)
    # positional access below, so work on the underlying arrays
    upSizes = upMkrs.values
    dnSizes = dnMkrs.values

    # make plot
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for j in range(len(modZed)):
        ax.plot(j, 1, 'r.', markersize=upSizes[j], alpha=.25)
        ax.plot(j, 1, 'b.', markersize=dnSizes[j], alpha=.25)
def analyze_query(args,work_dir):
    '''
    Analyze the output from query_tool - find self-connections and create graphs

    For every column of the query gctx (args.res) the signatures of the
    same compound are looked up in mongo, located in the score-sorted
    result column, and a histogram of their ranks is saved to work_dir as
    <pert_desc>_<pert_dose>_query_rank.png.

    Parameters
    ----------
    args : namespace
        reads args.res, args.result and args.resultDir
    work_dir : str
        searched for the '*COMBINED*.gctx' result and receives the PNGs

    NOTE(review): when args.result is truthy the combined gctx from
    work_dir is read, otherwise args.resultDir - confirm this matches the
    intent of the command line flags.
    '''
    # make a gct object
    db = gct.GCT()
    db.read(args.res)
    # load query result - gctx file
    rslt = gct.GCT()
    if args.result:
        # combined result gctx in working dir created from build_query step
        outGctx = glob.glob(os.path.join(work_dir, '*COMBINED*.gctx'))
        rslt.read(outGctx[0])
    else:
        rslt.read(args.resultDir)

    rsltSigID = rslt.get_rids() #sig IDs from result file
    qPert = db.get_column_meta('pert_desc')
    qPertID = db.get_column_meta('pert_id')
    qDose = db.get_column_meta('pert_dose')
    ESmat = rslt.matrix
    # per column: row indices ordered from highest to lowest score
    iES = ESmat.argsort(axis=0)[::-1]
    # number of rows; shape is used so a single-column result also works
    n_inst = iES.shape[0]

    prog1 = progress.DeterminateProgressBar('creating self-connection graphs')
    # normalized mean / median ranks (collected, not used further here)
    avRnk = []
    medRnk = []
    # one mongo client serves every query
    CM = mu.CMapMongo()
    n_pert = len(qPert)
    for i, cmpd1 in enumerate(qPert):
        prog1.update('graphing {0}'.format(cmpd1),i,n_pert)
        # sig ids of the result rows in rank order for this column
        sSigID = [rsltSigID[y] for y in iES[:,i]]
        dose1 = qDose[i]
        # first 13 characters of the pert_id (the BRD-xxxxxxxxx core)
        qStr = qPertID[i][0:13]
        # search for the BRD-xxxxxxxxxxx within the pert_id field in the db
        cmpdSigIds = CM.find({'pert_id':{'$regex':qStr}},{'sig_id':True})
        # where instances of the compound of interest sit on the rank list
        i1 = [sSigID.index(y) for y in cmpdSigIds]
        if len(i1) < 1:
            print(cmpd1 + ' has no instances in the cmap database')
            continue
        i2 = numpy.array(i1)
        # mean() avoids the integer truncation of sum()/len() on Python 2
        avr = i2.mean() #what is the average ES rank
        md = numpy.median(i2) # what is the median ES rank
        nAv = float(avr)/n_inst #normalize acording to number of instances in db
        nMd = float(md)/n_inst #normalized median
        avRnk.append(nAv)
        medRnk.append(nMd)

        # plot
        fname = cmpd1 + '_' + dose1 + '_query_rank.png'
        outf = os.path.join(work_dir,fname)
        fig = plt.figure(figsize=(8.0, 2.0))
        ax = fig.add_subplot(111)
        # the histogram of the data
        ax.hist(i2, 30, facecolor='green', alpha=0.75)
        #ax.set_xlim(0, n_inst)
        ax.set_xlim(0, int(round(n_inst,-5))) #round instances to nearest 100k
        ax.set_xlabel('query rank')
        ax.set_ylabel('freq')
        ax.set_title('dose = '+ str(dose1) +'um')
        ax.grid(True)
        plt.savefig(outf, bbox_inches=0)
        # release the figure - one is created per column
        plt.close(fig)
def build_probe_curves_and_summary(args,work_dir):
    '''
    builds dose response curves for each pert for the specified probe

    For every perturbagen found in the column ids of args.res a dose vs
    z-score plot is saved to work_dir. One line per perturbagen is written
    to <args.probe>_summary.txt holding the baseline and "best" dose and
    the correlation of the dose curve with four prototype curves.

    Parameters
    ----------
    args : namespace
        reads args.res (gctx path) and args.probe (probe id, also used in
        the output file names)
    work_dir : str
        directory that receives the PNG files and the summary table
    '''
    # instantiate a progress object
    prog = progress.DeterminateProgressBar('Dose Analysis')

    # read the specified probe from the input gctx file
    gcto = gct.GCT()
    probe_ind = gcto.get_gctx_rid_inds(args.res,match_list=args.probe,exact=True)
    gcto.read_gctx_matrix(args.res,row_inds=probe_ind)

    # grab the cids from the file and mine dose information from them.  Find
    # all of the unique perts
    cids = gcto.get_gctx_cid(args.res)
    doses = [float(x.split(':')[2]) for x in cids]
    perts = [x.split(':')[1] for x in cids]
    unique_perts = list(set(perts))

    # for each unique pert_id, find the dose that deviates from the base dose
    # the most.  Do template matching to prototype curves.  Output a report
    num_perts = len(unique_perts)
    curve_names = ['linear','log','half-log','quarter-log']
    CM = mu.CMapMongo()
    with open(os.path.join(work_dir,args.probe + '_summary.txt'),'w') as f:
        headers = ['pert_id','pert_desc','base_dose','base_z_score',
                   'best_dose','best_z_score', 'best_z_score_delta',
                   'linear','log','half-log','quarter-log','called shape']
        f.write('\t'.join(headers) + '\n')
        for i,unique_pert in enumerate(unique_perts):
            prog.update('analyzing {0}'.format(args.probe),i,num_perts)

            # grab the z-scores and doses for the current pert and sort the
            # pairs by dose (substring match on the cid)
            cid_inds = [ind for ind,cid in enumerate(cids) if unique_pert in cid]
            pert_scores = gcto.matrix[0,cid_inds]
            pert_doses = [doses[ind] for ind in cid_inds]
            pert_doses,pert_scores = zip(*sorted(zip(pert_doses,pert_scores)))

            # build the dose response plot for the current pert and save it
            plt.plot(pert_doses,pert_scores)
            plt.title('::'.join([unique_pert,args.probe]))
            plt.xlabel('dose')
            plt.ylabel('z-score')
            plt.savefig(os.path.join(work_dir,'_'.join([unique_pert.replace(':','_'),args.probe,'dose_curve.png'])))
            plt.close()

            # grab the pert_desc from mongo, '-666' when nothing is found
            pert_desc = CM.find({'pert_id':unique_pert},{'pert_desc':True},limit=1)
            if not pert_desc:
                pert_desc = ['-666']
            pert_desc = pert_desc[0]

            # the lowest dose is the baseline; "best" is the dose whose
            # z-score lies furthest below the baseline z-score
            base_dose = pert_doses[0]
            base_z_score = pert_scores[0]
            z_delta = (numpy.array(pert_scores) + 10) - (base_z_score + 10)
            z_delta = z_delta.tolist()
            best_ind = z_delta.index(numpy.min(z_delta))
            best_dose = pert_doses[best_ind]
            best_z_score = pert_scores[best_ind]
            best_z_score_delta = z_delta[best_ind]

            n_doses = len(pert_doses)
            if n_doses > 1:
                # build prototype curves if there is more than one dose:
                # a linear ramp plus three curves drawn from _log_gen
                proto = [numpy.linspace(1,10,n_doses)]
                for step in (1, .5, .25):
                    log_gen = _log_gen(step)
                    proto.append([next(log_gen) for _ in range(n_doses)])
                curves = numpy.array(proto)

                # correlation of the pert dose curve (row 0) with each of
                # the prototype curves (rows 1-4)
                corrs = numpy.corrcoef(pert_scores,curves)
                linear_corr = corrs[0][1]
                log_corr = corrs[0][2]
                half_log_corr = corrs[0][3]
                quarter_log_corr = corrs[0][4]

                # report the best shape by finding the best absolute
                # correlation, but only call it when one exceeds .8
                abs_corr = numpy.abs(corrs[0][1:])
                if (abs_corr > .8).any():
                    max_curve_name = curve_names[int(numpy.argmax(abs_corr))]
                else:
                    max_curve_name = 'none'
            else:
                # if there is only one dose, set all corrs to 'nan'
                linear_corr = 'nan'
                log_corr = 'nan'
                half_log_corr = 'nan'
                quarter_log_corr = 'nan'
                max_curve_name = 'none'

            # write the dose data to the summary file
            data = [unique_pert,pert_desc,str(base_dose),str(base_z_score),
                    str(best_dose),str(best_z_score),str(best_z_score_delta),
                    str(linear_corr),str(log_corr),str(half_log_corr),
                    str(quarter_log_corr),max_curve_name]
            f.write('\t'.join(data) + '\n')
    prog.clear()
def load_expression_data(self,
                         max_signatures_per_cp=3,
                         groups_to_model=None,
                         keep_by_cell_line=False):
    '''
    -search for z-score data of compounds that fall into one of the different classes
    -limit the number of signatures per compound
    -load in z-score data signatures
    -store the probe ids on self.probe_ids and the merged
     z-score + annotation frame on self.signature_frame

    Parameters
    ----------
    groups_to_model : list
        -list of group names in the pclDict
        -default is to use all keys
    max_signatures_per_cp : int
        maximum number of signatures per compound to incorporate into the classifier
        (to avoid overfitting to compounds with many signatures)
    keep_by_cell_line : bool
        -if True - keep n number of signatues per cell line
        -if False - keep first n signatures regardless of cell line
    '''
    # identity test: '==' would compare element-wise for array-like input
    if groups_to_model is None:
        groups_to_model = self.pclDict.keys()
    brdAllGroups = []
    for group in groups_to_model:
        brdAllGroups.extend(self.pclDict[group])

    CM = mu.CMapMongo()
    # gold signatures of the selected compounds; set minimum dose
    goldQuery = CM.find(
        {'is_gold': True,
         'pert_id': {'$in': brdAllGroups},
         'pert_dose': {'$gt': 1}},
        {'sig_id': True,
         'pert_id': True,
         'cell_id': True,
         'pert_time': True,
         'is_gold': True,
         'pert_iname': True},
        toDataFrame=True)
    goldQuery.index = goldQuery['sig_id']
    # asign drug class labels
    goldQuery = self.set_class_labels(goldQuery)
    # reduce signatures to prevent overfitting to one compound
    droppedQ = self.cut_signatures(goldQuery,
                                   nKeep=max_signatures_per_cp,
                                   keep_by_cell_line=keep_by_cell_line)
    sigList = droppedQ['sig_id'].values

    ### load in expression data for the two sets of signatures
    afPath = cmap.score_path
    gt = gct.GCT()
    gt.read(src=afPath, cid=sigList, rid='lm_epsilon')
    # transpose so that rows are signatures and columns are probes
    zFrm = gt.frame.T
    self.probe_ids = zFrm.columns
    ## merge z-scores with the signature annotations
    zFrm = pd.concat([zFrm, droppedQ], axis=1)
    self.signature_frame = zFrm
def classification_by_cell(self, loo_type='by_cp'):
    '''
    -For each cell line in self.core_cell_lines, build a separate
     linear SVM classifier
    -evaluate model with leave one out cross val.
    -the per cell line frames (with an 'svm_prediction' column) are stacked
     into self.modelFrame; the fraction of signatures whose prediction
     equals its label is stored per cell line in self.model_accuracy

    Parameters
    ----------
    loo_type : str
        strategy for leave one out validation:
            'by_cp' - leaves out all signatures for a given compounds
            'by_sig' - leaves out individual signatures
    '''
    stackedFrm = pd.DataFrame()
    hitRates = {}
    for cellLine in self.core_cell_lines:
        mongo = mu.CMapMongo()
        # gold signatures of all group compounds in this cell line,
        # pert_dose above 1 (minimum dose)
        sigInfo = mongo.find(
            {'is_gold': True,
             'pert_id': {'$in': self.all_group_cps},
             'cell_id': cellLine,
             'pert_dose': {'$gt': 1}},
            {'sig_id': True,
             'pert_id': True,
             'cell_id': True,
             'pert_time': True,
             'is_gold': True,
             'pert_iname': True},
            toDataFrame=True)
        sigInfo.index = sigInfo['sig_id']
        # asign drug class labels
        sigInfo = self.set_class_labels(sigInfo)
        # reduce signatures to prevent overfitting to one compound
        keptInfo = self.cut_signatures(sigInfo)
        keptSigs = keptInfo['sig_id'].values

        ### load in expression data (rows = signatures after transpose)
        scorePath = cmap.score_path
        reader = gct.GCT()
        reader.read(src=scorePath, cid=keptSigs, rid='lm_epsilon')
        zFrm = reader.frame.T
        probeIDs = zFrm.columns
        ## merge z-scores with the signature annotations
        zFrm = pd.concat([zFrm, keptInfo], axis=1)

        ### perform leave one out validation
        if loo_type == 'by_cp':
            zFrm['svm_prediction'] = np.nan
            # leave each compound out in turn, train on the rest, then
            # predict the left out signatures
            for brd in set(zFrm['pert_id']):
                isHeldOut = zFrm['pert_id'] == brd
                restFrm = zFrm[~isHeldOut]
                trainValues = restFrm.reindex(columns=probeIDs).values
                trainLabels = restFrm['labels'].values
                # C is the SVM regularization parameter
                model = svm.SVC(kernel='linear', C=1.0)
                model = model.fit(trainValues, trainLabels)
                heldOut = zFrm.ix[isHeldOut, probeIDs]
                guesses = model.predict(heldOut.values)
                zFrm['svm_prediction'][heldOut.index] = guesses
        elif loo_type == 'by_sig':
            guessBySig = {}
            # leave each signature out in turn
            for sig in zFrm.index:
                restFrm = zFrm[zFrm.index != sig]
                trainValues = restFrm.reindex(columns=probeIDs).values
                trainLabels = restFrm['labels'].values
                # C is the SVM regularization parameter
                model = svm.SVC(kernel='linear', C=1.0)
                model = model.fit(trainValues, trainLabels)
                heldOut = zFrm.ix[sig, probeIDs]
                guessBySig[sig] = model.predict(heldOut.values)[0]
            guessSer = pd.Series(guessBySig)
            guessSer.name = 'svm_prediction'
            zFrm = pd.concat([zFrm, pd.DataFrame(guessSer)], axis=1)

        stackedFrm = pd.concat([stackedFrm, zFrm], axis=0)
        isCorrect = zFrm['labels'] == zFrm['svm_prediction']
        hitRates[cellLine] = isCorrect.sum() / float(isCorrect.shape[0])
    self.modelFrame = stackedFrm
    self.model_accuracy = hitRates
# read the target sheet: tab separated columns are pert_id, description
# and a ';' separated target list.  Each line read is additionally split on
# '\r' before the fields are parsed
with open(targetSheetF, 'rt') as f:
    for string in f:
        for line in string.split('\r'):
            fields = line.split('\t')
            pID = fields[0] #the pert_id listed the line
            pDesc = fields[1]
            targets = fields[2].split(';')
            # keep only perts whose first target is an actual gene entry
            if targets[0] not in ('', '?', '-666'):
                targetDict[pID] = targets
                pDescDict[pID] = pDesc

# gold A375 signatures: over-expression (test1) and shRNA (test2)
db = mu.CMapMongo()
test1 = db.find({'cell_id': 'A375',
                 'is_gold': True,
                 'pert_type': 'trt_oe'},
                {'sig_id': 1})
test2 = db.find({'cell_id': 'A375',
                 'is_gold': True,
                 'pert_type': 'trt_sh'},
                {'sig_id': 1})
# keep elements 1 through 9 of each result
test1 = test1[1:10]
test2 = test2[1:10]
# t = dgo.Oracle(test1,test2,out=work_dir + '/Oracle')
# t.compute_scores()
# t.get_results()
def external_qq(args,work_dir):
    '''
    make a qq plot of each unique instance - plot the size of each probe
    acording to how often it occurs in the affogato top/bottom 50 list

    For every pert_desc that occurs at least twice in args.res, the mongo
    signatures of the compound are fetched and the up50_lm / dn50_lm
    membership of each probe is counted. One qq plot per dose is saved to
    work_dir as <pert>_<dose>um_connection_qq.png.

    Parameters
    ----------
    args : namespace
        reads args.res (gct/gctx path)
    work_dir : str
        directory that receives the PNG files
    '''
    # make a gct object
    db = gct.GCT()
    db.read(args.res)
    qPert = db.get_column_meta('pert_desc')
    qPertID = db.get_column_meta('pert_id')
    qDose = db.get_column_meta('pert_dose')
    probeIDs = db.get_row_meta('id')
    # first row position of every probe id, for constant time lookups
    probePos = {}
    for pos,prb in enumerate(probeIDs):
        probePos.setdefault(prb, pos)

    # set null distirbution of z-scores (currently normal): one sorted draw
    # per row of the matrix
    ESmat = db.matrix
    null_mean, sigma = 0, 1
    s = numpy.random.normal(null_mean, sigma, ESmat.shape[0])
    s.sort()

    # one mongo client serves every query
    CM = mutil.CMapMongo()
    for pert in set(qPert):
        iP = _all_indices(pert, qPert) #index of doses on plate
        if len(iP) < 2:
            print(pert + ' has only one instance')
            continue
        # order this pert's columns by dose
        fDose = [float(qDose[i]) for i in iP]
        iD = numpy.asarray(fDose).argsort()
        iPo = [iP[i] for i in iD]

        # mongo query for each unique pertID, on the first 13 characters
        qStr = qPertID[iPo[0]][0:13]
        # search for the BRD-xxxxxxxxxxx within the pert_id field in the db
        edge50Lst = CM.find({'pert_id':{'$regex':qStr}},{'sig_id':True,'up50_lm':True,'dn50_lm':True,'cell_id':True})
        nInstances = len(edge50Lst) #number of instances in db

        # count number of times a probe is in the top/bottom 50 genes of an
        # instance; probes that are not rows of the matrix are ignored
        upProbeCnts = [0] * len(probeIDs)
        dnProbeCnts = [0] * len(probeIDs)
        for inst in edge50Lst:
            for prb in inst['up50_lm']:
                iPrb = probePos.get(prb)
                if iPrb is not None:
                    upProbeCnts[iPrb] += 1
            for prb in inst['dn50_lm']:
                iPrb = probePos.get(prb)
                if iPrb is not None:
                    dnProbeCnts[iPrb] += 1
        upCntArr = numpy.array(upProbeCnts, dtype=float)
        dnCntArr = numpy.array(dnProbeCnts, dtype=float)

        # loop through each dose
        for d in iPo:
            dose1 = qDose[d]
            zLst = db.matrix[:,d]
            iLst = zLst.argsort() #sort z-scores and save index
            sLst = zLst[iLst]
            # probe counts in z-score order, divided by total instances to
            # make for relative frequency and scaled to 0-100
            upMkrs = numpy.multiply(numpy.divide(upCntArr[iLst],nInstances),100)
            dnMkrs = numpy.multiply(numpy.divide(dnCntArr[iLst],nInstances),100)

            fig = plt.figure()
            ax = fig.add_subplot(111)
            ax.plot(s,s,'b')
            for j,sl in enumerate(sLst):
                ax.plot(s[j],sl,'r.',markersize=upMkrs[j],alpha=.25)
                ax.plot(s[j],sl,'b.',markersize=dnMkrs[j],alpha=.25)
            ax.set_ylabel('observed z-score')
            ax.set_xlabel('expected z-score')
            # legend markers: plot returns a list, so unpack the single
            # line to hand real artists to legend
            r1, = ax.plot(0,0,'r.',markersize=100,alpha=.25)
            b1, = ax.plot(0,0,'b.',markersize=100,alpha=.25)
            legStrUp = 'probe in 100% of ' + str(nInstances) + ' UP instances'
            legStrDn = 'probe in 100% of ' + str(nInstances) + ' DN instances'
            plt.legend([r1, b1], [legStrUp, legStrDn], numpoints=1, loc=4)
            ax.set_title(pert + ' dose = ' + dose1)
            fname = pert + '_' + dose1 + 'um_connection_qq.png'
            outf = os.path.join(work_dir,fname)
            plt.savefig(outf, bbox_inches=0)
            # release the figure - one is created per dose
            plt.close(fig)
def analyze_query(args,work_dir):
    '''
    Analyze the output from query_tool - find self-connections and create graphs

    For every pert_desc that occurs at least twice in args.res, the mongo
    signatures of the compound are looked up and, for each dose, located in
    the score-sorted result column. A histogram of their ranks is saved to
    work_dir as <pert_desc>_<pert_dose>_query_rank.png.

    Parameters
    ----------
    args : namespace
        reads args.res, args.result and args.resultDir
    work_dir : str
        searched for the '*COMBINED*.gctx' result and receives the PNGs

    NOTE(review): when args.result is truthy the combined gctx from
    work_dir is read, otherwise args.resultDir - confirm this matches the
    intent of the command line flags.
    '''
    # make a gct object
    db = gct.GCT()
    db.read(args.res)
    # load query result - gctx file
    rslt = gct.GCT()
    if args.result:
        # combined result gctx in working dir created from build_query step
        outGctx = glob.glob(os.path.join(work_dir, '*COMBINED*.gctx'))
        rslt.read(outGctx[0])
    else:
        rslt.read(args.resultDir)

    rsltSigID = rslt.get_rids() #sig IDs from result file
    qPert = db.get_column_meta('pert_desc')
    qPertID = db.get_column_meta('pert_id')
    qDose = db.get_column_meta('pert_dose')
    ESmat = rslt.matrix
    # per column: row indices ordered from highest to lowest score
    iES = ESmat.argsort(axis=0)[::-1]
    # number of rows; shape is used so a single-column result also works
    n_inst = iES.shape[0]

    # created as before although this version never updates it
    prog1 = progress.DeterminateProgressBar('creating self-connection graphs')
    # normalized mean / median ranks and fraction of top ranked instances
    # (collected, not used further here)
    avRnk = []
    medRnk = []
    prRnk = []
    # ranks up to and including this value count as "top of the list"
    top_cutoff = 1000
    # one mongo client serves every query
    CM = mutil.CMapMongo()
    # loop through each of the UNIQUE perts - graph ranks of query
    for pert in set(qPert):
        iP = _all_indices(pert, qPert) #index of doses on plate
        if len(iP) < 2:
            print(pert + ' has only one instance')
            continue
        # order this pert's columns by dose
        fDose = [float(qDose[i]) for i in iP]
        iD = numpy.asarray(fDose).argsort()
        iPo = [iP[i] for i in iD]

        # run pymongo query on the first 13 characters of the pert_id
        qStr = qPertID[iPo[0]][0:13]
        # search for the BRD-xxxxxxxxxxx within the pert_id field in the db
        cmpdSigIds = CM.find({'pert_id':{'$regex':qStr}},{'sig_id':True})
        if len(cmpdSigIds) < 2:
            print(pert + ' has one or no instances in the cmap database')
            continue

        # loop through each dose
        for d in iPo:
            cmpd1 = qPert[d]
            dose1 = qDose[d]
            # sig ids of the result rows in rank order for this column
            sSigID = [rsltSigID[y] for y in iES[:,d]]
            # where instances of the compound of interest sit on the rank list
            i1 = [sSigID.index(y) for y in cmpdSigIds]
            i2 = numpy.array(i1)
            # mean() avoids the integer truncation of sum()/len() on Python 2
            avr = i2.mean() #what is the average ES rank
            md = numpy.median(i2) # what is the median ES rank
            nAv = float(avr)/n_inst #normalize acording to number of instances in db
            nMd = float(md)/n_inst #normalized median
            # fraction of instances at the top of the list
            ntop = [x for x in i1 if x <= top_cutoff]
            nPr = float(len(ntop))/(len(i1))
            prRnk.append(nPr)
            avRnk.append(nAv)
            medRnk.append(nMd)

            # plot
            fname = cmpd1 + '_' + dose1 + '_query_rank.png'
            outf = os.path.join(work_dir,fname)
            fig = plt.figure(figsize=(8.0, 2.0))
            ax = fig.add_subplot(111)
            # the histogram of the data
            ax.hist(i2, 30, facecolor='green', alpha=0.75)
            #ax.set_xlim(0, n_inst)
            ax.set_xlim(0, int(round(n_inst,-5))) #round instances to nearest 100k
            ax.set_xlabel('query rank')
            ax.set_ylabel('freq')
            ax.set_title('dose = '+ str(dose1) +'um')
            ax.grid(True)
            plt.savefig(outf, bbox_inches=0)
            # release the figure - one is created per dose
            plt.close(fig)
os.mkdir(wkdir)

# read the clique definitions and collect the unique members of the
# 'sig' lists
cliqueGMT = gmt.read(gFile)
cliqFrm = pd.DataFrame(cliqueGMT)
# unstack nested list
cliqMemberLong = []
for sublist in cliqFrm.sig.values:
    for item in sublist:
        cliqMemberLong.append(item)
cliqMemb = list(set(cliqMemberLong))

# load summly matrix
summMtrx = '/xchip/cogs/projects/connectivity/summly/matrices/matched_mrp4_n7147x7147.gctx'
gt = gct.GCT()
gt.read(summMtrx)
summFrm = gt.frame

### get info on drug signatures
MC = mu.CMapMongo()
pertInfo = MC.find(
    {'pert_id': {'$in': cliqMemb}},
    {'sig_id': True,
     'cell_id': True,
     'pert_id': True,
     'pert_iname': True,
     'is_gold': True},
    toDataFrame=True)

# tabulate signature stats: one row per pert_id, seven columns
pertGrped = pertInfo.groupby('pert_id')
nDrugs = len(pertGrped.groups)
Zs = np.zeros((nDrugs, 7))
# get directory
dir1 = '/xchip/cogs/projects/TRIB1'
wkdir = dir1 + '/TRIB1_analysis_Oct21'
if not os.path.exists(wkdir):
    os.mkdir(wkdir)

# define compounds of interest
trib1Cps = [
    'BRD-K75627148',
    'BRD-K35860134',
    'BRD-K67774729',
    'BRD-K16956545',
    'BRD-K16410418',
]

### 1 ) get signature info
# gold signatures of the compounds of interest, indexed by sig_id and
# written out as a tab separated table
CM = mu.CMapMongo()
qRes = CM.find(
    {'pert_id': {'$in': trib1Cps}, 'is_gold': True},
    {'sig_id': True,
     'pert_iname': True,
     'pert_id': True,
     'pert_mfc_id': True,
     'cell_id': True,
     'pert_time': True,
     'is_gold': True},
    toDataFrame=True)
qRes.index = qRes['sig_id']
outF = wkdir + '/TRIB1_signature_details.txt'
qRes.to_csv(outF, sep='\t', index=False)

# print the number of signatures found for each compound
grped = qRes.groupby('pert_id')
for grp in grped.groups:
    nSig = len(grped.groups[grp])
    # print grp + ' ' + str(nSig)
    print(str(nSig))

### descriptive data on signatures
#number of cell lines per compound