def load_pcls(classGMT='/xchip/cogs/projects/pharm_class/pcl_shared_target_pid.gmt',
              curatedFile='/xchip/cogs/hogstrom/analysis/scratch/pcl_keepers_mod_currated.txt'):
    """Load PCL groups, limited to a set of pre-curated PCLs.

    Parameters
    ----------
    classGMT : str
        Path of the GMT file handed to ``gmt.read``.
    curatedFile : str
        Path of a header-less CSV with one column; each row is read as the
        name of a curated group.

    Returns
    -------
    pandas.Series
        The 'sig' entry of every GMT group whose normalized id appears in
        the curated list, indexed by group name (per the original note the
        values are the BRDs of each group).
    """
    gmtDict = gmt.read(classGMT)
    drugLabels = pd.DataFrame(gmtDict)

    # Normalize group ids: each of / - space & ? ( ) ' becomes "_", then the
    # id is lower-cased. A single character class is used because "?", "("
    # and ")" are regex metacharacters and are not valid patterns on their
    # own when str.replace interprets the pattern as a regular expression.
    drugLabels['id'] = drugLabels['id'].str.replace(r"[/\- &?()']", "_", regex=True)
    drugLabels['id'] = drugLabels['id'].str.lower()

    # load curated list of groups
    curFrm = pd.read_csv(curatedFile, header=None)
    curFrm.columns = ['curated_groups']

    dlSer = pd.Series(data=drugLabels['sig'])
    dlSer.index = drugLabels['id']

    # keep only the curated groups that actually exist in the GMT
    isCur = curFrm.curated_groups.isin(dlSer.index)
    curGroups = dlSer.reindex(curFrm.curated_groups[isCur])
    return curGroups
def load_clique_set_n69(self):
    '''
    Load the drug label set curated by Rajiv (n69 compound cliques).

    Reads the clique GMT and returns a dict that maps every clique id to
    the set built from that clique's 'sig' entry.
    '''
    clique_path = '/xchip/cogs/sig_tools/sig_cliquescore_tool/sample/cp_clique_n69/clique.gmt'
    clique_frame = pd.DataFrame(gmt.read(clique_path))
    # one dict entry per GMT row: id -> set of members
    return {row['id']: set(row['sig']) for _, row in clique_frame.iterrows()}
def load_clique_set_n69(self):
    '''
    Load the drug label set curated by Rajiv (n69 compound cliques).

    Reads the clique GMT and returns a dict that maps every clique id to
    the set built from that clique's 'sig' entry.
    '''
    clique_path = '/xchip/cogs/sig_tools/sig_cliquescore_tool/sample/cp_clique_n69/clique.gmt'
    clique_frame = pd.DataFrame(gmt.read(clique_path))
    # one dict entry per GMT row: id -> set of members
    return {row['id']: set(row['sig']) for _, row in clique_frame.iterrows()}
### add lines for gct headers
# NOTE(review): line_pre_adder is defined elsewhere; it presumably prepends
# a line to outFile, which would leave "#1.2" above the dimensions line.
dimsLine = str(mtrx.shape[0]) + '\t' + str(mtrx.shape[1] - 1)
line_pre_adder(outFile, dimsLine)
line_pre_adder(outFile, "#1.2")

### make gmts of gene shRNAs
# one GMT entry per pert_id, listing the sig_ids annotated with it
geneGrped = annt.groupby('pert_id')
gmtList = []
for pertId, pertFrm in geneGrped:
    gmtList.append({
        'id': pertId,
        'desc': pertId,
        'sig': list(pertFrm.sig_id.values),
    })
# gmtOut = wkdir + '/gene_shRNA_sig_id.gmt'
gmtOut = wkdir + '/gene_oe_sig_id.gmt'
gmt.write(gmtList, gmtOut)

### load core drivers - save sig_ids to new gmt
gFile = wkdir + '/core_lung_drivers.gmt'
coreGMT = gmt.read(gFile)
coreOE = coreGMT['sig']
# annotations whose pert_id is one of the core drivers
coreFrm = annt[annt.pert_id.isin(coreOE)]
sig_ids = list(coreFrm.sig_id.values)
gmtDict = {
    'id': 'core_lung_drivers',
    'desc': 'core_lung_drivers',
    'sig': sig_ids,
}
gmtOut = wkdir + '/core_lung_drivers_sig_id.gmt'
gmt.write([gmtDict], gmtOut)
import copy
from matplotlib import cm
from statsmodels.distributions import ECDF
from cmap.analytics.statsig import ConnectivitySignificance
from cmap.io import gct
from cmap.io import gmt
import cmap.util.progress as update

# working directory for this analysis; makedirs also creates any missing
# parent directories, which os.mkdir would fail on
wkdir = '/xchip/cogs/projects/connectivity/null/clique_analysis/clique_vs_dmso_null'
if not os.path.exists(wkdir):
    os.makedirs(wkdir)

# load cliques
classGMT = '/xchip/cogs/projects/pharm_class/cp_cliques_current.gmt'
gmtDict = gmt.read(classGMT)
cliqueLabels = pd.DataFrame(gmtDict)
# create set of all clique members (flatten the per-clique 'sig' lists)
cList = [item for sublist in cliqueLabels['sig'] for item in sublist]
cSet = set(cList)

# load observed score data
# thresholded
# rFile = '/xchip/cogs/projects/connectivity/null/clique_analysis/dmso_q_thresholded_asym_lass_matrix/jan28/my_analysis.sig_cliqueselect_tool.2014012814320559/summly/self_rankpt_n379x379.gctx'
# non-thresholded asym
rFile = '/xchip/cogs/projects/connectivity/null/clique_analysis/baseline_lass_asym_matrix/jan28/my_analysis.sig_cliqueselect_tool.2014012814364180/summly/self_rankpt_n379x379.gctx'
gt1 = gct.GCT()
gt1.read(rFile)
sFrm = gt1.frame
# relabel the matrix columns with the pert_id column metadata
sFrm.columns = gt1.get_column_meta('pert_id')
#check that all clique members are in the observed matrix
sigInfoFrm['labels'] = np.nan sigInfoFrm['pcl_name'] = 'null' for igroup,group in enumerate(test_groups): grpMembers = pclDict[group] iMatch = sigInfoFrm['pert_id'].isin(grpMembers) sigInfoFrm['labels'][iMatch] = igroup sigInfoFrm['pcl_name'][iMatch] = group return sigInfoFrm wkdir = '/xchip/cogs/projects/NMF/clique_n69_all_cell_lines' if not os.path.exists(wkdir): os.mkdir(wkdir) #load in clique annotations and matrix cFile = '/xchip/cogs/sig_tools/sig_cliquescore_tool/sample/cp_clique_n69/clique.gmt' cliqueGMT = gmt.read(cFile) cliqFrm = pd.DataFrame(cliqueGMT) # set grouping structures pclDict = {} for x in cliqFrm.iterrows(): pclDict[x[1]['id']] = set(x[1]['sig']) # brdAllGroups = [] for group in pclDict: brdAllGroups.extend(pclDict[group]) brdAllGroups.append('DMSO') brdAllGroups = list(set(brdAllGroups)) testGroups = cliqFrm['id'].values # extract signatures and expression data for every group member
# write the matrix as tab-separated text
mtrx.to_csv(outFile, sep='\t')

### add lines for gct headers
# NOTE(review): line_pre_adder is defined elsewhere; it presumably prepends
# a line to outFile, which would leave "#1.2" above the dimensions line.
dimsLine = str(mtrx.shape[0]) + '\t' + str(mtrx.shape[1] - 1)
line_pre_adder(outFile, dimsLine)
line_pre_adder(outFile, "#1.2")

### make gmts of gene shRNAs
# one GMT entry per pert_id, listing the sig_ids annotated with it
geneGrped = annt.groupby('pert_id')
gmtList = []
for pertId, pertFrm in geneGrped:
    gmtList.append({
        'id': pertId,
        'desc': pertId,
        'sig': list(pertFrm.sig_id.values),
    })
# gmtOut = wkdir + '/gene_shRNA_sig_id.gmt'
gmtOut = wkdir + '/gene_oe_sig_id.gmt'
gmt.write(gmtList, gmtOut)

### load core drivers - save sig_ids to new gmt
gFile = wkdir + '/core_lung_drivers.gmt'
coreGMT = gmt.read(gFile)
coreOE = coreGMT['sig']
# annotations whose pert_id is one of the core drivers
coreFrm = annt[annt.pert_id.isin(coreOE)]
sig_ids = list(coreFrm.sig_id.values)
gmtDict = {
    'id': 'core_lung_drivers',
    'desc': 'core_lung_drivers',
    'sig': sig_ids,
}
gmtOut = wkdir + '/core_lung_drivers_sig_id.gmt'
gmt.write([gmtDict], gmtOut)
'A375', 'A549', 'HA1E', 'HCC515', 'HEPG2', 'HT29', 'MCF7', 'PC3', 'VCAP' ] # cmap 'core' cell lines basedir = '/xchip/cogs/projects/NMF/NMF_drug_shRNA' # drug_gene_list = '/xchip/cogs/projects/target_id/drug_gene_connections_20Mar2014/expected_drug_gene_connection_ranks.txt' # dg = pd.read_csv(drug_gene_list,sep='\t') # dg_connected = dg[dg.connection_rank <= 10] cpd_targets_n368_file = '/xchip/cogs/sig_tools/sig_cliqueselect_tool/sample/cpd_targets_n368/summly/self_connectivity.txt' n368 = pd.read_csv(cpd_targets_n368_file, sep='\t') median_rnkpt_thresh = 73 cp_connected = n368[n368.median_rankpt >= median_rnkpt_thresh] #load in clique annotations and matrix cFile = '/xchip/cogs/projects/pharm_class/rnwork/cliques/cpd_targets_n368.gmt' cliqueGMT = gmt.read(cFile) cliqFrm = pd.DataFrame(cliqueGMT) # limit only to drug-gene groups that have coherence cliqFrm = cliqFrm[cliqFrm.id.isin(cp_connected.group_id)] # write a new, shorter gmt file gmtUpdate = [x for x in cliqueGMT if x['desc'] in cliqFrm.desc.values] outF = basedir + '/n69_drug_targets.gmt' gmt.write(gmtUpdate, outF) ### set parameters probeSpace = 'lm_epsilon' # lm_epsilon or bing nDMSO = 50 nKeep = 2 # number of signatures per drug for cell in cellList: print(cell)
# run through MSIG_DB ### EMT questions # 1) enrichment for any a priori genes # 2) stability across multiple signatures or TFs # 3) any drug class enrichment? # run up/dn signatures through MSIGDB - do they capture other emt signatures? # load kegg pathways file_kegg = '/xchip/cogs/hogstrom/bathe/gordonov/c2.cp.kegg.v4.0.symbols.gmt' gt = gmt.read(file_kegg) keggFrm = pd.DataFrame(gt) GeneList = keggFrm[keggFrm.id == 'KEGG_REGULATION_OF_ACTIN_CYTOSKELETON'].sig.values GeneList = list(GeneList[0]) ### aprioriList = ['RAC1', 'CDC42', 'RHOA', 'ROCK1', 'RICS', 'RHOA', 'PRKCA', 'PIK3CA', 'ARPC1A', 'MAPK',
### shRNA
# anntFrm = pd.read_csv(aFile,sep='\t',index_col=1) #,header=True)
# headers= ['sig2','pert_id']
# anntFrm and headers are expected to have been set earlier in the script
anntFrm.columns = headers
anntFrm.index.name = 'sig1'
# drop extra rows
anntFrm = anntFrm[anntFrm.index.isin(Hmtrx.index)]  # leave out annotations not in matrix

### read in mutual information matrices
# the two leading GCT header lines are skipped; the first column is the index
mFile = sourceDir + '/' + prefix + '/' + modzPrefix + '.MI.input_space.gct'
mi = pd.read_csv(mFile, sep='\t', skiprows=[0, 1], index_col=0)  #,header=True
# axis is passed by keyword: newer pandas no longer accepts it positionally
mi = mi.drop('Description', axis=1)
cFile = sourceDir + '/' + prefix + '/' + modzPrefix + '.MI.k' + str(nComponents) + '.gct'
cmi = pd.read_csv(cFile, sep='\t', skiprows=[0, 1], index_col=0)  #,header=True
cmi = cmi.drop('Description', axis=1)

### load in clique annotations and matrix
cliqueGMT = gmt.read(gmtFile)
cliqFrm = pd.DataFrame([cliqueGMT])

#########################################
### graph individual group components ###
#########################################
group_component_maps(Hmtrx, cliqFrm, graphDir)

# # ##############################
# # ### top component analysis ###
# # ##############################
# # take the mean of the top 3 components for each group member
# topMeanFrm = combine_group_top_components(Hmtrx,cliqFrm,metric='mean')
# # ##############################
# # ### build null distribution ##
# # ##############################
# # repeat metric - but shuffle signatures from groups of equal size
# nullMean = build_combine_null(Hmtrx,cliqFrm,topMeanFrm,nTop=3,nPerm=4000)
# run through MSIG_DB ### EMT questions # 1) enrichment for any a priori genes # 2) stability across multiple signatures or TFs # 3) any drug class enrichment? # run up/dn signatures through MSIGDB - do they capture other emt signatures? # load kegg pathways file_kegg = '/xchip/cogs/hogstrom/bathe/gordonov/c2.cp.kegg.v4.0.symbols.gmt' gt = gmt.read(file_kegg) keggFrm = pd.DataFrame(gt) GeneList = keggFrm[keggFrm.id == 'KEGG_REGULATION_OF_ACTIN_CYTOSKELETON'].sig.values GeneList = list(GeneList[0]) ### aprioriList = ['RAC1', 'CDC42', 'RHOA', 'ROCK1', 'RICS', 'RHOA', 'PRKCA', 'PIK3CA', 'ARPC1A', 'MAPK',
import copy
from matplotlib import cm
from statsmodels.distributions import ECDF
from cmap.analytics.statsig import ConnectivitySignificance
from cmap.io import gct
from cmap.io import gmt
import cmap.util.progress as update

# working directory for this analysis; makedirs also creates any missing
# parent directories, which os.mkdir would fail on
wkdir = '/xchip/cogs/projects/connectivity/null/clique_analysis/clique_vs_dmso_null'
if not os.path.exists(wkdir):
    os.makedirs(wkdir)

# load cliques
classGMT = '/xchip/cogs/projects/pharm_class/cp_cliques_current.gmt'
gmtDict = gmt.read(classGMT)
cliqueLabels = pd.DataFrame(gmtDict)
# create set of all clique members (flatten the per-clique 'sig' lists)
cList = [item for sublist in cliqueLabels['sig'] for item in sublist]
cSet = set(cList)

# load observed score data
# thresholded
# rFile = '/xchip/cogs/projects/connectivity/null/clique_analysis/dmso_q_thresholded_asym_lass_matrix/jan28/my_analysis.sig_cliqueselect_tool.2014012814320559/summly/self_rankpt_n379x379.gctx'
# non-thresholded asym
rFile = '/xchip/cogs/projects/connectivity/null/clique_analysis/baseline_lass_asym_matrix/jan28/my_analysis.sig_cliqueselect_tool.2014012814364180/summly/self_rankpt_n379x379.gctx'
gt1 = gct.GCT()
gt1.read(rFile)
sFrm = gt1.frame
# relabel the matrix columns with the pert_id column metadata
sFrm.columns = gt1.get_column_meta('pert_id')
#check that all clique members are in the observed matrix
import matplotlib from cmap.analytics.pert_explorer import PertExplorer from cmap.analytics.cluster import HClust import cmap.io.gmt as gmt from matplotlib import cm from cmap.analytics.queryer import Queryer import cmap.analytics.gppa as gppa from cmap.io import queryresult wkdir = '/xchip/cogs/projects/target_id/pathway_clustering/lfcg_gppa_KD_spearman' if not os.path.exists(wkdir): os.makedirs(wkdir) #pathway annotations from reactome pathGMT = '/xchip/cogs/projects/target_id/KD_pathway_clustering/ReactomePathways.gmt' gmtDict = gmt.read(pathGMT) pathwayDict = {} for dict1 in gmtDict: pathwayDict[dict1['id']] = dict1['sig'] # aprioriPathways = ['Cholesterol biosynthesis', # 'p53-Dependent G1 DNA Damage Response', # 'p53-Dependent G1/S DNA damage checkpoint', # 'Antigen processing: Ubiquitination & Proteasome degradation', # 'Regulation of activated PAK-2p34 by proteasome mediated degradation', # 'Signaling by TGF-beta Receptor Complex', # 'mTOR signalling'] # Lessons from the cancer genome pathways: lfcgPathways = [ 'p38MAPK events', 'ERK/MAPK targets',
# build "<pert_iname>.<pert_type>" labels, one per perturbagen
for i, _iname in enumerate(pInames):
    pInameType.append(pInames[i] + '.' + pType[i])

# annotation table indexed by the combined label
anntFrm = pd.DataFrame(
    {'pert_id': pIDs, 'pert_type': pType, 'pert_iname': pInames},
    index=pInameType,
)

# write the summly column labels out as a .grp file
sigSer = pd.Series(index=summFrm.index, data=summFrm.columns)
outGRP = wkdir + '/summly_matched_ids.grp'
sigSer.to_csv(outGRP, index=False)

####################
### load cliques ###
####################
# groupGMT = '/xchip/cogs/projects/pharm_class/rnwork/cliques/cpd_groups_n147.gmt'
groupGMT = '/xchip/cogs/sig_tools/sig_cliqueselect_tool/sample/pcl_20140213/cliques.gmt'
# groupGMT = '/xchip/cogs/sig_tools/sig_cliqueselect_tool/sample/pcl_20140221/cliques.gmt'
cliqueGMT = gmt.read(groupGMT)
cliqFrm = pd.DataFrame(cliqueGMT)
cliqFrm['group_size'] = cliqFrm.sig.apply(len)
cliqFrm.index = cliqFrm['desc']
# NOTE(review): the result of this replace is not assigned, so cliqFrm is
# left unchanged by the line below — confirm whether an assignment was intended.
cliqFrm['Name'].str.replace("/","-")

# which compounds are clique members vs. non-members
cliqMemberLong = [item for sublist in cliqFrm.sig.values for item in sublist]
cliqMemb = list(set(cliqMemberLong))
isMemb = anntFrm.pert_id.isin(cliqMemb)
isCp = anntFrm.pert_type == 'trt_cp'
# compounds (trt_cp) that do not belong to any clique
isNonMembCp = isCp & ~isMemb
nonMemb = anntFrm[isNonMembCp].index.values
nonMid = anntFrm[isNonMembCp].pert_id.values

#########################################
### load sig_cliquescore_tool results ###
import matplotlib from cmap.analytics.pert_explorer import PertExplorer from cmap.analytics.cluster import HClust import cmap.io.gmt as gmt from matplotlib import cm from cmap.analytics.queryer import Queryer import cmap.analytics.gppa as gppa from cmap.io import queryresult wkdir = '/xchip/cogs/projects/target_id/pathway_clustering/lfcg_gppa_KD_spearman' if not os.path.exists(wkdir): os.makedirs(wkdir) #pathway annotations from reactome pathGMT = '/xchip/cogs/projects/target_id/KD_pathway_clustering/ReactomePathways.gmt' gmtDict = gmt.read(pathGMT) pathwayDict = {} for dict1 in gmtDict: pathwayDict[dict1['id']] = dict1['sig'] # aprioriPathways = ['Cholesterol biosynthesis', # 'p53-Dependent G1 DNA Damage Response', # 'p53-Dependent G1/S DNA damage checkpoint', # 'Antigen processing: Ubiquitination & Proteasome degradation', # 'Regulation of activated PAK-2p34 by proteasome mediated degradation', # 'Signaling by TGF-beta Receptor Complex', # 'mTOR signalling'] # Lessons from the cancer genome pathways: lfcgPathways = ['p38MAPK events', 'ERK/MAPK targets',