def _get_one_drug_data(self, name, randomize_Y=False): """Returns X and Y for a given drug, dropping NAs :param name: drug name :param randomize_Y: randomize Y - drops NA - drops TISSUE_FACTOR - drops MSI factor """ Y = self.ic50.df[name] Y.dropna(inplace=True) X = self.features.df.ix[Y.index].copy() try: X = X.drop('TISSUE_FACTOR', axis=1) except: pass try: X = X.drop('MSI_FACTOR', axis=1) except: pass if self.scale is True: columns = X.columns # cast is essential here otherwise ValueError is raised X = preprocessing.scale(X.astype(float)) X = pd.DataFrame(X, columns=columns) if randomize_Y: Y = Y.copy() pylab.shuffle(Y.values) return X, Y
def _get_one_drug_data(self, name, randomize_Y=False): """Returns X and Y for a given drug, dropping NA :param name: drug name :param randomize_Y: randomize Y - drops NA - drops TISSUE_FACTOR - drops MSI factor """ Y = self.ic50.df[name] Y.dropna(inplace=True) X = self.features.df.ix[Y.index].copy() try:X = X.drop('TISSUE_FACTOR', axis=1) except:pass try: X = X.drop('MSI_FACTOR', axis=1) except:pass if self.scale is True: columns = X.columns # cast is essential here otherwise ValueError is raised X = preprocessing.scale(X.astype(float)) X = pd.DataFrame(X, columns=columns) if randomize_Y: Y = Y.copy() pylab.shuffle(Y.values) return X, Y
def selector(self, df, Nbest=1000, Nrandom=1000, inplace=False):
    """Select only the first N best rows and N random ones

    Sometimes, there are tens of thousands of associations and future
    analysis will include more features and drugs. Plotting volcano plots
    should therefore be fast and scalable. Here, we provide a naive way of
    speeding up the plotting by selecting only a subset of the data made
    of Nbest+Nrandom associations.

    Rows are taken by position: the first *Nbest* rows of *df* are kept
    as they come (presumably *df* is already sorted by significance --
    this method does not sort), followed by up to *Nrandom* rows drawn at
    random from the remainder.

    :param df: the input dataframe with ANOVAResults
    :param int Nbest: how many of the most significant association
        should be kept
    :param int Nrandom: on top of the Nbest significant association,
        set how many other randomly chosen associations are to be kept.
    :param bool inplace: if True, the pruned dataframe is stored in
        ``self.df`` and nothing is returned
    :return: pruned dataframe (None when *inplace* is True). If *df* has
        fewer than *Nbest* rows it is returned unchanged, whatever the
        value of *inplace*.
    """
    N = len(df)
    if N < Nbest:
        return df

    positions = list(range(Nbest))
    if N > Nbest:
        # lists (not range objects) are needed: shuffle works in place and
        # the two parts are concatenated
        remainder = list(range(Nbest, N))
        pylab.shuffle(remainder)
        n2pick = min(N, Nbest + Nrandom) - Nbest
        positions += remainder[:n2pick]

    # positional selection, so labels need not be ordered nor unique
    df = df.iloc[positions]

    if inplace is True:
        self.df = df
    else:
        return df
def aggregate_submissions_random(self, N=10):
    """Aggregate N submissions picked at random

    The row positions of ``self.df`` are shuffled and the first N of them
    are handed to :meth:`_aggregate`, which must be defined in the child
    class.

    :param int N: number of submissions to pick
    :return: whatever :meth:`_aggregate` returns for the picked positions
    """
    # shuffle every row position, then keep the first N
    positions = list(range(len(self.df.index)))
    pylab.shuffle(positions)
    return self._aggregate(positions[:N])
def aggregate_submissions_random(self, N=10):
    """Aggregate N submissions picked at random

    The row positions of ``self.df`` are shuffled and the first N of them
    are handed to :meth:`_aggregate`, which must be defined in the child
    class.

    :param int N: number of submissions to pick
    :return: whatever :meth:`_aggregate` returns for the picked positions
    """
    # select N random submissions; a real list is required because
    # shuffle works in place and a range object cannot be modified
    indices = list(range(len(self.df.index)))
    pylab.shuffle(indices)
    indices = indices[:N]

    return self._aggregate(indices)
def start_trial(self):
    """Start a trial: draw a random block sequence and launch the timer
    that shows it.

    The only funky thing going on here is that we need to call
    self.block_sequence() to obtain a generator.
    """
    self.in_trial = True
    self.current_block = 0
    self.showing_sequence = True

    # we work on a 5,5 square: shuffle its cell numbers and keep the
    # first `span` of them
    span = self.span
    cells = pylab.arange(5 * 5)
    pylab.shuffle(cells)
    self.this_sequence = cells[:span]  # Sequence of blocks

    self.bs_gen = self.block_sequence()  # Need to convert it into a generator
    self.block_h = []

    # the canvas timer calls draw_sequence_frame on every tick
    self.event_source = timer = self.fig.canvas.new_timer(interval=1000)
    timer.add_callback(self.draw_sequence_frame)
    self.clear_main_screen()
    self.event_source.start()
def start_trial(self):
    """Set up the state of a new trial and start displaying its sequence.

    The only funky thing going on here is that we need to call
    self.block_sequence() to obtain a generator.
    """
    # trial state
    self.in_trial, self.current_block, self.showing_sequence = True, 0, True

    # Sequence of blocks: the first `span` entries of a shuffled numbering
    # of the cells (we work on a 5,5 square)
    n_blocks = self.span
    shuffled_cells = pylab.arange(5 * 5)
    pylab.shuffle(shuffled_cells)
    self.this_sequence = shuffled_cells[:n_blocks]

    # Need to convert it into a generator
    self.bs_gen = self.block_sequence()
    self.block_h = []

    # timer that calls draw_sequence_frame on each tick
    self.event_source = self.fig.canvas.new_timer(interval=1000)
    self.event_source.add_callback(self.draw_sequence_frame)

    self.clear_main_screen()
    self.event_source.start()
def anova_all(self, animate=True, drugs=None):
    """Run all ANOVA tests for all drugs and all features.

    :param drugs: you may select a subset of drugs
    :param animate: shows the progress bar
    :return: an :class:`~gdsctools.anova_results.ANOVAResults`
        instance with the dataframe stored in an attribute
        called **df**

    Loops over all drugs calling :meth:`anova_one_drug` for each
    drug and concatenating all results together. Results are cached in
    ``self.individual_anova``, so drugs already present there are not
    recomputed. Once all data are gathered and
    ``settings.pvalue_correction_level`` is 'global', an extra column
    containing the FDR corrections is added to the dataframe using
    :meth:`add_pvalues_correction` method. An extra column named
    "ASSOC_ID" is also added with a unique identifier numbered in order
    of ascending ``ANOVA_FEATURE_pval``.

    .. note:: A thorough comparison with version v17 give the same FDR
        results (difference ~1e-6); Note however that the qvalue results
        differ by about 0.3% due to different smoothing in R and Python.
    """
    # drop DRUG where number of IC50 (non-null) is below
    # settings.minimum_nonna_ic50. axis=0 is default but we emphasize
    # that the sum is over columns (i.e. drugs)
    vv = self.ic50.df.notnull().sum(axis=0)
    # FIXME: should be in one_drug_one_feature ??
    drug_names = vv.index[vv >= self.settings.minimum_nonna_ic50]

    # if user provided a list of drugs, use them:
    if drugs is not None:
        # todo: check validity of the drug names
        drug_names = drugs[:]

    pb = Progress(len(drug_names), 1)
    drug_names = list(drug_names)
    pylab.shuffle(drug_names)
    if animate is True:
        pb.animate(0)

    for i, drug_name in enumerate(drug_names):
        # drugs analysed in a previous call are kept as they are
        if drug_name not in self.individual_anova:
            res = self.anova_one_drug(drug_name, animate=False,
                                      output='dataframe')
            self.individual_anova[drug_name] = res
        if animate is True:
            pb.animate(i + 1)
    print("\n")

    if len(self.individual_anova) == 0:
        return ANOVAResults()

    df = pd.concat(self.individual_anova, ignore_index=True)

    if len(df) == 0:
        # NOTE(review): a bare dataframe is returned here whereas every
        # other path returns an ANOVAResults -- confirm what callers expect
        return df

    # sort all data by ANOVA p-values
    try:
        df.sort_values('ANOVA_FEATURE_pval', inplace=True)
    except AttributeError:
        # old pandas versions only provide DataFrame.sort
        df.sort('ANOVA_FEATURE_pval', inplace=True)

    # all ANOVA have been computed individually for each drug and each
    # feature. Now, we need to compute the multiple testing corrections
    if self.settings.pvalue_correction_level == 'global':
        df = self.add_pvalues_correction(df)

    # insert a unique identifier as first column
    df.insert(0, 'ASSOC_ID', list(range(1, len(df) + 1)))

    self.df = df

    # order the column names as defined in the __init__ method; the
    # returned copy gets a fresh 0..n-1 index while self.df keeps its own
    df = df[self.column_names].reset_index(drop=True)

    results = ANOVAResults()
    results.df = df
    results.settings = ANOVASettings(**self.settings)
    return results
for j in range(L): l[Grid[i, j] - 1] += 1 return l #core program L = 400 r = rand(L, L) p = 0.59275 z = r < p lw, num = measurements.label(z) c = np.arange(lw.max() + 1) # shuffle(c); # (optional) count = ClusterCounter(lw, L, num) #plotting tools #plot size distribution s = np.linspace(0, 100) plt.plot(s, 2500 * 1 / (s**(187 / 91))) plt.hist(count, bins=50, range=(0, 100)) plt.show() #plot field a = np.copy(c) shuffle(a) fig = plt.figure(figsize=(5, 5)) ax = fig.add_subplot(111) im = ax.imshow(c[lw], cmap=plt.cm.jet, interpolation='none') ax.set_axis_off() plt.show()
### reading catalogs eagle_dir = '../data/eagle_simulations' cats = [ mypy.readcat('%s/snap%02i.dat' % (eagle_dir, snp.id)) for snp in snapshots ] ### reading overdensity maps vmaps = [ fits.getdata('%s/medVoronoi.overdens.%02i.fits' % (eagle_dir, snp.id)) for snp in snapshots ] ### indices of galaxies to track inds0 = pylab.find((cats[0].stellarMass)) pylab.shuffle(inds0) x_all = pylab.zeros((len(inds0), len(snapshots))) y_all = pylab.zeros((len(inds0), len(snapshots))) x_min = pylab.zeros(len(inds0)) y_min = pylab.zeros(len(inds0)) x_max = pylab.zeros(len(inds0)) y_max = pylab.zeros(len(inds0)) lmass_all = pylab.zeros((len(inds0), len(snapshots))) overdens_all = pylab.zeros((len(inds0), len(snapshots))) t0 = time.time() for i, ind0 in enumerate(inds0):
def anova_one_drug_one_feature(self, drug_id, feature_name, show=False,
                               production=False, directory='.'):
    """Compute ANOVA and various tests on one drug and one feature

    :param drug_id: a valid drug identifier
    :param feature_name: a valid feature name
    :param bool show: show some plots
    :param str directory: where to save the figure.
    :param bool production: if False, returns a dataframe otherwise
        a dictionary. This is to speed up analysis when scanning
        the drug across all features.
    :return: a one-row dataframe (index 1), or the equivalent dictionary
        when *production* is True
    :raises ValueError: if *drug_id*, *feature_name* or
        ``settings.regression_method`` is not recognised

    The fitted model is stored in ``self.data_lm`` and the ANOVA p-values
    in ``self.anova_pvalues``. In PANCAN mode, ``self.Y`` and ``self.EV``
    hold the response and the design matrix. When ``self.sampling`` is
    set, ``self.samples1/2/3`` and ``self.pvalues_features`` are filled
    as well.

    .. note:: **for developer** this is the core of tha analysis
        and should be kept as fast as possible. 95% of the time is spent
        here.

    .. note:: **for developer** Data used in this function comes from
        _get_one_drug_one_feature_data method, which should also be kept
        as fast as possible.
    """
    if drug_id not in self.drugIds:
        raise ValueError('Unknown drug name %s. Use e.g., %s'
                         % (drug_id, self.drugIds[0]))

    if feature_name not in self.feature_names:
        # the first three feature names are shown as examples
        raise ValueError('Unknown feature name %s. Use e.g. one of %s'
                         % (feature_name, self.feature_names[0:3]))

    # This extract the relevant data and some simple metrics
    # This is now pretty fast accounting for 45 seconds
    # for 265 drugs and 988 features
    odof = self._get_one_drug_one_feature_data(drug_id, feature_name)

    drug_name = self.drug_decode.get_name(drug_id)
    drug_target = self.drug_decode.get_target(drug_id)

    # if the status is False, it means the number of data points
    # in a category (e.g., positive feature) is too low.
    # If so, nothing to do, we return an 'empty' dictionary
    if odof.status is False:
        results = self._odof_dict.copy()
        results['FEATURE'] = feature_name
        results['DRUG_ID'] = drug_id
        results['DRUG_NAME'] = drug_name
        results['DRUG_TARGET'] = drug_target
        results['N_FEATURE_pos'] = odof.Npos
        results['N_FEATURE_neg'] = odof.Nneg
        if production is True:
            # return a dict
            return results
        # or a dataframe; note that index is not relevant here but
        # required.
        return pd.DataFrame(results, index=[1])

    # with the data extract, we can now compute the regression.
    # In R or statsmodels, the regression code is simple since
    # it is based on the formula notation (Y~C(msi)+feature)
    # This is also possible in statsmodels library, however,
    # this relies on patsy, which is very slow as compared to the
    # statsmodels without formula.
    #
    # IMPORTANT: the order of the factors in the formula
    # is important. It does not change the total sum of square errors
    # but may change individual effects of the categorical
    # components.
    #
    # Instead of using ols function, we use the OLS one so we cannot
    # use formula. Instead, we need to create manually the input
    # data. In the case of categorical data (tissue), we need to
    # create the dummy variable, which is done in the constructor
    # once for all (slow otherwise).
    if self.settings.analysis_type == 'PANCAN':
        # IMPORTANT: tissues are sorted alphabetically in R aov
        # function. Same in statsmodels but capitalised names
        # are sorted differently. In R, a<b<B<c but in Python,
        # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
        # not in python. Since in a linear regression
        # models, the order of the factor matters and the first
        # factor is used as a reference, we decided to use same
        # convention as in R.
        # see http://statsmodels.sourceforge.net/devel/contrasts.html
        # for a good explanation

        # We could use pd.get_dummies but pretty slow
        # instead we create the full matrix in init() method.
        # One issue is that some columns end up with sum == 0
        # and needs to be dropped.
        # (.loc replaces the removed .ix indexer)
        df = self._tissue_dummies.loc[odof.masked_tissue.index]
        todrop = df.columns[df.values.sum(axis=0) == 0]
        if len(todrop) > 0:  # use if since drop() is slow
            df = df.drop(todrop, axis=1)

        # Here we set other variables with dataframe columns' names as
        # expected by OLS.
        if self.settings.include_media_factor == False:
            todrop = [x for x in df.columns if x.startswith('C(media)')]
            df = df.drop(todrop, axis=1)
        df['C(msi)[T.1]'] = odof.masked_msi.values
        df['feature'] = odof.masked_features.values

        self.Y = odof.Y
        self.EV = df.values
    else:
        # Intercept, optional MSI factor, then the feature
        df = pd.DataFrame()
        if self.settings.include_MSI_factor is True:
            df['C(msi)[T.1]'] = odof.masked_msi.values
        df['feature'] = odof.masked_features.values
        df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))

    # The regression itself
    method = self.settings.regression_method
    model = OLS(odof.Y, df.values)
    if method == 'OLS':
        self.data_lm = model.fit()
    else:
        if method == 'ElasticNet':
            L1_wt = self.settings.regression_L1_wt
        elif method == 'Ridge':
            L1_wt = 0
        elif method == 'Lasso':
            L1_wt = 1
        else:
            # without this, a stale self.data_lm from a previous call
            # would be used silently
            raise ValueError('Unknown regression method %s' % method)
        self.data_lm = model.fit_regularized(
            alpha=self.settings.regression_alpha, L1_wt=L1_wt)

    # str() so that non-string drug identifiers can be used as well
    key = str(drug_id) + "__" + feature_name
    if self.sampling and key not in self.pvalues_features:
        # This can be computed for a drug once for all
        # no need to redo it for each feature ?
        # If the length of Y is too small (e.g., < 20) the results may
        # not be great. This can be checked with the errors
        self.samples1 = []
        self.samples2 = []
        self.samples3 = []
        Y = odof.Y.copy()
        N = self.sampling
        # To get the random distribution, shuffle Y
        # and noise not required
        # To get the noise effects, do not shuffle and set noise to
        # something different from 0
        noise = 0.0
        for i in range(0, N):
            pylab.shuffle(Y)
            data_lm = OLS(Y + noise * pylab.randn(len(Y)),
                          df.values).fit()
            anova_pvalues = self._get_anova_summary(data_lm,
                                                    output='dict')
            # msi and tissue are only present for some models
            if 'msi' in anova_pvalues:
                self.samples1.append(anova_pvalues['msi'])
            self.samples2.append(anova_pvalues['feature'])
            if 'tissue' in anova_pvalues:
                self.samples3.append(anova_pvalues['tissue'])

        # fit a distribution on -log10 of the sampled feature p-values
        import fitter
        ff = fitter.Fitter(-pylab.log10(self.samples2))
        dist = "genexpon"
        ff.distributions = [dist]
        ff.fit()
        self.pvalues_features[key] = {
            'error': ff.df_errors.loc[dist].values[0],
            'params': ff.fitted_param[dist],
            'feature': feature_name,
            'N': len(Y)
        }
        print(self.pvalues_features[key])

    self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                 output='dict')

    # Store the pvalues. Note that some may be missing, in which case
    # None is stored
    tissue_PVAL = self.anova_pvalues.get('tissue')
    MSI_PVAL = self.anova_pvalues.get('msi')
    FEATURE_PVAL = self.anova_pvalues.get('feature')
    MEDIA_PVAL = self.anova_pvalues.get('media')

    if show is True:
        boxplot = BoxPlots(odof, savefig=self.settings.savefig,
                           directory=directory)
        boxplot.boxplot_association(fignum=1)

        # a boxplot to show cell lines effects. This requires
        # the settings.analyse_type to be PANCAN
        if self.settings.analysis_type == 'PANCAN':
            boxplot.boxplot_pancan(fignum=2, mode='tissue')
        if self.settings.include_MSI_factor:
            boxplot.boxplot_pancan(fignum=3, mode='msi')

    results = {
        'FEATURE': feature_name,
        'DRUG_ID': drug_id,
        'DRUG_NAME': drug_name,
        'DRUG_TARGET': drug_target,
        'N_FEATURE_pos': odof.Npos,
        'N_FEATURE_neg': odof.Nneg,
        'FEATURE_pos_logIC50_MEAN': odof.pos_IC50_mean,
        'FEATURE_neg_logIC50_MEAN': odof.neg_IC50_mean,
        'FEATURE_delta_MEAN_IC50': odof.delta_mean_IC50,
        'FEATURE_pos_IC50_sd': odof.pos_IC50_std,
        'FEATURE_neg_IC50_sd': odof.neg_IC50_std,
        'FEATURE_IC50_effect_size': odof.effectsize_ic50,
        'FEATURE_pos_Glass_delta': odof.pos_glass,
        'FEATURE_neg_Glass_delta': odof.neg_glass,
        'ANOVA_FEATURE_pval': FEATURE_PVAL,
        'ANOVA_TISSUE_pval': tissue_PVAL,
        'ANOVA_MSI_pval': MSI_PVAL,
        'ANOVA_MEDIA_pval': MEDIA_PVAL,
        'FEATURE_IC50_T_pval': odof.ttest  # pvalues is in index 1
    }

    # 12% of the time here
    if production is True:
        return results
    return pd.DataFrame(results, index=[1])
def anova_all(self, animate=True, drugs=None):
    """Run all ANOVA tests for all drugs and all features.

    :param drugs: you may select a subset of drugs
    :param animate: shows the progress bar
    :return: an :class:`~gdsctools.anova_results.ANOVAResults`
        instance with the dataframe stored in an attribute
        called **df**

    Loops over all drugs calling :meth:`anova_one_drug` for each
    drug and concatenating all results together. Results are cached in
    ``self.individual_anova``, so drugs already present there are not
    recomputed. Once all data are gathered and
    ``settings.pvalue_correction_level`` is 'global', an extra column
    containing the FDR corrections is added to the dataframe using
    :meth:`add_pvalues_correction` method. An extra column named
    "ASSOC_ID" is also added with a unique identifier numbered in order
    of ascending ``ANOVA_FEATURE_pval``.

    .. note:: A thorough comparison with version v17 give the same FDR
        results (difference ~1e-6); Note however that the qvalue results
        differ by about 0.3% due to different smoothing in R and Python.
    """
    # drop DRUG where number of IC50 (non-null) is below
    # settings.minimum_nonna_ic50. axis=0 is default but we emphasize
    # that the sum is over columns (i.e. drugs)
    vv = self.ic50.df.notnull().sum(axis=0)
    # FIXME: should be in one_drug_one_feature ??
    drug_names = vv.index[vv >= self.settings.minimum_nonna_ic50]

    # if user provided a list of drugs, use them:
    if drugs is not None:
        # todo: check validity of the drug names
        drug_names = drugs[:]

    pb = Progress(len(drug_names), 1)
    drug_names = list(drug_names)
    pylab.shuffle(drug_names)
    if animate is True:
        pb.animate(0)

    for i, drug_name in enumerate(drug_names):
        # drugs analysed in a previous call are kept as they are
        if drug_name not in self.individual_anova:
            res = self.anova_one_drug(drug_name, animate=False,
                                      output='dataframe')
            self.individual_anova[drug_name] = res
        if animate is True:
            pb.animate(i + 1)
    print("\n")

    if len(self.individual_anova) == 0:
        return ANOVAResults()

    df = pd.concat(self.individual_anova, ignore_index=True)

    if len(df) == 0:
        # NOTE(review): a bare dataframe is returned here whereas every
        # other path returns an ANOVAResults -- confirm what callers expect
        return df

    # sort all data by ANOVA p-values
    try:
        df.sort_values('ANOVA_FEATURE_pval', inplace=True)
    except AttributeError:
        # old pandas versions only provide DataFrame.sort
        df.sort('ANOVA_FEATURE_pval', inplace=True)

    # all ANOVA have been computed individually for each drug and each
    # feature. Now, we need to compute the multiple testing corrections
    if self.settings.pvalue_correction_level == 'global':
        df = self.add_pvalues_correction(df)

    # insert a unique identifier as first column
    df.insert(0, 'ASSOC_ID', list(range(1, len(df) + 1)))

    self.df = df

    # order the column names as defined in the __init__ method; the
    # returned copy gets a fresh 0..n-1 index while self.df keeps its own
    df = df[self.column_names].reset_index(drop=True)

    results = ANOVAResults()
    results.df = df
    results.settings = ANOVASettings(**self.settings)
    return results
def mc_trajectory(self, distribution):
    """Draw a shuffled trajectory of ``self.frames`` samples.

    A multinomial draw over *distribution* gives one count per outcome;
    ``self.mc_counts2samples`` turns these counts into samples, which
    are then shuffled in place.

    :param distribution: probabilities handed to ``multinomial``
    :return: array of samples in random order
    """
    counts = multinomial(self.frames, distribution)
    drawn = tuple(self.mc_counts2samples(counts))
    trajectory = array(drawn)
    shuffle(trajectory)
    return trajectory
import pylab as pyl

try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3

# NOTE: unpickling can run arbitrary code -- only load a trusted file
with open('./galaxies.pickle', 'rb') as handle:
    galaxies = pickle.load(handle)

# keep galaxies with ston_I above 30; a list (not a lazy filter object)
# is needed because the selection is iterated twice below
galaxies = [galaxy for galaxy in galaxies if galaxy.ston_I > 30.]

mergers = pyl.asarray([galaxy.Merger for galaxy in galaxies])
icd = pyl.asarray([galaxy.ICD_IH * 100 for galaxy in galaxies])
stack = pyl.column_stack((mergers, icd))

# fraction of mergers among the high-ICD galaxies, for each of the
# 10000 random re-assignments of the icd values
result = []
for _ in range(10000):
    # shuffle the icd values (in place, second column only)
    pyl.shuffle(stack[:, 1])

    # get the high ICD ones
    gt = pyl.where(stack[:, 1] > 20)

    # are they mergers?
    y = stack[:, 0][gt]

    # how many mergers?
    m = len(y.nonzero()[0])

    # what percentage?
    # NOTE: raises ZeroDivisionError when no icd value exceeds 20
    per = m / float(len(gt[0]))

    # save that percentage
    result.append(per)
def anova_one_drug_one_feature(self, drug_id, feature_name, show=False,
                               production=False, directory='.',
                               fontsize=18):
    """Compute ANOVA at the one drug and one feature level

    :param drug_id: a valid drug identifier
    :param feature_name: a valid feature name
    :param bool show: show boxplots with the different factor used
    :param str directory: where to save the figure.
    :param bool production: if False, returns a dataframe otherwise
        a dictionary. This is to speed up analysis when scanning
        the drug across all features.
    :param fontsize: passed on to the boxplots
    :return: a one-row dataframe (index 1) in which None values are
        replaced by NaN, or a dictionary when *production* is True
    :raises ValueError: if *drug_id* or *feature_name* is not recognised

    The ANOVA p-values are stored in ``self.anova_pvalues``; except when
    a custom regression formula is set, the fitted model is stored in
    ``self.data_lm``. When ``self.sampling`` is set,
    ``self.samples1/2/3`` and ``self.pvalues_features`` are filled too.

    .. note:: **for developer** this is the core of the analysis
        and should be kept as fast as possible. 95% of the time is spent
        here.

    .. note:: **for developer** Data used in this function comes from
        _get_one_drug_one_feature_data method, which should also be kept
        as fast as possible.
    """
    if drug_id not in self.drugIds:
        raise ValueError('Unknown drug name %s. Use e.g., %s'
                         % (drug_id, self.drugIds[0]))

    if feature_name not in self.feature_names:
        # the first three feature names are shown as examples
        raise ValueError('Unknown feature name %s. Use e.g. one of %s'
                         % (feature_name, self.feature_names[0:3]))

    def as_dataframe(res):
        # with newer version of pandas (v0.19), None are not accepted
        # anymore; they are replaced by NaN (in place)
        for k in res:
            if res[k] is None:
                res[k] = np.nan
        return pd.DataFrame(res, index=[1])

    # This extract the relevant data and some simple metrics
    # This is now pretty fast accounting for 45 seconds
    # for 265 drugs and 988 features
    odof = self._get_one_drug_one_feature_data(drug_id, feature_name)

    # if the status is False, it means the number of data points
    # in a category (e.g., positive feature) is too low.
    # If so, nothing to do, we return an 'empty' dictionary
    if odof.status is False:
        results = self._odof_dict.copy()
        results['FEATURE'] = feature_name
        results['DRUG_ID'] = odof.drug_id
        results['DRUG_NAME'] = odof.drug_name
        results['DRUG_TARGET'] = odof.drug_target
        results['N_FEATURE_pos'] = odof.Npos
        results['N_FEATURE_neg'] = odof.Nneg
        if production is True:
            # return a dict
            return results
        return as_dataframe(results)

    # IMPORTANT: the order of the factors in the formula
    # is important. It does not change the total sum of square errors
    # but may change individual effects of the categorical components.

    # If a formula is provided, use statsmodels. Since it is slowish,
    # we implemented several cases as described in the doc for the 4
    # following cases:
    # - TISSUE + MSI +MEDIA + FEATURE
    # - TISSUE + MSI + FEATURE
    # - MSI + FEATURE
    # - FEATURE
    if self.settings.regression_formula not in ["auto", None, ""]:
        # This populates the anova_pvalues attribute itself
        self.anova_one_drug_one_feature_custom(
            drug_id, feature_name,
            formula=self.settings.regression_formula, odof=odof)
    else:
        if self.settings.analysis_type == 'PANCAN':
            # IMPORTANT: tissues are sorted alphabetically in R aov
            # function. Same in statsmodels but capitalised names
            # are sorted differently. In R, a<b<B<c but in Python,
            # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
            # not in python. Since in a linear regression
            # models, the order of the factor matters and the first
            # factor is used as a reference, we decided to use same
            # convention as in R.
            # see http://statsmodels.sourceforge.net/devel/contrasts.html
            # for a good explanation

            # We could use pd.get_dummies but pretty slow
            # instead we create the full matrix in init() method.
            # One issue is that some columns end up with sum == 0
            # and needs to be dropped.
            df = self._tissue_dummies.loc[odof.masked_tissue.index]
            todrop = df.columns[df.values.sum(axis=0) == 0]
            if len(todrop) > 0:  # use if since drop() is slow
                df = df.drop(todrop, axis=1)

            # the first tissue column is dropped for the regression
            tissues = [x for x in df.columns if x.startswith('C(tissue')]
            df = df.drop(tissues[0], axis=1)

            # Here we set other variables with dataframe columns' names
            # as expected by OLS.
            if self.settings.include_media_factor == False:
                # make sure the media factor is not included
                todrop = [x for x in df.columns
                          if x.startswith('C(media)')]
                df = df.drop(todrop, axis=1)
            else:
                # drop the first one for the regression
                medias = [x for x in df.columns
                          if x.startswith('C(media')]
                if len(medias):
                    df = df.drop(medias[0], axis=1)

            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features
        else:
            # one row per regressor (a row of ones comes first), then
            # transposed to one column per regressor
            n_points = odof.Npos + odof.Nneg
            if self.settings.include_MSI_factor is True:
                exog = np.ones((3, n_points))
                exog[1] = odof.masked_msi.values
                exog[2] = odof.masked_features
            else:
                exog = np.ones((2, n_points))
                exog[1] = odof.masked_features
            df = DummyDF()
            df.values = exog.T

        # The regression itself
        self.data_lm = OLS(odof.Y, df.values).fit()
        # The ANOVA itself
        self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                     odof=odof)

    results = self._set_odof_results(self.anova_pvalues, odof)

    key = str(drug_id) + "__" + feature_name
    if self.sampling and key not in self.pvalues_features:
        # NOTE(review): df is not defined when a custom regression
        # formula is used, so sampling looks unsupported in that case --
        # confirm
        # This can be computed for a drug once for all
        # no need to redo it for each feature ?
        # If the length of Y is too small (e.g., < 20) the results may
        # not be great. This can be checked with the errors
        self.samples1 = []
        self.samples2 = []
        self.samples3 = []
        Y = odof.Y.copy()
        N = self.sampling
        # To get the random distribution, shuffle Y
        # and noise not required
        # To get the noise effects, do not shuffle and set noise to
        # something different from 0
        noise = 0.0
        for i in range(0, N):
            pylab.shuffle(Y)
            data_lm = OLS(Y + noise * pylab.randn(len(Y)),
                          df.values).fit()
            anova_pvalues = self._get_anova_summary(data_lm,
                                                    output='dict',
                                                    odof=odof)
            # msi and tissue are only present for some models
            if 'msi' in anova_pvalues:
                self.samples1.append(anova_pvalues['msi'])
            self.samples2.append(anova_pvalues['feature'])
            if 'tissue' in anova_pvalues:
                self.samples3.append(anova_pvalues['tissue'])

        # fit a distribution on -log10 of the sampled feature p-values
        import fitter
        ff = fitter.Fitter(-pylab.log10(self.samples2))
        dist = "genexpon"
        ff.distributions = [dist]
        ff.fit()
        self.pvalues_features[key] = {
            'error': ff.df_errors.loc[dist].values[0],
            'params': ff.fitted_param[dist],
            'feature': feature_name,
            'N': len(Y)
        }

    if show is True:
        boxplot = BoxPlots(odof, savefig=self.settings.savefig,
                           directory=directory, fontsize=fontsize)
        boxplot.boxplot_association(fignum=1)

        # a boxplot to show cell lines effects. This requires
        # the settings.analyse_type to be PANCAN
        if self.settings.analysis_type == 'PANCAN':
            boxplot.boxplot_pancan(fignum=2, mode='tissue')
        if self.settings.include_MSI_factor:
            boxplot.boxplot_pancan(fignum=3, mode='msi')
        if self.settings.include_media_factor:
            # NOTE(review): same figure number as the msi plot above --
            # confirm this is intended
            boxplot.boxplot_pancan(fignum=3, mode='media')

    # about 30% of the time spent in creating the DataFrame...
    if production is True:
        return results
    return as_dataframe(results)
def mc_trajectory(self, distribution):
    """Return ``self.frames`` samples drawn from *distribution*, in
    random order.

    The per-outcome counts come from a multinomial draw; they are
    expanded into samples by ``self.mc_counts2samples`` and the resulting
    array is shuffled in place before being returned.

    :param distribution: probabilities handed to ``multinomial``
    :return: shuffled array of samples
    """
    # expand the multinomial counts into an array of samples
    expanded = self.mc_counts2samples(multinomial(self.frames, distribution))
    samples = array(tuple(expanded))

    # randomise the order of the samples
    shuffle(samples)
    return samples
def anova_one_drug_one_feature(self, drug_id, feature_name, show=False,
                               production=False, directory='.'):
    """Compute ANOVA and various tests on one drug and one feature

    :param drug_id: a valid drug identifier
    :param feature_name: a valid feature name
    :param bool show: show some plots
    :param str directory: where to save the figure.
    :param bool production: if False, returns a dataframe otherwise
        a dictionary. This is to speed up analysis when scanning
        the drug across all features.
    :return: a dictionary of results if *production* is True, otherwise
        a one-row dataframe (index ``[1]``) built from that dictionary.
    :raises ValueError: if *drug_id* or *feature_name* is unknown, or if
        ``settings.regression_method`` is not one of ``OLS``,
        ``ElasticNet``, ``Ridge`` or ``Lasso``.

    As side effects, this method sets :attr:`data_lm` and
    :attr:`anova_pvalues` (and :attr:`Y` / :attr:`EV` in PANCAN mode).
    If :attr:`sampling` is set, it also fills :attr:`samples1`,
    :attr:`samples2`, :attr:`samples3` and :attr:`pvalues_features`.

    .. note:: **for developer** this is the core of the analysis and
        should be kept as fast as possible. 95% of the time is spent
        here.

    .. note:: **for developer** Data used in this function comes from
        _get_one_drug_one_feature_data method, which should also be
        kept as fast as possible.
    """
    if drug_id not in self.drugIds:
        raise ValueError('Unknown drug name %s. Use e.g., %s'
                         % (drug_id, self.drugIds[0]))

    if feature_name not in self.feature_names:
        # the first three feature names are shown as examples
        raise ValueError('Unknown feature name %s. Use e.g. one of %s'
                         % (feature_name, self.feature_names[0:3]))

    # This extracts the relevant data and some simple metrics
    # This is now pretty fast accounting for 45 seconds
    # for 265 drugs and 988 features
    odof = self._get_one_drug_one_feature_data(drug_id, feature_name)

    drug_name = self.drug_decode.get_name(drug_id)
    drug_target = self.drug_decode.get_target(drug_id)

    # if the status is False, it means the number of data points
    # in a category (e.g., positive feature) is too low.
    # If so, nothing to do, we return an 'empty' dictionary
    if odof.status is False:
        results = self._odof_dict.copy()
        results['FEATURE'] = feature_name
        results['DRUG_ID'] = drug_id
        results['DRUG_NAME'] = drug_name
        results['DRUG_TARGET'] = drug_target
        results['N_FEATURE_pos'] = odof.Npos
        results['N_FEATURE_neg'] = odof.Nneg
        if production is True:
            # return a dict
            return results
        # or a dataframe; note that index is not relevant here but
        # required.
        return pd.DataFrame(results, index=[1])

    # With the data extracted, we can now compute the regression.
    # In R or statsmodels, the regression code is simple since
    # it is based on the formula notation (Y~C(msi)+feature).
    # This is also possible in statsmodels, however, it relies on
    # patsy, which is very slow as compared to statsmodels without
    # formula. Instead of the ols function, we therefore use OLS and
    # create the input data manually. In the case of categorical data
    # (tissue), the dummy variables are created once for all in the
    # constructor (slow otherwise).
    #
    # IMPORTANT: the order of the factors in the formula
    # is important. It does not change the total sum of square errors
    # but may change individual effects of the categorical
    # components.
    if self.settings.analysis_type == 'PANCAN':
        # IMPORTANT: tissues are sorted alphabetically in R aov
        # function. Same in statsmodels but capitalised names
        # are sorted differently. In R, a<b<B<c but in Python,
        # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
        # not in python. Since in a linear regression
        # models, the order of the factor matters and the first
        # factor is used as a reference, we decided to use same
        # convention as in R.
        # see http://statsmodels.sourceforge.net/devel/contrasts.html
        # for a good explanation

        # We could use pd.get_dummies but pretty slow
        # instead we create the full matrix in init() method.
        # One issue is that some columns end up with sum == 0
        # and needs to be dropped.
        # (.loc replaces the .ix indexer, removed from pandas 1.0)
        df = self._tissue_dummies.loc[odof.masked_tissue.index]
        todrop = df.columns[df.values.sum(axis=0) == 0]
        if len(todrop) > 0:  # use if since drop() is slow
            df = df.drop(todrop, axis=1)

        # Here we set other variables with dataframe columns' names as
        # expected by OLS.
        if self.settings.include_media_factor == False:
            todrop = [x for x in df.columns if x.startswith('C(media)')]
            df = df.drop(todrop, axis=1)

        df['C(msi)[T.1]'] = odof.masked_msi.values
        df['feature'] = odof.masked_features.values

        self.Y = odof.Y
        self.EV = df.values
    elif self.settings.include_MSI_factor is True:
        # equivalent to the formula 'Y ~ C(msi) + feature'
        df = pd.DataFrame()
        df['C(msi)[T.1]'] = odof.masked_msi.values
        df['feature'] = odof.masked_features.values
        df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))
    else:
        # equivalent to the formula 'Y ~ feature'
        df = pd.DataFrame()
        df['feature'] = odof.masked_features.values
        df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))

    # The regression itself
    method = self.settings.regression_method
    if method == 'ElasticNet':
        self.data_lm = OLS(odof.Y, df.values).fit_regularized(
            alpha=self.settings.regression_alpha,
            L1_wt=self.settings.regression_L1_wt)
    elif method == 'OLS':
        self.data_lm = OLS(odof.Y, df.values).fit()
    elif method == 'Ridge':
        self.data_lm = OLS(odof.Y, df.values).fit_regularized(
            alpha=self.settings.regression_alpha, L1_wt=0)
    elif method == 'Lasso':
        self.data_lm = OLS(odof.Y, df.values).fit_regularized(
            alpha=self.settings.regression_alpha, L1_wt=1)
    else:
        # otherwise a model fitted by a previous call (if any) would
        # silently be reused below
        raise ValueError('Unknown regression method %s. Use one of '
                         'OLS, ElasticNet, Ridge or Lasso' % method)

    # str() so that integer identifiers are accepted as well
    key = str(drug_id) + "__" + feature_name
    if self.sampling and key not in self.pvalues_features:
        # Compute the p-values ourselves: Y is shuffled N times and
        # the feature p-values obtained are then fitted with fitter.
        # This can be computed for a drug once for all
        # no need to redo it for each feature ?
        # If the length of Y is too small (e.g., < 20) the results may
        # not be great. This can be checked with the errors
        self.samples1 = []
        self.samples2 = []
        self.samples3 = []

        Y = odof.Y.copy()
        N = self.sampling
        # progress bar kept although its animation is disabled for now
        pb = Progress(N, 20)

        # To get the random distribution, shuffle Y
        # and noise not required
        # To get the noise effects, do not shuffle and set noise to
        # something different from 0
        noise = 0.0
        design = df.values  # invariant across the iterations
        for _ in range(0, N):
            pylab.shuffle(Y)
            data_lm = OLS(Y + noise * pylab.randn(len(Y)), design).fit()
            anova_pvalues = self._get_anova_summary(data_lm,
                                                    output='dict')
            # msi and tissue may be missing depending on the settings
            try:
                self.samples1.append(anova_pvalues['msi'])
            except KeyError:
                pass
            self.samples2.append(anova_pvalues['feature'])
            try:
                self.samples3.append(anova_pvalues['tissue'])
            except KeyError:
                pass

        import fitter
        ff = fitter.Fitter(-pylab.log10(self.samples2))
        dist = "genexpon"
        ff.distributions = [dist]
        ff.fit()
        self.pvalues_features[key] = {
            'error': ff.df_errors.loc[dist].values[0],
            'params': ff.fitted_param[dist],
            'feature': feature_name,
            'N': len(Y)
        }
        print(self.pvalues_features[key])

    self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                 output='dict')

    # Store the pvalues. Note that some may be missing, in which case
    # None is stored
    tissue_PVAL = self.anova_pvalues.get('tissue')
    MSI_PVAL = self.anova_pvalues.get('msi')
    FEATURE_PVAL = self.anova_pvalues.get('feature')
    MEDIA_PVAL = self.anova_pvalues.get('media')

    if show is True:
        boxplot = BoxPlots(odof, savefig=self.settings.savefig,
                           directory=directory)
        boxplot.boxplot_association(fignum=1)

        # a boxplot to show cell lines effects. This requires
        # the settings.analysis_type to be PANCAN
        if self.settings.analysis_type == 'PANCAN':
            boxplot.boxplot_pancan(fignum=2, mode='tissue')
        if self.settings.include_MSI_factor:
            boxplot.boxplot_pancan(fignum=3, mode='msi')

    results = {
        'FEATURE': feature_name,
        'DRUG_ID': drug_id,
        'DRUG_NAME': drug_name,
        'DRUG_TARGET': drug_target,
        'N_FEATURE_pos': odof.Npos,
        'N_FEATURE_neg': odof.Nneg,
        'FEATURE_pos_logIC50_MEAN': odof.pos_IC50_mean,
        'FEATURE_neg_logIC50_MEAN': odof.neg_IC50_mean,
        'FEATURE_delta_MEAN_IC50': odof.delta_mean_IC50,
        'FEATURE_pos_IC50_sd': odof.pos_IC50_std,
        'FEATURE_neg_IC50_sd': odof.neg_IC50_std,
        'FEATURE_IC50_effect_size': odof.effectsize_ic50,
        'FEATURE_pos_Glass_delta': odof.pos_glass,
        'FEATURE_neg_Glass_delta': odof.neg_glass,
        'ANOVA_FEATURE_pval': FEATURE_PVAL,
        'ANOVA_TISSUE_pval': tissue_PVAL,
        'ANOVA_MSI_pval': MSI_PVAL,
        'ANOVA_MEDIA_pval': MEDIA_PVAL,
        'FEATURE_IC50_T_pval': odof.ttest  # pvalues is in index 1
    }

    # 12% of the time here
    if production is True:
        return results
    return pd.DataFrame(results, index=[1])
def anova_one_drug_one_feature(self, drug_id, feature_name, show=False,
                               production=False, directory="."):
    """Compute ANOVA and various tests on one drug and one feature

    :param drug_id: a valid drug identifier
    :param feature_name: a valid feature name
    :param bool show: show boxplots with the different factor used
    :param str directory: where to save the figure.
    :param bool production: if False, returns a dataframe otherwise
        a dictionary. This is to speed up analysis when scanning
        the drug across all features.
    :return: a dictionary of results if *production* is True, otherwise
        a one-row dataframe (index ``[1]``) built from that dictionary,
        in which None values are replaced by NaN.
    :raises ValueError: if *drug_id* or *feature_name* is unknown
    :raises NotImplementedError: if :attr:`sampling` is requested while
        a custom ``settings.regression_formula`` is used (no design
        matrix is built in that case).

    As side effects, this method sets :attr:`data_lm` and
    :attr:`anova_pvalues` when no custom formula is used. If
    :attr:`sampling` is set, it also fills :attr:`samples1`,
    :attr:`samples2`, :attr:`samples3` and :attr:`pvalues_features`.

    .. note:: **for developer** this is the core of the analysis and
        should be kept as fast as possible. 95% of the time is spent
        here.

    .. note:: **for developer** Data used in this function comes from
        _get_one_drug_one_feature_data method, which should also be
        kept as fast as possible.
    """
    if drug_id not in self.drugIds:
        raise ValueError("Unknown drug name %s. Use e.g., %s"
                         % (drug_id, self.drugIds[0]))

    if feature_name not in self.feature_names:
        # the first three feature names are shown as examples
        raise ValueError("Unknown feature name %s. Use e.g. one of %s"
                         % (feature_name, self.feature_names[0:3]))

    # This extracts the relevant data and some simple metrics
    # This is now pretty fast accounting for 45 seconds
    # for 265 drugs and 988 features
    odof = self._get_one_drug_one_feature_data(drug_id, feature_name)

    # if the status is False, it means the number of data points
    # in a category (e.g., positive feature) is too low.
    # If so, nothing to do, we return an 'empty' dictionary
    if odof.status is False:
        results = self._odof_dict.copy()
        results["FEATURE"] = feature_name
        results["DRUG_ID"] = odof.drug_id
        results["DRUG_NAME"] = odof.drug_name
        results["DRUG_TARGET"] = odof.drug_target
        results["N_FEATURE_pos"] = odof.Npos
        results["N_FEATURE_neg"] = odof.Nneg
        if production is True:
            # return a dict
            return results
        # with newer version of pandas (v0.19), None are not accepted
        # anymore
        for k in results:
            if results[k] is None:
                results[k] = np.nan
        return pd.DataFrame(results, index=[1])

    # IMPORTANT: the order of the factors in the formula
    # is important. It does not change the total sum of square errors
    # but may change individual effects of the categorical components.

    # If a formula is provided, use statsmodels. Since it is slowish,
    # we implemented several cases as described in the doc for the 4
    # following cases:
    #  - TISSUE + MSI + MEDIA + FEATURE
    #  - TISSUE + MSI + FEATURE
    #  - MSI + FEATURE
    #  - FEATURE
    #
    # df holds the design matrix (through its ``values`` attribute);
    # it remains None when a custom formula is used.
    df = None
    if self.settings.regression_formula not in ["auto", None, ""]:
        # This populates the anova_pvalues attribute itself
        _ = self.anova_one_drug_one_feature_custom(
            drug_id, feature_name,
            formula=self.settings.regression_formula, odof=odof)
    elif self.settings.analysis_type == "PANCAN":
        # IMPORTANT: tissues are sorted alphabetically in R aov
        # function. Same in statsmodels but capitalised names
        # are sorted differently. In R, a<b<B<c but in Python,
        # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
        # not in python. Since in a linear regression
        # models, the order of the factor matters and the first
        # factor is used as a reference, we decided to use same
        # convention as in R.
        # see http://statsmodels.sourceforge.net/devel/contrasts.html
        # for a good explanation

        # We could use pd.get_dummies but pretty slow
        # instead we create the full matrix in init() method.
        # One issue is that some columns end up with sum == 0
        # and needs to be dropped.
        # (.loc replaces the .ix indexer, removed from pandas 1.0)
        df = self._tissue_dummies.loc[odof.masked_tissue.index]
        todrop = df.columns[df.values.sum(axis=0) == 0]
        if len(todrop) > 0:  # use if since drop() is slow
            df = df.drop(todrop, axis=1)

        # drop the first tissue for the regression
        tissues = [x for x in df.columns if x.startswith("C(tissue")]
        df = df.drop(tissues[0], axis=1)

        # Here we set other variables with dataframe columns' names as
        # expected by OLS.
        if self.settings.include_media_factor == False:
            # make sure the media factor is not included
            todrop = [x for x in df.columns if x.startswith("C(media)")]
            df = df.drop(todrop, axis=1)
        else:
            # drop the first one for the regression
            medias = [x for x in df.columns if x.startswith("C(media")]
            if medias:
                df = df.drop(medias[0], axis=1)

        df["C(msi)[T.1]"] = odof.masked_msi.values
        df["feature"] = odof.masked_features
    elif self.settings.include_MSI_factor is True:
        # rows are: intercept, msi, feature; transposed into columns
        df = DummyDF()
        df.values = np.ones((3, odof.Npos + odof.Nneg))
        df.values[1] = odof.masked_msi.values
        df.values[2] = odof.masked_features
        df.values = df.values.T
    else:
        # rows are: intercept, feature; transposed into columns
        df = DummyDF()
        df.values = np.ones((2, odof.Npos + odof.Nneg))
        df.values[1] = odof.masked_features
        df.values = df.values.T

    if df is not None:
        # The regression itself
        self.data_lm = OLS(odof.Y, df.values).fit()
        # The ANOVA itself
        self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                     odof=odof)
    results = self._set_odof_results(self.anova_pvalues, odof)

    key = str(drug_id) + "__" + feature_name
    if self.sampling and key not in self.pvalues_features:
        if df is None:
            # no design matrix is available with a custom formula; be
            # explicit rather than failing on an unbound variable
            raise NotImplementedError(
                "sampling is not supported with a custom regression "
                "formula")

        # This can be computed for a drug once for all
        # no need to redo it for each feature ?
        # If the length of Y is too small (e.g., < 20) the results may
        # not be great. This can be checked with the errors
        self.samples1 = []
        self.samples2 = []
        self.samples3 = []

        Y = odof.Y.copy()
        N = self.sampling
        # progress bar kept although its animation is disabled for now
        pb = Progress(N, 20)

        # To get the random distribution, shuffle Y
        # and noise not required
        # To get the noise effects, do not shuffle and set noise to
        # something different from 0
        noise = 0.0
        design = df.values  # invariant across the iterations
        for _ in range(0, N):
            pylab.shuffle(Y)
            data_lm = OLS(Y + noise * pylab.randn(len(Y)), design).fit()
            anova_pvalues = self._get_anova_summary(data_lm,
                                                    output="dict",
                                                    odof=odof)
            # msi and tissue may be missing depending on the settings
            try:
                self.samples1.append(anova_pvalues["msi"])
            except KeyError:
                pass
            self.samples2.append(anova_pvalues["feature"])
            try:
                self.samples3.append(anova_pvalues["tissue"])
            except KeyError:
                pass

        import fitter
        ff = fitter.Fitter(-pylab.log10(self.samples2))
        dist = "genexpon"
        ff.distributions = [dist]
        ff.fit()
        self.pvalues_features[key] = {
            "error": ff.df_errors.loc[dist].values[0],
            "params": ff.fitted_param[dist],
            "feature": feature_name,
            "N": len(Y),
        }

    if show is True:
        boxplot = BoxPlots(odof, savefig=self.settings.savefig,
                           directory=directory)
        boxplot.boxplot_association(fignum=1)

        # a boxplot to show cell lines effects. This requires
        # the settings.analysis_type to be PANCAN
        if self.settings.analysis_type == "PANCAN":
            boxplot.boxplot_pancan(fignum=2, mode="tissue")
        if self.settings.include_MSI_factor:
            boxplot.boxplot_pancan(fignum=3, mode="msi")
        if self.settings.include_media_factor:
            # NOTE(review): same fignum as the msi boxplot above;
            # confirm whether a distinct figure number is intended
            boxplot.boxplot_pancan(fignum=3, mode="media")

    # about 30% of the time spent in creating the DataFrame...
    if production is True:
        return results
    # with newer version of pandas (v0.19), None are not accepted
    # anymore
    for k in results:
        if results[k] is None:
            results[k] = np.nan
    return pd.DataFrame(results, index=[1])