def __init__(self, gdsc, results=None, sep="\t", drug_decode=None,
             verbose=True):
    """.. rubric:: Constructor

    :param gdsc: the instance with which you created the results to report.
        Its ``settings`` are copied into :attr:`settings` and its
        ``drug_decode`` is used when no *drug_decode* argument is given.
    :param results: the results returned by :meth:`ANOVA.anova_all`. If
        not provided, the ANOVA is run on the fly.
    :param sep: accepted for backward compatibility; not used by this
        constructor.
    :param drug_decode: optional input handed to :class:`DrugDecode`
        (presumably a DRUG_DECODE filename). When provided, it is used
        instead of ``gdsc.drug_decode``.
    :param verbose: stored in :attr:`verbose`.
    """
    self.verbose = verbose
    self.figtools = Savefig(verbose=False)
    self.gdsc = gdsc

    if results is None:
        results = gdsc.anova_all()
    self.df = ANOVAResults(results).df  # this does a copy and sanity check

    # Make sure the DRUG identifiers are integers
    self.df.DRUG_ID = self.df.DRUG_ID.astype(int)

    # start from default settings and overwrite with those of the analysis
    self.settings = ANOVASettings()
    for k, v in gdsc.settings.items():
        self.settings[k] = v

    self._colname_drug_id = 'DRUG_ID'
    self.varname_pval = 'ANOVA_FEATURE_pval'
    self.varname_qval = 'ANOVA_FEATURE_FDR'

    # maybe there was no drug_decode in the gdsc parameter,
    # so a user may have provided a file, in which case, we need
    # to update the content of the drug decoder.
    if len(gdsc.drug_decode) == 0 and drug_decode is None:
        # the trailing space keeps the two sentences separated once the
        # adjacent string literals are concatenated
        warnings.warn("No drug name or target will be populated. "
                      "You may want to provide a DRUG_DECODE file.")
        self.drug_decode = DrugDecode()
    elif drug_decode is not None:
        # Read a file
        self.drug_decode = DrugDecode(drug_decode)
    else:
        # Copy from gdsc instance
        self.drug_decode = DrugDecode(gdsc.drug_decode)
    self.df = self.drug_decode.drug_annotations(self.df)

    # Infinite values are set to zero. This stays best-effort (a failure
    # must not prevent the report from being built) but is no longer silent.
    try:
        self.df = self.df.replace({np.inf: 0, -np.inf: 0})
    except Exception as err:
        warnings.warn("Could not replace infinite values in the results "
                      "(%s)" % err)

    # create some data
    self._set_sensible_df()

    self.company = None
def __init__(self, data, sep="\t", settings=None):
    """.. rubric:: Constructor

    :param data: an :class:`~gdsctools.anova.ANOVAResults` instance or
        a dataframe with the proper columns names (see below). The data
        are copied, the input is never modified.
    :param sep: accepted for backward compatibility; not used by this
        constructor.
    :param settings: an instance of
        :class:`~gdsctools.settings.ANOVASettings`. If not provided,
        default settings are created.

    Expected column names::

        ANOVA_FEATURE_pval
        ANOVA_FEATURE_FDR
        FEATURE_delta_MEAN_IC50
        FEATURE_IC50_effect_size
        N_FEATURE_pos
        FEATURE
        DRUG_ID

    If the plotting is too slow, you can use the :meth:`selector` to
    prune the results (most of the data are noise and overlap on the
    middle bottom area of the plot with little information).
    """
    # a copy since we may change the data
    try:
        # an ANOVAResults contains a df attribute
        self.df = data.df.copy()
    except AttributeError:
        # no df attribute: probably a dataframe
        self.df = data.copy()

    # this is redundant could reuse the input ??
    if settings is None:
        from gdsctools.settings import ANOVASettings
        self.settings = ANOVASettings()
    else:
        self.settings = AttrDict(**settings)

    self.figtools = Savefig()
    self.figtools.directory = self.settings.directory

    self.drugs = set(self.df[self._colname_drugid])
    self.features = set(self.df[self._colname_feature])

    # intensive calls made once for all
    self.groups_by_drugs = self.df.groupby(self._colname_drugid).groups
    self.groups_by_features = self.df.groupby(self._colname_feature).groups
def __init__(self, gdsc, results=None, sep="\t", drug_decode=None,
             verbose=True):
    """.. rubric:: Constructor

    :param gdsc: the instance with which you created the results to report.
        Its ``settings`` are copied into :attr:`settings` and its
        ``drug_decode`` is used when no *drug_decode* argument is given.
    :param results: the results returned by :meth:`ANOVA.anova_all`. If
        not provided, the ANOVA is run on the fly.
    :param sep: accepted for backward compatibility; not used by this
        constructor.
    :param drug_decode: optional input handed to :class:`DrugDecode`
        (presumably a DRUG_DECODE filename). When provided, it is used
        instead of ``gdsc.drug_decode``.
    :param verbose: stored in :attr:`verbose`.
    """
    self.verbose = verbose
    self.figtools = Savefig(verbose=False)
    self.gdsc = gdsc

    if results is None:
        results = gdsc.anova_all()
    self.df = ANOVAResults(results).df  # this does a copy and sanity check

    # Make sure the DRUG identifiers are integers
    self.df.DRUG_ID = self.df.DRUG_ID.astype(int)

    # start from default settings and overwrite with those of the analysis
    self.settings = ANOVASettings()
    for k, v in gdsc.settings.items():
        self.settings[k] = v

    self._colname_drug_id = 'DRUG_ID'
    self.varname_pval = 'ANOVA_FEATURE_pval'
    self.varname_qval = 'ANOVA_FEATURE_FDR'

    # maybe there was no drug_decode in the gdsc parameter,
    # so a user may have provided a file, in which case, we need
    # to update the content of the drug decoder.
    if len(gdsc.drug_decode) == 0 and drug_decode is None:
        # the trailing space keeps the two sentences separated once the
        # adjacent string literals are concatenated
        warnings.warn("No drug name or target will be populated. "
                      "You may want to provide a DRUG_DECODE file.")
        self.drug_decode = DrugDecode()
    elif drug_decode is not None:
        # Read a file
        self.drug_decode = DrugDecode(drug_decode)
    else:
        # Copy from gdsc instance
        self.drug_decode = DrugDecode(gdsc.drug_decode)
    self.df = self.drug_decode.drug_annotations(self.df)

    # Infinite values are set to zero. This stays best-effort (a failure
    # must not prevent the report from being built) but is no longer silent.
    try:
        self.df = self.df.replace({np.inf: 0, -np.inf: 0})
    except Exception as err:
        warnings.warn("Could not replace infinite values in the results "
                      "(%s)" % err)

    # create some data
    self._set_sensible_df()

    self.company = None
def __init__(self, gdsc, results, sep="\t", drug_decode=None):
    """.. rubric:: Constructor

    :param gdsc: the instance with which you created the results to report.
        Its ``settings`` are copied into :attr:`settings` and its
        ``drug_decode`` is used when no *drug_decode* argument is given.
    :param results: the results returned by :meth:`ANOVA.anova_all`
    :param sep: accepted for backward compatibility; not used by this
        constructor.
    :param drug_decode: optional input handed to :class:`DrugDecode`
        (presumably a DRUG_DECODE filename). When provided, it is used
        instead of ``gdsc.drug_decode``.
    """
    self.figtools = Savefig()
    self.gdsc = gdsc
    self.df = ANOVAResults(results).df  # this does a copy and sanity check

    # start from default settings and overwrite with those of the analysis
    self.settings = ANOVASettings()
    for k, v in gdsc.settings.items():
        self.settings[k] = v

    self._colname_drug_id = 'DRUG_ID'
    self.varname_pval = 'ANOVA_FEATURE_pval'
    self.varname_qval = 'ANOVA_FEATURE_FDR'

    # maybe there was no drug_decode in the gdsc parameter,
    # so a user may have provided a file, in which case, we need
    # to update the content of the drug decoder.
    if len(gdsc.drug_decode) == 0 and drug_decode is None:
        # the trailing space keeps the two sentences separated once the
        # adjacent string literals are concatenated
        warnings.warn("No drug name or target will be populated. "
                      "You may want to provide a DRUG_DECODE file.")
        self.drug_decode = DrugDecode()
    elif drug_decode is not None:
        # Read a file
        self.drug_decode = DrugDecode(drug_decode)
    else:
        # Copy from gdsc instance
        self.drug_decode = DrugDecode(gdsc.drug_decode)
    self.df = self.drug_decode.drug_annotations(self.df)

    # create some data
    self._set_sensible_df()

    # just to create the directory
    ReportMAIN(directory=self.settings.directory)
class VolcanoANOVA(object):
    """Utilities related to volcano plots

    This class is used in :mod:`gdsctools.anova` but can also be used
    independently as in the example below.

    .. plot::
        :include-source:
        :width: 80%

        from gdsctools import ANOVA, ic50_test, VolcanoANOVA
        an = ANOVA(ic50_test)

        # restrict analysis to a tissue to speed up computation
        an.set_cancer_type('lung_NSCLC')

        # Perform the entire analysis
        results = an.anova_all()

        # Plot volcano plot of pvalues versus signed effect size
        v = VolcanoANOVA(results)
        v.volcano_plot_all()

    .. note:: Within an IPython shell, you should be able to click on
        a circle and the title will be updated with the name of the
        drug/feature and FDR value.

    :Legend and color conventions:

    Green circles are significant hits with a negative signed effect
    (negative *FEATURE_delta_MEAN_IC50*) while red circles are significant
    hits with a positive one. Circles are colored if they are below the
    FDR_threshold AND below the pvalue_threshold AND (PANCAN analysis only)
    if the absolute signed effect size is above the effect_threshold.

    .. note:: NOTE(review): this docstring used to describe green as
        resistant and red as sensitive; confirm which wording matches the
        convention used by the rest of the package.

    """
    _colname_pvalue = 'ANOVA_FEATURE_pval'
    _colname_qvalue = 'ANOVA_FEATURE_FDR'
    _colname_drugid = 'DRUG_ID'
    _colname_feature = 'FEATURE'
    _colname_deltas = 'FEATURE_delta_MEAN_IC50'
    _colname_effect_size = 'FEATURE_IC50_effect_size'
    _colname_N_feature_pos = 'N_FEATURE_pos'

    def __init__(self, data, sep="\t", settings=None):
        """.. rubric:: Constructor

        :param data: an :class:`~gdsctools.anova.ANOVAResults` instance or
            a dataframe with the proper columns names (see below). The data
            are copied, the input is never modified.
        :param sep: accepted for backward compatibility; not used by this
            constructor.
        :param settings: an instance of
            :class:`~gdsctools.settings.ANOVASettings`. If not provided,
            default settings are created.

        Expected column names::

            ANOVA_FEATURE_pval
            ANOVA_FEATURE_FDR
            FEATURE_delta_MEAN_IC50
            FEATURE_IC50_effect_size
            N_FEATURE_pos
            FEATURE
            DRUG_ID

        If the plotting is too slow, you can use the :meth:`selector` to
        prune the results (most of the data are noise and overlap on the
        middle bottom area of the plot with little information).
        """
        # a copy since we may change the data
        try:
            # an ANOVAResults contains a df attribute
            self.df = data.df.copy()
        except AttributeError:
            # no df attribute: probably a dataframe
            self.df = data.copy()

        # this is redundant could reuse the input ??
        if settings is None:
            from gdsctools.settings import ANOVASettings
            self.settings = ANOVASettings()
        else:
            self.settings = AttrDict(**settings)

        self.figtools = Savefig()
        self.figtools.directory = self.settings.directory

        self.drugs = set(self.df[self._colname_drugid])
        self.features = set(self.df[self._colname_feature])

        # intensive calls made once for all
        self.groups_by_drugs = self.df.groupby(self._colname_drugid).groups
        self.groups_by_features = self.df.groupby(
            self._colname_feature).groups

    def selector(self, df, Nbest=1000, Nrandom=1000, inplace=False):
        """Select only the first N best rows and N random ones

        Sometimes, there are tens of thousands of associations and future
        analysis will include more features and drugs. Plotting volcano plots
        should therefore be fast and scalable. Here, we provide a naive
        way of speeding up the plotting by selecting only a subset of the
        data made of Nbest+Nrandom associations.

        The first *Nbest* rows of *df* are kept as they are (the dataframe
        is presumably sorted by significance by the caller; no sorting is
        done here).

        :param df: the input dataframe with ANOVAResults
        :param int Nbest: how many of the most significant association
            should be kept
        :param int Nrandom: on top of the Nbest significant association,
            set how many other randomly chosen associations are to be kept.
        :param bool inplace: if True, the pruned dataframe is stored in
            :attr:`df` and nothing is returned.
        :return: pruned dataframe (or the input itself, whatever the value
            of *inplace*, if it has fewer than *Nbest* rows)
        """
        if len(df) < Nbest:
            return df

        N = len(df)
        positions = list(range(Nbest))
        if N > Nbest:
            # a list is required: a range object can neither be shuffled
            # in place nor concatenated with a list
            others = list(range(Nbest, N))
            pylab.shuffle(others)
            n2pick = min(N, Nbest + Nrandom) - Nbest
            positions += others[:n2pick]

        # positional selection since the index labels may not be ordered
        df = df.iloc[positions]

        if inplace is True:
            self.df = df
        else:
            return df

    def volcano_plot_all_drugs(self):
        """Create a volcano plot for each drug and save in PNG files

        Each filename is set to **volcano_<drug identifier>.png**
        """
        drugs = list(self.df[self._colname_drugid].unique())
        pb = Progress(len(drugs), 1)
        for i, drug in enumerate(drugs):
            self.volcano_plot_one_drug(drug)
            self.savefig("volcano_%s.png" % drug, size_inches=(10, 10))
            pb.animate(i+1)

    def volcano_plot_all_features(self):
        """Create a volcano plot for each feature and save in PNG files

        Each filename is set to **volcano_<feature name>.png**
        """
        features = list(self.df[self._colname_feature].unique())
        print('Creating image for each feature (using all drugs)')
        pb = Progress(len(features), 1)
        for i, feature in enumerate(features):
            self.volcano_plot_one_feature(feature)
            self.savefig("volcano_%s.png" % feature, size_inches=(10, 10))
            pb.animate(i+1)

    def volcano_plot_all(self):
        """Create an overall volcano plot for all associations

        The figure is only drawn; use :meth:`savefig` to write it to a
        file.
        """
        # no annotations for all features.
        # this is slow, we can drop non relevant data
        data = self._get_volcano_sub_data('ALL')
        data['annotation'] = [''] * len(data)
        self._volcano_plot(data, title='all drugs all features')

    def _get_fdr_from_pvalue_interp(self, pvalue):
        """Here, FDR are computed using an interpolation

        Linear interpolation between the closest p-values found in
        :attr:`df` on each side of *pvalue*.
        """
        # small shift so that a p-value present in the data is strictly
        # bracketed by the two selections below
        pvalue += 1e-15
        qvals = self.df[self._colname_qvalue]
        pvals = self.df[self._colname_pvalue]
        ya = qvals[pvals < pvalue].max()
        yb = qvals[pvals > pvalue].min()
        xa = pvals[pvals < pvalue].max()
        xb = pvals[pvals > pvalue].min()
        dx = xb - xa
        dy = yb - ya
        yc = ya + dy * (pvalue - xa) / dx
        return yc

    def _get_pvalue_from_fdr(self, fdr):
        """Get pvalue for a given FDR threshold

        This is equivalent to v17 of the R version but is not very precise
        we should use _get_pvalue_from_fdr_interp instead.

        :param fdr: a FDR threshold or a list of FDR thresholds
        :return: the largest p-value whose FDR is strictly below the
            threshold (a list of such p-values if *fdr* is a list)
        """
        qvals = self.df[self._colname_qvalue]
        pvals = self.df[self._colname_pvalue]
        if isinstance(fdr, list):
            return [pvals[qvals < this].max() for this in fdr]
        return pvals[qvals < fdr].max()

    def _get_pvalue_from_fdr_interp(self, fdr):
        """Same as :meth:`_get_pvalue_from_fdr` with a linear interpolation"""
        fdr += 1e-15
        qvals = self.df[self._colname_qvalue]
        pvals = self.df[self._colname_pvalue]
        ya = pvals[qvals <= fdr].max()
        yb = pvals[qvals > fdr].min()
        xa = qvals[qvals <= fdr].max()
        xb = qvals[qvals > fdr].min()
        dx = xb - xa
        dy = yb - ya
        yc = ya + dy * (fdr - xa) / dx
        return yc

    def _get_volcano_global_data(self):
        """Return N_FEATURE_pos range and FDR/p-value pair using all data"""
        colname = self._colname_N_feature_pos
        minN = self.df[colname].min()
        maxN = self.df[colname].max()
        pvalues = self._get_pvalue_from_fdr(self.settings.FDR_threshold)
        return {'minN': minN, 'maxN': maxN,
                'pvalues': (self.settings.FDR_threshold, pvalues)}

    def _get_volcano_sub_data(self, mode, target=None):
        """Return the dataframe needed to draw one volcano plot

        :param mode: 'DRUG_ID' (all features of one drug), 'FEATURE' (all
            drugs of one feature) or 'ALL' (all associations)
        :param target: the drug identifier or feature name to select
            (ignored if mode is 'ALL')
        :return: a dataframe indexed from 0 to N-1 with the columns
            pvalue, signed_effect, Feature, Drug, text, FDR, color,
            annotation and markersize
        :raises ValueError: if *mode* is not one of the valid modes
        """
        # TODO could be simplified but works for now
        # groups created in the constructor once for all. They contain
        # index labels, hence the label-based .loc selection
        if mode == self._colname_drugid:
            subdf = self.df.loc[self.groups_by_drugs[target]]
            texts = subdf[self._colname_feature]
        elif mode == 'FEATURE':
            subdf = self.df.loc[self.groups_by_features[target]]
            texts = subdf[self._colname_drugid]
        elif mode == 'ALL':
            # nothing to do, get all data
            subdf = self.df
            texts = subdf[self._colname_feature]  # TODO + drug
        else:
            raise ValueError("mode parameter must be in [FEATURE, %s, ALL]" %
                             (self._colname_drugid))

        deltas = subdf[self._colname_deltas]
        effects = subdf[self._colname_effect_size]
        signed_effects = list(np.sign(deltas) * effects)
        qvals = list(subdf[self._colname_qvalue])
        pvals = list(subdf[self._colname_pvalue])

        data = pd.DataFrame(index=range(len(qvals)))
        data['pvalue'] = pvals
        data['signed_effect'] = signed_effects
        data['Feature'] = list(subdf[self._colname_feature])
        data['Drug'] = list(subdf[self._colname_drugid])
        # !! here, we need to use .values since the new dataframe is
        # indexed from 0 to N-1 whereas the original labels in subdf
        # may be arbitrary
        data['text'] = texts.values
        data['FDR'] = subdf[self._colname_qvalue].values

        colors = []
        annotations = []

        # FIXME: why do we have a switch here for PANCAN ?
        FDR_threshold = self.settings.FDR_threshold
        if self.settings.analysis_type == 'PANCAN':
            for sign, qval, pval in zip(signed_effects, qvals, pvals):
                if sign <= -self.settings.effect_threshold and \
                        qval <= FDR_threshold and \
                        pval <= self.settings.pvalue_threshold:
                    colors.append('green')
                    annotations.append(True)
                elif sign >= self.settings.effect_threshold and \
                        qval <= FDR_threshold and \
                        pval <= self.settings.pvalue_threshold:
                    colors.append('red')
                    annotations.append(True)
                else:
                    colors.append('black')
                    annotations.append(False)
        else:
            for delta, qval, pval in zip(deltas, qvals, pvals):
                if pval <= self.settings.pvalue_threshold and \
                        qval <= FDR_threshold and delta < 0:
                    colors.append('green')
                    annotations.append(True)
                elif pval <= self.settings.pvalue_threshold and \
                        qval <= FDR_threshold and delta > 0:
                    colors.append('red')
                    annotations.append(True)
                else:
                    colors.append('black')
                    annotations.append(False)

        # here we normalise wrt the selected data. In R code, normalised
        # by max across all data (minN, maxN)
        colname = self._colname_N_feature_pos
        markersize = subdf[colname] / subdf[colname].max()
        markersize = list(markersize * 800)
        # a minimal size so that small circles remain visible
        markersize = [x if x > 80 else 80 for x in markersize]

        data['color'] = colors
        data['annotation'] = annotations
        data['markersize'] = markersize

        return data

    def volcano_plot_one_feature(self, feature):
        """Volcano plot for one feature (all drugs)

        :param feature: a valid feature name to be found in the results
        """
        assert feature in self.features, 'unknown feature name'
        # FEATURE is the mode's name, not a column's name
        data = self._get_volcano_sub_data('FEATURE', feature)
        self._volcano_plot(data, title=feature)

    def volcano_plot_one_drug(self, drug_id):
        """Volcano plot for one drug (all genomic features)

        :param drug_id: a valid drug identifier to be found in the results
        """
        assert drug_id in self.drugs, 'unknown drug name'
        data = self._get_volcano_sub_data(self._colname_drugid, drug_id)
        self._volcano_plot(data, title=drug_id)

    def _volcano_plot(self, data, title=''):
        """Main volcano plot function called by other methods
        such as volcano_plot_all

        :param data: a dataframe as returned by
            :meth:`_get_volcano_sub_data`
        :param title: title of the figure (underscores are replaced by
            spaces)

        The plot is drawn in matplotlib figure 1. The plotted data, the
        scatter artist and the figure are stored in :attr:`data`,
        :attr:`scatter` and :attr:`current_fig`.
        """
        # This function is a bit complicated because it creates a few
        # tricky plots
        # It creates a volcano plot, which is the easy part
        # Then, it creates tooltips for the user interface in an IPython
        # shell using a callback to 'onpick' function coded here below

        # !! There seems to be a memory leak in this function due to
        # matplotlib. This is not easy to track down and should have no
        # impact now that ANOVAReport uses JS instead of matplotlib
        data = data.replace(np.inf, 0)
        data = data.replace(-np.inf, 0)

        colors = list(data['color'].values)
        pvalues = data['pvalue'].values
        signed_effects = data['signed_effect'].values
        markersize = data['markersize'].values
        Y = -np.log10(list(pvalues))  # should be cast to list ?

        fig = pylab.figure(num=1)
        fig.clf()
        ax = fig.add_subplot(111)
        # set_axis_bgcolor is not available anymore in recent matplotlib
        # versions; keep it only as a fallback for old installations
        if hasattr(ax, 'set_facecolor'):
            ax.set_facecolor('#EEEEEE')
        else:
            ax.set_axis_bgcolor('#EEEEEE')
        ax.cla()

        # TODO signed effects may be inf why ?
        X = [easydev.precision(x, digit=2) for x in signed_effects]
        Y = [easydev.precision(y, digit=2) for y in Y]

        # Using scatter() is slow as compared to plot()
        # However, plot cannot take different sizes/colors
        scatter = ax.scatter(X, Y, s=markersize,
                             alpha=0.3, c=colors, linewidth=1, picker=True)
        scatter.set_zorder(11)

        # symmetric x-range around zero
        m = abs(signed_effects.min())
        M = abs(signed_effects.max())
        pylab.xlabel("Signed effect size", fontsize=self.settings.fontsize)
        pylab.ylabel('-log10(pvalues)', fontsize=self.settings.fontsize)
        l = max([m, M]) * 1.1
        pylab.xlim([-l, l])
        ax.grid(color='white', linestyle='solid')

        # some aliases
        qvalue_min = self.df[self._colname_qvalue].min()
        qvalue_max = self.df[self._colname_qvalue].max()
        fdr = self.settings.FDR_threshold
        if fdr < qvalue_min:
            fdr = qvalue_min

        fdrs = sorted(self.settings.volcano_additional_FDR_lines)
        fdrs = fdrs[::-1]  # reverse sorting

        styles = ['--', ':', '-.']
        if self.settings.volcano_FDR_interpolation is True:
            get_pvalue_from_fdr = self._get_pvalue_from_fdr_interp
        else:
            get_pvalue_from_fdr = self._get_pvalue_from_fdr

        pvalue = get_pvalue_from_fdr(fdr)
        ax.axhline(-np.log10(pvalue), linestyle='--', lw=2,
                   color='red', alpha=1, label="FDR %s " % fdr + " %")

        for i, this in enumerate(fdrs):
            # lines outside the range of the data cannot be positioned
            if this < qvalue_min or this > qvalue_max:
                continue
            pvalue = get_pvalue_from_fdr(this)
            # styles are cycled so that more than three additional lines
            # do not raise an IndexError
            ax.axhline(-np.log10(pvalue), linestyle=styles[i % len(styles)],
                       color='red', alpha=1, label="FDR %s " % this + " %")

        pylab.ylim([0, pylab.ylim()[1]*1.2])  # times 1.2 to put the legend

        ax.axvline(0, color='gray', alpha=0.8, lw=2)
        axl = pylab.legend(loc='best')
        axl.set_zorder(10)  # in case there is a circle behind the legend.

        # For the static version
        title_handler = pylab.title("%s" % str(title).replace("_", " "),
                                    fontsize=self.settings.fontsize/1.2)

        # This code allows the ipython user to click on the matplotlib figure
        # to get information about the drug and feature of a given circle.
        # The data are indexed from 0 to N-1 in the same order as the
        # circles, hence the positional access.
        def onpick(event):
            ind = event.ind[0]
            try:
                title = str(str(data.iloc[ind]['Drug'])) + " / " + \
                    str(data.iloc[ind].Feature)
                title += "\nFDR=" + "%.4e" % data.iloc[ind]['FDR']
                title_handler.set_text(title.replace("_", " "))
            except Exception:
                print('Failed to create new title on click')
                print(data.iloc[ind].T)
            fig.canvas.draw()

        # keep track on the id for further memory release
        # For more info search for "matplotlib memory leak mpl_connect"
        self.cid = fig.canvas.mpl_connect('pick_event', onpick)

        self.data = data
        self.scatter = scatter
        self.current_fig = fig
        # not sure this is required. could be a memory leak here
        import gc
        gc.collect()

    def savefig(self, filename, size_inches=(10, 10)):
        """Save the current figure in a PNG file

        :param filename: name of the output file. The ``.png`` extension is
            appended only if it is missing, so that both ``volcano_all``
            and ``volcano_all.png`` create ``volcano_all.png``.
        :param size_inches: size of the figure forwarded to the
            ``savefig`` method of :attr:`figtools`
        """
        # The savefig automatically set the size
        # to a defined set and back to original figsize.
        if not filename.endswith('.png'):
            filename += '.png'
        self.figtools.savefig(filename, size_inches=size_inches)
class ANOVAReport(object): """Class used to interpret the results and create final HTML report Results is a data structure returned by :meth:`ANOVA.anova_all`. :: from gdsctools import * # Perform the analysis itself to get a set of results (dataframe) an = ANOVA(ic50_test) results = an.anova_all() # now, we can create the report. r = ANOVAReport(gdsc=an, results=results) # we can tune some settings r.settings.pvalue_threshold = 0.001 r.settings.FDR_threshold = 28 r.settings.directory = 'testing' r.create_html_pages() .. rubric:: Significant association An association is significant if - The field *ANOVA_FEATURE_FDR* must be < FDR_threshold - The field *ANOVA_FEATURE_pval* must be < pvalue_threshold It is then labelled **sensible** if *FEATURE_delta_MEAN_IC50* is below 0, otherwise it is **resistant**. """ def __init__(self, gdsc, results=None, sep="\t", drug_decode=None, verbose=True): """.. rubric:: Constructor :param gdsc: the instance with which you created the results to report :param results: the results returned by :meth:`ANOVA.anova_all`. If not provided, the ANOVA is run on the fly. """ self.verbose = verbose self.figtools = Savefig(verbose=False) self.gdsc = gdsc if results is None: results = gdsc.anova_all() self.df = ANOVAResults(results).df # this does a copy and sanity check # Make sure the DRUG are integers self.df.DRUG_ID = self.df.DRUG_ID.astype(int) self.settings = ANOVASettings() for k, v in gdsc.settings.items(): self.settings[k] = v self._colname_drug_id = 'DRUG_ID' self.varname_pval = 'ANOVA_FEATURE_pval' self.varname_qval = 'ANOVA_FEATURE_FDR' # maybe there was not drug_decode in the gdsc parameter, # so a user may have provide a file, in which case, we need # to update the content of the dur_decoder. if len(gdsc.drug_decode) == 0 and drug_decode is None: warnings.warn("No drug name or target will be populated." 
"You may want to provide a DRUG_DECODE file.") self.drug_decode = DrugDecode() elif drug_decode is not None: # Read a file self.drug_decode = DrugDecode(drug_decode) else: # Copy from gdsc instance self.drug_decode = DrugDecode(gdsc.drug_decode) self.df = self.drug_decode.drug_annotations(self.df) #if sum(self.df == np.inf).sum()>0: # print("WARNING: infinite values were found in your results... Set to zero") try: self.df = self.df.replace({np.inf: 0, -np.inf: 0}) except: pass # create some data self._set_sensible_df() self.company = None # just to create the directory # ReportMain(directory=self.settings.directory, verbose=self.verbose) def _get_ndrugs(self): return len(self.df[self._colname_drug_id].unique()) n_drugs = property(_get_ndrugs, doc="return number of drugs") def _get_ntests(self): return len(self.df.index) n_tests = property(_get_ntests) def _get_ncelllines(self): return len(self.gdsc.features.df.index) n_celllines = property(_get_ncelllines, doc="return number of cell lines") def _df_append(self, df, data): count = len(df) df.loc[count] = data return df def diagnostics(self): """Return summary of the analysis (dataframe)""" self._set_sensible_df() df = pd.DataFrame({'text': [], 'value': []}) n_features = len(self.gdsc.features.df.columns) n_features -= self.gdsc.features.shift n_drugs = len(self.df[self._colname_drug_id].unique()) N = float(n_drugs * n_features) if N == 0: ratio = 0 else: ratio = float(self.n_tests) / (N) * 100 try: ratio = easydev.precision(ratio, digit=2) except: # Fixme: this is a hack for the infinite values but should not # happen... 
ratio = 0 msg = "Type of analysis" df = self._df_append(df, [msg, self.settings.analysis_type]) msg = "Total number of possible drug/feature associations" df = self._df_append(df, [msg, int(N)]) msg = "Total number of ANOVA tests performed" df = self._df_append(df, [msg, self.n_tests]) msg = "Percentage of tests performed" df = self._df_append(df, [msg, ratio]) # trick to have an empty line df = self._df_append(df, ["", ""]) msg = "Total number of tested drugs" df = self._df_append(df, [msg, n_drugs]) msg = "Total number of genomic features used" df = self._df_append(df, [msg, n_features]) msg = "Total number of screened cell lines" df = self._df_append(df, [msg, self.n_celllines]) msg = "MicroSatellite instability included as factor" msi = self.settings.include_MSI_factor df = self._df_append(df, [msg, msi]) # trick to have an empty line df = self._df_append(df, ["", ""]) nsens = len(self.sensible_df) nres = len(self.resistant_df) msg = "Total number of significant associations" df = self._df_append(df, [msg, nsens + nres]) msg = " - sensitive" df = self._df_append(df, [msg, nsens]) msg = " - resistant" df = self._df_append(df, [msg, nres]) msg = "p-value significance threshold" df = self._df_append(df, [msg, self.settings.pvalue_threshold]) msg = "FDR significance threshold" df = self._df_append(df, [msg, self.settings.FDR_threshold]) p1, p2 = self._get_pval_range() msg = 'Range of significant p-values' value = "[{:.4}, {:.4}]".format(p1, p2) df = self._df_append(df, [msg, value]) f1, f2 = self._get_fdr_range() msg = "Range of significant % FDRs" value = '[{:.4} {:.4}]'.format(f1, f2) df = self._df_append(df, [msg, value]) return df def _get_pval_range(self): """Get pvalues range of the significant hits""" nsens = len(self.sensible_df) nres = len(self.resistant_df) N = nsens + nres if N == 0: return 0., 0. 
name = self.varname_pval data = self.df[name].iloc[0:N] m, M = data.min(), data.max() return m, M def _get_fdr_range(self): """Get FDR range of the significant hits""" name = self.varname_qval data = self.df[name][(self.df[name] < self.settings.FDR_threshold)] if len(data) == 0: return 0., 0. m, M = data.min(), data.max() return m, M def _set_sensible_df(self): # just an alias logand = np.logical_and # select sensible data set mask1 = self.df['ANOVA_FEATURE_FDR'] < self.settings.FDR_threshold mask2 = self.df['ANOVA_FEATURE_pval'] < self.settings.pvalue_threshold mask3 = self.df['FEATURE_delta_MEAN_IC50'] < 0 self.sensible_df = self.df[logand(logand(mask1, mask2), mask3)] # select resistant data set mask3 = self.df['FEATURE_delta_MEAN_IC50'] >= 0 self.resistant_df = self.df[logand(logand(mask1, mask2), mask3)] def get_significant_set(self): """Return significant hits (resistant and sensible)""" # a property that is long to compute # and may change if FDR changes. self._set_sensible_df() df = pd.concat([self.sensible_df, self.resistant_df]) try: df.sort_values('ASSOC_ID', inplace=True) except: df.sort('ASSOC_ID', inplace=True) return df def _get_data(self, df_count_sensible, df_count_resistant): # we can drop all columns except one, which is renamed as count df1 = df_count_sensible['ASSOC_ID'] df1.name = 'sens assoc' df2 = df_count_resistant['ASSOC_ID'] df2.name = 'res assoc' # Now, we join the two TimeSeries (note that above, we selected only # one column so the dataframe was downcast to time series) df_count = pd.DataFrame(df1).join(pd.DataFrame(df2), how='outer') # and set NA to zero df_count.fillna(0, inplace=True) # let us add a new column with the total df_count['total'] = df_count['sens assoc'] + df_count['res assoc'] # we want to sort by 'total' column and is equality by the name, # which is the index. 
So let us add the index temporarily as # a column, sort, and remove 'name' column afterwards df_count['name'] = df_count.index try: df_count.sort_values(by=['total', 'name'], ascending=[False, True], inplace=True) except: df_count.sort(columns=['total', 'name'], ascending=[False, True], inplace=True) df_count.drop('name', axis=1, inplace=True) return df_count def get_drug_summary_data(self): """Return dataframe with drug summary""" # get sensible and resistant sub dataframes self._set_sensible_df() # group by drug colname = self._colname_drug_id df_count_sensible = self.sensible_df.groupby(colname).count() df_count_resistant = self.resistant_df.groupby(colname).count() df_count = self._get_data(df_count_sensible, df_count_resistant) return df_count def drug_summary(self, top=50, fontsize=15, filename=None): """Return dataframe with significant drugs and plot figure :param fontsize: :param top: max number of significant associations to show :param filename: if provided, save the file in the directory """ df_count = self.get_drug_summary_data() if len(df_count): self._plot(df_count, 'drug', top) fig = pylab.gcf() self.figtools.directory = self.settings.directory self.figtools.savefig(filename, size_inches=(12, 14), bbox_inches='tight') return df_count def get_feature_summary_data(self): """Return dataframe with feature summary""" # get sensible and resistant sub dataframes self._set_sensible_df() df_count_sensible = self.sensible_df.groupby('FEATURE').count() df_count_resistant = self.resistant_df.groupby('FEATURE').count() df_count = self._get_data(df_count_sensible, df_count_resistant) return df_count def feature_summary(self, filename=None, top=50, fontsize=15): """Return dataframe with significant features and plot figure :param fontsize: :param top: max number of significant associations to show :param filename: if provided, save the file in the directory """ df_count = self.get_feature_summary_data() if len(df_count) > 0: self._plot(df_count, 'feature', top) 
#fig = pylab.gcf() self.figtools.directory = self.settings.directory self.figtools.savefig(filename, set_inches=(12, 14), bbox_inches='tight') return df_count def _plot(self, df_count, title_tag, top): """Used by drug_summary and feature_summary to plot the bar plot""" if top > len(df_count): top = len(df_count) df = df_count.iloc[0:top][[u'sens assoc', u'res assoc']] labels = list(df.index) # add drug name if len(self.drug_decode) > 0: for i, label in enumerate(labels): if title_tag == 'drug': name = self.drug_decode.get_name(label) if name is not None: labels[i] = "{}-{}".format(labels[i], name) else: pass labels = [str(x).replace('_', ' ') for x in labels] # restrict size to first 30 characters labels = [x[0:30] for x in labels] ind = range(0, len(labels)) # reverse does not exist with python3 try: ind.reverse() except: ind = list(ind) ind.reverse() data1 = df['sens assoc'].values data2 = df['res assoc'].values pylab.figure(1) pylab.clf() p1 = pylab.barh(ind, data1, height=0.8, color='purple', label='sensitivity') p2 = pylab.barh(ind, data2, height=0.8, color='orange', left=data1, label='resistance') ax = pylab.gca() self.labels = labels ax.set_yticks([x + 0.5 for x in ind]) ax.set_yticklabels(labels, fontsize=12) xticks = ax.get_xticks() ax.set_xticklabels( [int(x) if divmod(x, 1)[1] == 0 else "" for x in xticks]) pylab.grid() pylab.title(r"Top %s %s(s) most frequently " % (top, title_tag) + \ "\nassociated with drug response", fontsize=self.settings.fontsize/1.2) pylab.xlabel(r'Number of significant associations (FDR %s %s %s) ' % ("$>$", self.settings.FDR_threshold, "$\%$"), fontsize=16) M = max(data1 + data2) #ax.set_xticks() #ax.set_xticklabels(labels, fontsize=fontsize) ax.set_xlim([0, M + 1]) pylab.legend(loc='lower right') try: pylab.tight_layout() except: pass def get_significant_hits(self, show=True): """Return a summary of significant hits :param show: show a plot with the distribution of significant hits .. 
todo:: to finalise """ fdrs = range(5, 50 + 1, 5) significants = [] significant_meaningful = [] strong_hits = [] full_strong_hits = [] MC1 = 1 MC2 = 2 mask2 = self.df['FEATURE_pos_logIC50_MEAN'] < MC1 mask3 = self.df['FEATURE_pos_logIC50_MEAN'] < MC2 mask4 = self.df['FEATURE_neg_logIC50_MEAN'] < MC1 mask5 = self.df['FEATURE_neg_logIC50_MEAN'] < MC2 maskMC = mask2 + mask3 + mask4 + mask5 for fdr in fdrs: # significant hits res = self.df['ANOVA_FEATURE_FDR'] < fdr significants.append(res.sum()) # meaningful hits indices = np.logical_and(self.df['ANOVA_FEATURE_FDR'] < fdr, maskMC) significant_meaningful.append(indices.sum()) # meaningful strong hits mask1 = self.df.loc[indices]['FEATURE_pos_Glass_delta'] >= 1 mask2 = self.df.loc[indices]['FEATURE_neg_Glass_delta'] >= 1 strong_hits.append(np.logical_or(mask1, mask2).sum()) # meaningful full strong hits mask1 = self.df.loc[indices]['FEATURE_pos_Glass_delta'] >= 1 mask2 = self.df.loc[indices]['FEATURE_neg_Glass_delta'] >= 1 full_strong_hits.append(np.logical_and(mask1, mask2).sum()) data = { 'significants': significants, 'full_strong_hits': full_strong_hits, 'strong_hits': strong_hits, 'significant_meaningful': significant_meaningful } df = pd.DataFrame(data, columns=[ 'significants', 'significant_meaningful', 'strong_hits', 'full_strong_hits' ], index=fdrs) df.columns = [ '1) significant', '2) 1 + meaningful', '3) 2 + strong', '4) 2+ very strong' ] if show is True: pylab.clf() ax = pylab.gca() df.plot(kind='bar', width=.8, color=['r', 'gray', 'orange', 'black'], rot=0, ax=ax) pylab.grid() # original is 'aquamarine4','cyan2','cornflowerblue ','aquamarine'), return df def __str__(self): self.df.info() return "" def create_html_associations(self): """Create an HTML page for each significant association The name of the output HTML file is **<association id>.html** where association id is stored in :attr:`df`. 
""" if self.verbose: print( "Creating individual HTML pages for each significant association" ) df = self.get_significant_set() drugs = df['DRUG_ID'].values features = df['FEATURE'].values assocs = df['ASSOC_ID'].values fdrs = df['ANOVA_FEATURE_FDR'].values N = len(df) pb = Progress(N) html = Association(self, drug='dummy', feature='dummy', fdr='dummy', company=self.company) for i in range(N): html.drug = drugs[i] html.feature = features[i] if str(assocs[i]).startswith("a"): html._filename = str(assocs[i]) + '.html' else: html._filename = "a" + str(assocs[i]) + '.html' html.fdr = fdrs[i] html.assoc_id = assocs[i] #html._init_report() # since we have one shared instance html.create_report(onweb=False) if self.settings.animate: pb.animate(i + 1) if self.settings.animate: print("\n") def create_html_features(self): """Create an HTML page for each significant feature""" df = self.get_significant_set() groups = df.groupby('FEATURE') if self.verbose: print("Creating individual HTML pages for each feature") N = len(groups.indices.keys()) pb = Progress(N) for i, feature in enumerate(groups.indices.keys()): # get the indices and therefore subgroup subdf = groups.get_group(feature) html = HTMLOneFeature(self, self.df, subdf, feature) html.create_report(onweb=False) if self.settings.animate: pb.animate(i + 1) if self.settings.animate: print("\n") def create_html_drugs(self): """Create an HTML page for each drug""" # group by drugs all_drugs = list(self.df['DRUG_ID'].unique()) df = self.get_significant_set() groups = df.groupby('DRUG_ID') if self.verbose: print("Creating individual HTML pages for each drug") N = len(groups.indices.keys()) N = len(all_drugs) pb = Progress(N) for i, drug in enumerate(all_drugs): # enumerate(groups.indices.keys()): # get the indices and therefore subgroup if drug in groups.groups.keys(): subdf = groups.get_group(drug) else: subdf = {} html = HTMLOneDrug(self, self.df, subdf, drug) html.create_report(onweb=False) if self.settings.animate: 
pb.animate(i + 1) if self.settings.animate: print("\n") def create_html_main(self, onweb=False): """Create HTML main document (summary)""" self._set_sensible_df() if self.verbose: print("Creating main HTML page in directory %s" % (self.settings.directory)) ReportMain(directory=self.settings.directory, verbose=self.verbose) buffer_ = self.settings.savefig self.settings.savefig = True html = HTMLPageMain(self, 'index.html') html._init_report() # created the directory html.create_report(onweb=onweb) self.settings.savefig = buffer_ def create_html_manova(self, onweb=True): """Create summary table with all significant hits :param onweb: open browser with the created HTML page. """ df = self.get_significant_set() page = HTMLPageMANOVA(self, df, self.company) page.create_report(onweb) def create_html_pages(self, onweb=True): """Create all HTML pages""" self.create_html_main(onweb=onweb) self.create_html_manova(onweb=False) self.create_html_drugs() self.create_html_features() self.create_html_associations() def onweb(self): from easydev import onweb onweb(self.settings.directory + os.sep + 'index.html')
class VolcanoANOVA(object):
    """Utilities related to volcano plots

    This class is used in :mod:`gdsctools.anova` but can also be used
    independently as in the example below.

    .. plot::
        :include-source:
        :width: 80%

        from gdsctools import ANOVA, ic50_test, VolcanoANOVA
        an = ANOVA(ic50_test)

        # restrict analysis to a tissue to speed up computation
        an.set_cancer_type('lung_NSCLC')

        # Perform the entire analysis
        results = an.anova_all()

        # Plot volcano plot of pvalues versus signed effect size
        v = VolcanoANOVA(results)
        v.volcano_plot_all()

    .. note:: Within an IPython shell, you should be able to click on
        a circle and the title will be updated with the name of the
        drug/feature and FDR value.

    .. note:: (**for developers**) A javascript version is also created on
        the fly using mpld3 library. It is used in the creation of the HTML
        report but one can use it as well in an ipython notebook::

            # The **v** instance is as created in the example above
            # Then, type the following code to create an HTML with the
            # javascript plot embedded.
            import mpld3
            htmljs = mpld3.fig_to_html(v.current_fig)
            fh = open('volcano_doc.html', 'w')
            fh.write(htmljs)
            fh.close()

    There are 5 methods to plot volcano plots depending on what you want to
    see

    - :meth:`volcano_plot_all` as above plots all associations
    - :meth:`volcano_plot_all_drugs` creates a volcano plot for each drug
      and save it into a PNG file. This method calls
      :meth:`volcano_plot_one_drug`.
    - :meth:`volcano_plot_all_features` creates a volcano plot for each
      feature and save it into a PNG file. This method calls
      :meth:`volcano_plot_one_feature`.

    """
    # Names of the columns read from the results dataframe.
    _colname_pvalue = 'ANOVA_FEATURE_pval'
    _colname_qvalue = 'ANOVA_FEATURE_FDR'
    _colname_drugid = 'DRUG_ID'
    _colname_feature = 'FEATURE'
    _colname_deltas = 'FEATURE_delta_MEAN_IC50'
    _colname_effect_size = 'FEATURE_IC50_effect_size'
    _colname_N_feature_pos = 'N_FEATURE_pos'

    def __init__(self, data, sep="\t", settings=None):
        """.. rubric:: Constructor

        :param data: an :class:`~gdsctools.anova.ANOVAResults` instance
            or a dataframe with the proper columns names (see below)
        :param sep: not used by this constructor; kept so that existing
            callers keep working.
        :param settings: an instance of
            :class:`~gdsctools.settings.ANOVASettings`. If None, a default
            ANOVASettings is created; otherwise the mapping is copied into
            an AttrDict.

        Expected column names::

            ANOVA_FEATURE_pval
            ANOVA_FEATURE_FDR
            FEATURE_delta_MEAN_IC50
            FEATURE_IC50_effect_size
            N_FEATURE_pos
            FEATURE
            DRUG_ID

        If the plotting is too slow, you can use the :meth:`selector` to
        prune the results (most of the data are noise and overlap on the
        middle bottom area of the plot with little information).

        """
        # Work on a copy since the data may be changed (see selector).
        try:
            # an ANOVAResults contains a df attribute
            self.df = data.df.copy()
        except AttributeError:
            # no df attribute: assume this is already a dataframe
            self.df = data.copy()

        if settings is None:
            from gdsctools.settings import ANOVASettings
            self.settings = ANOVASettings()
        else:
            self.settings = AttrDict(**settings)

        self.figtools = Savefig()
        self.figtools.directory = self.settings.directory

        self.drugs = set(self.df[self._colname_drugid])
        self.features = set(self.df[self._colname_feature])

        # intensive calls made once for all
        self.groups_by_drugs = self.df.groupby(self._colname_drugid).groups
        self.groups_by_features = \
            self.df.groupby(self._colname_feature).groups

    def selector(self, df, Nbest=1000, Nrandom=1000, inplace=False):
        """Select only the first N best rows and N random ones

        Sometimes, there are tens of thousands of associations and future
        analysis will include more features and drugs. Plotting volcano
        plots should therefore be fast and scalable. Here, we provide a
        naive way of speeding up the plotting by selecting only a subset
        of the data made of Nbest+Nrandom associations.

        The "best" rows are simply the first *Nbest* rows of *df*: no
        sorting is done here, so the caller provides the ordering.

        :param df: the input dataframe with ANOVAResults
        :param int Nbest: how many of the most significant association
            should be kept
        :param int Nrandom: on top of the Nbest significant association,
            set how many other randomly chosen associations are to be kept.
        :param bool inplace: if True, the pruned dataframe is stored in
            :attr:`df` and nothing is returned.
        :return: pruned dataframe (or None when *inplace* is True). When
            *df* has fewer than *Nbest* rows it is returned unchanged,
            whatever the value of *inplace*.
        """
        if len(df) < Nbest:
            return df

        N = len(df)
        # Positions (not labels) of the rows to keep: the index of the
        # dataframe may not be ordered.
        positions = list(range(Nbest))
        if N > Nbest:
            # a list is required: a Python 3 range cannot be shuffled or
            # concatenated with a list
            candidates = list(range(Nbest, N))
            np.random.shuffle(candidates)
            n2pick = min(N, Nbest + Nrandom) - Nbest
            positions += candidates[:n2pick]

        df = df.iloc[positions]

        if inplace is True:
            self.df = df
        else:
            return df

    def _release_figure(self):
        """Disconnect the pick callback and clear the mpld3 plugins

        Called after each figure of the volcano_plot_all_* loops to
        prevent a memory leak.
        """
        self.current_fig.canvas.mpl_disconnect(self.cid)
        try:
            import mpld3
            mpld3.plugins.clear(self.current_fig)
        except Exception:
            # best effort: mpld3 is optional
            pass

    def volcano_plot_all_drugs(self):
        """Create a volcano plot for each drug and save in PNG files

        Each plot is saved with :meth:`savefig_and_js` using the name
        **volcano_<drug identifier>.png**
        """
        drugs = list(self.df[self._colname_drugid].unique())
        pb = Progress(len(drugs), 1)
        for i, drug in enumerate(drugs):
            self.volcano_plot_one_drug(drug)
            self.savefig_and_js("volcano_%s.png" % drug,
                                size_inches=(10, 10))
            pb.animate(i + 1)
            # This prevent memory leak.
            self._release_figure()

    def volcano_plot_all_features(self):
        """Create a volcano plot for each feature and save in PNG files

        Each plot is saved with :meth:`savefig_and_js` using the name
        **volcano_<feature name>.png**
        """
        features = list(self.df[self._colname_feature].unique())
        print('Creating image for each feature (using all drugs)')
        pb = Progress(len(features), 1)
        for i, feature in enumerate(features):
            self.volcano_plot_one_feature(feature)
            self.savefig_and_js("volcano_%s.png" % feature,
                                size_inches=(10, 10))
            pb.animate(i + 1)
            # This prevent memory leak.
            self._release_figure()

    def volcano_plot_all(self):
        """Create an overall volcano plot for all associations

        The figure is only drawn; nothing is written to disk here (see
        :meth:`savefig_and_js` to save the current figure).
        """
        # no annotations for all features.
        # this is slow, we can drop non relevant data
        data = self._get_volcano_sub_data('ALL')
        data['annotation'] = [''] * len(data)
        self._volcano_plot(data, title='all drugs all features')

    def _get_fdr_from_pvalue_interp(self, pvalue):
        """Here, FDR are computed using an interpolation

        Linear interpolation between the largest p-value strictly below
        *pvalue* and the smallest one strictly above it, paired with the
        largest FDR on the lower side and the smallest FDR on the upper
        side.
        """
        # tiny shift added before the strict comparisons below
        pvalue += 1e-15
        qvals = self.df[self._colname_qvalue]
        pvals = self.df[self._colname_pvalue]
        below = pvals < pvalue
        above = pvals > pvalue
        ya = qvals[below].max()
        yb = qvals[above].min()
        xa = pvals[below].max()
        xb = pvals[above].min()
        return ya + (yb - ya) * (pvalue - xa) / (xb - xa)

    def _get_pvalue_from_fdr(self, fdr):
        """Get pvalue for a given FDR threshold

        This is equivalent to v17 of the R version but is not very precise
        we should use _get_pvalue_from_fdr_interp instead but needs to be
        tested.

        :param fdr: a single FDR threshold or a list of thresholds
        :return: the largest p-value among the rows whose FDR is strictly
            below the threshold (a list of such values if *fdr* is a list)
        """
        qvals = self.df[self._colname_qvalue]
        pvals = self.df[self._colname_pvalue]
        if isinstance(fdr, list):
            return [pvals[qvals < this].max() for this in fdr]
        return pvals[qvals < fdr].max()

    def _get_pvalue_from_fdr_interp(self, fdr):
        """Same as _get_pvalue_from_fdr but with a linear interpolation"""
        # tiny shift added before the strict comparisons below
        fdr += 1e-15
        qvals = self.df[self._colname_qvalue]
        pvals = self.df[self._colname_pvalue]
        below = qvals < fdr
        above = qvals > fdr
        ya = pvals[below].max()
        yb = pvals[above].min()
        xa = qvals[below].max()
        xb = qvals[above].min()
        return ya + (yb - ya) * (fdr - xa) / (xb - xa)

    def _get_volcano_global_data(self):
        """Return min/max of N_FEATURE_pos and the FDR threshold p-value

        :return: dictionary with keys *minN*, *maxN* and *pvalues*, the
            latter being the tuple (FDR threshold, corresponding p-value)
        """
        # using all data
        counts = self.df[self._colname_N_feature_pos]
        threshold = self.settings.FDR_threshold
        return {
            'minN': counts.min(),
            'maxN': counts.max(),
            'pvalues': (threshold, self._get_pvalue_from_fdr(threshold))
        }

    def _get_volcano_sub_data(self, mode, target=None):
        """Return the data needed for one volcano plot

        :param mode: DRUG_ID (rows of one drug), 'FEATURE' (rows of one
            feature) or 'ALL' (all rows)
        :param target: the drug identifier or feature name to select;
            not used when mode is 'ALL'
        :return: dataframe indexed from 0 to N-1 with columns pvalue,
            signed_effect, Feature, Drug, text, FDR, color, annotation
            and markersize
        :raises ValueError: if *mode* is not one of the three values above
        """
        # groups created in the constructor once for all
        if mode == self._colname_drugid:
            subdf = self.df.loc[self.groups_by_drugs[target]]
            texts = subdf[self._colname_feature]
        elif mode == 'FEATURE':
            subdf = self.df.loc[self.groups_by_features[target]]
            texts = subdf[self._colname_drugid]
        elif mode == 'ALL':
            # nothing to do, get all data
            subdf = self.df
            texts = subdf[self._colname_feature]  # TODO + drug
        else:
            raise ValueError("mode parameter must be in [FEATURE, %s, ALL]"
                             % (self._colname_drugid))

        deltas = subdf[self._colname_deltas]
        effects = subdf[self._colname_effect_size]
        signed_effects = list(np.sign(deltas) * effects)
        qvals = list(subdf[self._colname_qvalue])
        pvals = list(subdf[self._colname_pvalue])

        # !! below, lists or .values are used on purpose: the new
        # dataframe is indexed from 0 to N-1 whereas the original indices
        # in subdf may be anything
        data = pd.DataFrame(index=range(len(qvals)))
        data['pvalue'] = pvals
        data['signed_effect'] = signed_effects
        data['Feature'] = list(subdf[self._colname_feature])
        data['Drug'] = list(subdf[self._colname_drugid])
        data['text'] = texts.values
        data['FDR'] = subdf[self._colname_qvalue].values

        # just aliases
        FDR_threshold = self.settings.FDR_threshold
        colors = []
        annotations = []

        if self.settings.analysis_type == 'PANCAN':
            # PANCAN: significance is based on signed effect size and FDR
            effect_threshold = self.settings.effect_threshold
            for sign, qval in zip(signed_effects, qvals):
                if sign <= -effect_threshold and qval <= FDR_threshold:
                    colors.append('green')
                    annotations.append(True)
                elif sign >= effect_threshold and qval <= FDR_threshold:
                    colors.append('red')
                    annotations.append(True)
                else:
                    colors.append('black')
                    annotations.append(False)
        else:
            # otherwise: based on p-value, FDR and sign of the delta
            pvalue_threshold = self.settings.pvalue_threshold
            for delta, qval, pval in zip(deltas, qvals, pvals):
                significant = pval <= pvalue_threshold and \
                    qval <= FDR_threshold
                if significant and delta < 0:
                    colors.append('green')
                    annotations.append(True)
                elif significant and delta > 0:
                    colors.append('red')
                    annotations.append(True)
                else:
                    colors.append('black')
                    annotations.append(False)

        # here we normalise wrt the selected rows. In R code, normalised
        # by max across all data (minN, maxN)
        counts = subdf[self._colname_N_feature_pos]
        markersize = list(counts / counts.max() * 800)
        # enforce a minimum marker size of 80
        markersize = [x if x > 80 else 80 for x in markersize]

        data['color'] = colors
        data['annotation'] = annotations
        data['markersize'] = markersize
        return data

    def volcano_plot_one_feature(self, feature):
        """Volcano plot for one feature (all drugs)

        :param feature: a valid feature name to be found in the results
        """
        assert feature in self.features, 'unknown feature name'
        # FEATURE is the mode's name, not a column's name
        data = self._get_volcano_sub_data('FEATURE', feature)
        self._volcano_plot(data, title=feature)

    def volcano_plot_one_drug(self, drug_id):
        """Volcano plot for one drug (all genomic features)

        :param drug_id: a valid drug identifier to be found in the results
        """
        assert drug_id in self.drugs, 'unknown drug name'
        data = self._get_volcano_sub_data(self._colname_drugid, drug_id)
        self._volcano_plot(data, title=drug_id)

    def _volcano_plot(self, data, title=''):
        """Main volcano plot function called by other methods such as
        volcano_plot_all

        :param data: dataframe as returned by
            :meth:`_get_volcano_sub_data`
        :param title: title of the figure; converted to a string, so an
            integer drug identifier is accepted

        The following attributes are set: :attr:`cid` (identifier of the
        pick callback), :attr:`data`, :attr:`scatter` and
        :attr:`current_fig`.
        """
        # This function is a bit complicated because it does create a few
        # tricky plots:
        # It creates a volcano plot, which is the easy part
        # Then, it creates tooltips for the user interface in an IPython
        # shell using a callback to 'onpick' function coded here below
        # finally, it creates a Javascript connection using mpld3 that
        # will allow the creation of a JS version of the plot.

        # !! There is a memory leak in this function due to matplotlib
        # This is not easy to track down.
        # You have to call clf() to make sure the content is erased.
        # One reason for the memory leak is that it is called in the
        # Report to loop over all drugs and then all features.
        # To see the memory leak, you will need to call the
        # volcano_plot_all_drugs function (or volcano_plot_all_features).
        colors = list(data['color'].values)
        pvalues = data['pvalue'].values
        signed_effects = data['signed_effect'].values
        markersize = data['markersize'].values
        Y = -np.log10(list(pvalues))

        fig = pylab.figure(num=1)
        fig.clf()
        ax = fig.add_subplot(111)
        try:
            ax.set_facecolor('#EEEEEE')
        except AttributeError:
            # older matplotlib releases without set_facecolor
            ax.set_axis_bgcolor('#EEEEEE')
        ax.cla()

        X = [easydev.precision(x, digit=2) for x in signed_effects]
        Y = [easydev.precision(y, digit=2) for y in Y]

        # Using scatter() is slow as compared to plot()
        # However, plot cannot take different sizes/colors
        scatter = ax.scatter(X, Y, s=markersize, alpha=0.3, c=colors,
                             linewidth=1, picker=True)
        scatter.set_zorder(11)

        pylab.xlabel("Signed effect size", fontsize=self.settings.fontsize)
        pylab.ylabel('-log10(pvalues)', fontsize=self.settings.fontsize)
        # symmetric x-axis with a 10% margin
        limit = 1.1 * max(abs(signed_effects.min()),
                          abs(signed_effects.max()))
        pylab.xlim([-limit, limit])

        ax.grid(color='white', linestyle='solid')

        # some aliases
        fdr = self.settings.FDR_threshold
        fdrs = sorted(self.settings.volcano_additional_FDR_lines,
                      reverse=True)
        styles = ['--', ':', '-.']

        if self.settings.volcano_FDR_interpolation is True:
            get_pvalue_from_fdr = self._get_pvalue_from_fdr_interp
        else:
            get_pvalue_from_fdr = self._get_pvalue_from_fdr

        pvalue = get_pvalue_from_fdr(fdr)
        ax.axhline(-np.log10(pvalue), linestyle='--', lw=2,
                   color='red', alpha=1, label="FDR %s " % fdr + " \\%")

        qvalues = self.df[self._colname_qvalue]
        qmin, qmax = qvalues.min(), qvalues.max()
        for i, this in enumerate(fdrs):
            # skip the lines that are outside the range of observed FDRs
            if this < qmin or this > qmax:
                continue
            pvalue = get_pvalue_from_fdr(this)
            # cycle over the styles so that more than three additional
            # lines can be requested
            ax.axhline(-np.log10(pvalue), linestyle=styles[i % len(styles)],
                       color='red', alpha=1,
                       label="FDR %s " % this + " \\%")

        pylab.ylim([0, pylab.ylim()[1] * 1.2])  # times 1.2 to put the legend

        ax.axvline(0, color='gray', alpha=0.8, lw=2)

        axl = pylab.legend(loc='best')
        axl.set_zorder(10)  # in case there is a circle behind the legend.

        # For the static version
        title_handler = pylab.title(
            "%s" % str(title).replace("_", " "),
            fontsize=self.settings.fontsize / 1.2)

        # This code allows the ipython user to click on the matplotlib
        # figure to get information about the drug and feature of a given
        # circle.
        def onpick(event):
            ind = event.ind[0]
            try:
                row = data.iloc[ind]
                title = str(row['Drug']) + " / " + str(row.Feature)
                title += "\nFDR=" + "%.4e" % row['FDR']
                title_handler.set_text(title.replace("_", " "))
            except Exception:
                print('Failed to create new title on click')
                print(data.iloc[ind].T)
            fig.canvas.draw()

        # keep track on the id for further memory release
        # For more info search for "matplotlib memory leak mpl_connect"
        self.cid = fig.canvas.mpl_connect('pick_event', onpick)

        # for the JS version
        # TODO: for the first 1 to 2000 entries ?
        self.data = data
        template = """
<table border="1" class="dataframe">
  <tbody>
    <tr>
      <th>Drug</th>
      <td>%(Drug)s</td>
    </tr>
    <tr>
      <th>Feature</th>
      <td>%(Feature)s</td>
    </tr>
    <tr>
      <th>FDR</th>
      <td>%(FDR)s</td>
    </tr>
  </tbody>
</table>"""
        labels = [template % row.to_dict()
                  for _, row in data[['Drug', 'Feature', 'FDR']].iterrows()]

        css = """
            svg.mpld3-figure { border: 2px black solid;margin:10px;}
            table{ font-size:0.8em; }
            th { color: #ffffff; background-color: #aaaaaa; }
            td { color: blue; background-color: #cccccc; }"""
        try:
            import mpld3
            tooltip = mpld3.plugins.PointHTMLTooltip(scatter,
                                                     labels=labels, css=css)
            mpld3.plugins.connect(fig, tooltip)
        except Exception:
            # best effort: the static figure is still available
            print("Issue with javascript version of the volcano plot. Skipped")

        self.scatter = scatter
        self.current_fig = fig
        # not sure is this is required. could be a memory leak here
        import gc
        gc.collect()

    def mpld3_to_html(self):
        """Return the current figure as an HTML/javascript snippet

        This requires a plotting method to be called beforehand so that
        :attr:`current_fig` exists.

        :return: a ``<div class="jsimage">`` element; it is empty if the
            conversion with mpld3 failed.
        """
        from gdsctools import gdsctools_data
        # This copy the full path and therefore HTML cannot
        # be moved in another directory. to be fixed.
        js_path1 = gdsctools_data('d3.v3.min.js', where='javascript')
        js_path2 = gdsctools_data('mpld3.v0.2.js', where='javascript')
        try:
            # mpld3 is great but there are a couple of issues
            # 1 - legend zorder is not used so dots may be below the legend,
            #     hence we set the framealpha
            # 2 - % character even though there well interpreted in
            #     matplotlib using \%, they are not once parsed by mpld3.
            #     So, here we remove the \ character
            axl = pylab.legend(loc='best', framealpha=0.8, borderpad=1)
            axl.set_zorder(10)  # in case there is a circle behind the legend.
            for entry in axl.get_texts():
                text = entry.get_text().replace("\\%", "%")
                text += " "
                entry.set_text(text)
            import mpld3
            htmljs = mpld3.fig_to_html(self.current_fig,
                                       d3_url=js_path1,
                                       mpld3_url=js_path2)
        except Exception:
            # best effort: fall back on an empty javascript section
            htmljs = ""
        return """<div class="jsimage"> """ + htmljs + "</div>"

    def savefig_and_js(self, filename, size_inches=(10, 10)):
        """Save the current figure as PNG and as an HTML/javascript file

        :param filename: base name of the output; ``.png`` is appended
            for the image (saved through :attr:`figtools`) and ``.html``
            for the javascript version, written in the settings directory.
        :param size_inches: size of the figure used for both outputs; the
            original size is restored afterwards.

        NOTE(review): the callers in this class pass names that already
        end with ``.png``, and ``.png`` is appended again here -- confirm
        that the resulting file names are the intended ones.
        """
        # Save the PNG first. The savefig automatically set the size
        # to a defined set and back to original figsize.
        self.figtools.savefig(filename + '.png', size_inches=size_inches)

        # now the javascript.
        fig = self.current_fig
        oldsize = fig.get_size_inches()
        fig.set_size_inches(size_inches)
        try:
            htmljs = self.mpld3_to_html()
            path = self.settings.directory + os.sep + filename + ".html"
            with open(path, "w") as fh:
                fh.write(htmljs)
        finally:
            # restore the size even if the export failed
            fig.set_size_inches(*oldsize)
class ANOVAReport(object): """Class used to interpret the results and create final HTML report Results is a data structure returned by :meth:`ANOVA.anova_all`. :: from gdsctools import * # Perform the analysis itself to get a set of results (dataframe) an = ANOVA(ic50_test) results = an.anova_all() # now, we can create the report. r = ANOVAReport(gdsc=an, results=results) # we can tune some settings r.settings.pvalue_threshold = 0.001 r.settings.FDR_threshold = 28 r.settings.directory = 'testing' r.create_html_pages() .. rubric:: Significant association An association is significant if - The field *ANOVA_FEATURE_FDR* must be < FDR_threshold - The field *ANOVA_FEATURE_pval* must be < pvalue_threshold It is then labelled **sensible** if *FEATURE_delta_MEAN_IC50* is below 0, otherwise it is **resistant**. """ def __init__(self, gdsc, results=None, sep="\t", drug_decode=None, verbose=True): """.. rubric:: Constructor :param gdsc: the instance with which you created the results to report :param results: the results returned by :meth:`ANOVA.anova_all`. If not provided, the ANOVA is run on the fly. """ self.verbose = verbose self.figtools = Savefig(verbose=False) self.gdsc = gdsc if results is None: results = gdsc.anova_all() self.df = ANOVAResults(results).df # this does a copy and sanity check # Make sure the DRUG are integers self.df.DRUG_ID = self.df.DRUG_ID.astype(int) self.settings = ANOVASettings() for k, v in gdsc.settings.items(): self.settings[k] = v self._colname_drug_id = 'DRUG_ID' self.varname_pval = 'ANOVA_FEATURE_pval' self.varname_qval = 'ANOVA_FEATURE_FDR' # maybe there was not drug_decode in the gdsc parameter, # so a user may have provide a file, in which case, we need # to update the content of the dur_decoder. if len(gdsc.drug_decode) == 0 and drug_decode is None: warnings.warn("No drug name or target will be populated." 
"You may want to provide a DRUG_DECODE file.") self.drug_decode = DrugDecode() elif drug_decode is not None: # Read a file self.drug_decode = DrugDecode(drug_decode) else: # Copy from gdsc instance self.drug_decode = DrugDecode(gdsc.drug_decode) self.df = self.drug_decode.drug_annotations(self.df) #if sum(self.df == np.inf).sum()>0: # print("WARNING: infinite values were found in your results... Set to zero") try: self.df = self.df.replace({np.inf:0, -np.inf:0}) except: pass # create some data self._set_sensible_df() self.company = None # just to create the directory # ReportMain(directory=self.settings.directory, verbose=self.verbose) def _get_ndrugs(self): return len(self.df[self._colname_drug_id].unique()) n_drugs = property(_get_ndrugs, doc="return number of drugs") def _get_ntests(self): return len(self.df.index) n_tests = property(_get_ntests) def _get_ncelllines(self): return len(self.gdsc.features.df.index) n_celllines = property(_get_ncelllines, doc="return number of cell lines") def _df_append(self, df, data): count = len(df) df.ix[count] = data return df def diagnostics(self): """Return summary of the analysis (dataframe)""" self._set_sensible_df() df = pd.DataFrame({'text': [], 'value': []}) n_features = len(self.gdsc.features.df.columns) n_features -= self.gdsc.features.shift n_drugs = len(self.df[self._colname_drug_id].unique()) N = float(n_drugs * n_features) if N == 0: ratio = 0 else: ratio = float(self.n_tests)/(N) * 100 try: ratio = easydev.precision(ratio, digit=2) except: # Fixme: this is a hack for the infinite values but should not # happen... 
ratio = 0 msg = "Type of analysis" df = self._df_append(df, [msg, self.settings.analysis_type]) msg = "Total number of possible drug/feature associations" df = self._df_append(df, [msg, int(N)]) msg = "Total number of ANOVA tests performed" df = self._df_append(df, [msg, self.n_tests]) msg = "Percentage of tests performed" df = self._df_append(df, [msg, ratio]) # trick to have an empty line df = self._df_append(df, ["", ""]) msg = "Total number of tested drugs" df = self._df_append(df, [msg, n_drugs]) msg = "Total number of genomic features used" df = self._df_append(df, [msg, n_features]) msg = "Total number of screened cell lines" df = self._df_append(df, [msg, self.n_celllines]) msg = "MicroSatellite instability included as factor" msi = self.settings.include_MSI_factor df = self._df_append(df, [msg, msi]) # trick to have an empty line df = self._df_append(df, ["", ""]) nsens = len(self.sensible_df) nres = len(self.resistant_df) msg = "Total number of significant associations" df = self._df_append(df, [msg, nsens+nres]) msg = " - sensitive" df = self._df_append(df, [msg, nsens]) msg = " - resistant" df = self._df_append(df, [msg, nres]) msg = "p-value significance threshold" df = self._df_append(df, [msg, self.settings.pvalue_threshold]) msg = "FDR significance threshold" df = self._df_append(df, [msg, self.settings.FDR_threshold]) p1, p2 = self._get_pval_range() msg = 'Range of significant p-values' value = "[{:.4}, {:.4}]".format(p1, p2) df = self._df_append(df, [msg, value]) f1, f2 = self._get_fdr_range() msg = "Range of significant % FDRs" value = '[{:.4} {:.4}]'.format(f1, f2) df = self._df_append(df, [msg, value]) return df def _get_pval_range(self): """Get pvalues range of the significant hits""" nsens = len(self.sensible_df) nres = len(self.resistant_df) N = nsens + nres if N == 0: return 0., 0. 
name = self.varname_pval data = self.df[name].ix[0:N-1] m, M = data.min(), data.max() return m, M def _get_fdr_range(self): """Get FDR range of the significant hits""" name = self.varname_qval data = self.df[name][(self.df[name] < self.settings.FDR_threshold)] if len(data) == 0: return 0., 0. m, M = data.min(), data.max() return m, M def _set_sensible_df(self): # just an alias logand = np.logical_and # select sensible data set mask1 = self.df['ANOVA_FEATURE_FDR'] < self.settings.FDR_threshold mask2 = self.df['ANOVA_FEATURE_pval'] < self.settings.pvalue_threshold mask3 = self.df['FEATURE_delta_MEAN_IC50'] < 0 self.sensible_df = self.df[logand(logand(mask1, mask2), mask3)] # select resistant data set mask3 = self.df['FEATURE_delta_MEAN_IC50'] >= 0 self.resistant_df = self.df[logand(logand(mask1, mask2), mask3)] def get_significant_set(self): """Return significant hits (resistant and sensible)""" # a property that is long to compute # and may change if FDR changes. self._set_sensible_df() df = pd.concat([self.sensible_df, self.resistant_df]) try: df.sort_values('ASSOC_ID', inplace=True) except: df.sort('ASSOC_ID', inplace=True) return df def _get_data(self, df_count_sensible, df_count_resistant): # we can drop all columns except one, which is renamed as count df1 = df_count_sensible['ASSOC_ID'] df1.name = 'sens assoc' df2 = df_count_resistant['ASSOC_ID'] df2.name = 'res assoc' # Now, we join the two TimeSeries (note that above, we selected only # one column so the dataframe was downcast to time series) df_count = pd.DataFrame(df1).join(pd.DataFrame(df2), how='outer') # and set NA to zero df_count.fillna(0, inplace=True) # let us add a new column with the total df_count['total'] = df_count['sens assoc'] + df_count['res assoc'] # we want to sort by 'total' column and is equality by the name, # which is the index. 
So let us add the index temporarily as # a column, sort, and remove 'name' column afterwards df_count['name'] = df_count.index try: df_count.sort_values(by=['total', 'name'], ascending=[False, True], inplace=True) except: df_count.sort(columns=['total', 'name'], ascending=[False, True], inplace=True) df_count.drop('name', axis=1, inplace=True) return df_count def get_drug_summary_data(self): """Return dataframe with drug summary""" # get sensible and resistant sub dataframes self._set_sensible_df() # group by drug colname = self._colname_drug_id df_count_sensible = self.sensible_df.groupby(colname).count() df_count_resistant = self.resistant_df.groupby(colname).count() df_count = self._get_data(df_count_sensible, df_count_resistant) return df_count def drug_summary(self, top=50, fontsize=15, filename=None): """Return dataframe with significant drugs and plot figure :param fontsize: :param top: max number of significant associations to show :param filename: if provided, save the file in the directory """ df_count = self.get_drug_summary_data() if len(df_count): self._plot(df_count, 'drug', top) fig = pylab.gcf() self.figtools.directory = self.settings.directory self.figtools.savefig(filename, size_inches=(12, 14), bbox_inches='tight') return df_count def get_feature_summary_data(self): """Return dataframe with feature summary""" # get sensible and resistant sub dataframes self._set_sensible_df() df_count_sensible = self.sensible_df.groupby('FEATURE').count() df_count_resistant = self.resistant_df.groupby('FEATURE').count() df_count = self._get_data(df_count_sensible, df_count_resistant) return df_count def feature_summary(self, filename=None, top=50, fontsize=15): """Return dataframe with significant features and plot figure :param fontsize: :param top: max number of significant associations to show :param filename: if provided, save the file in the directory """ df_count = self.get_feature_summary_data() if len(df_count) > 0: self._plot(df_count, 'feature', top) 
#fig = pylab.gcf() self.figtools.directory = self.settings.directory self.figtools.savefig(filename, set_inches=(12, 14), bbox_inches='tight') return df_count def _plot(self, df_count, title_tag, top): """Used by drug_summary and feature_summary to plot the bar plot""" if top > len(df_count): top = len(df_count) df = df_count.iloc[0:top][[u'sens assoc', u'res assoc']] labels = list(df.index) # add drug name if len(self.drug_decode) > 0: for i, label in enumerate(labels): if title_tag == 'drug': name = self.drug_decode.get_name(label) if name is not None: labels[i] = "{}-{}".format(labels[i], name) else: pass labels = [str(x).replace('_', ' ') for x in labels] # restrict size to first 30 characters labels = [x[0:30] for x in labels] ind = range(0, len(labels)) # reverse does not exist with python3 try: ind.reverse() except: ind = list(ind) ind.reverse() data1 = df['sens assoc'].values data2 = df['res assoc'].values pylab.figure(1) pylab.clf() p1 = pylab.barh(ind, data1, height=0.8, color='purple', label='sensitivity') p2 = pylab.barh(ind, data2, height=0.8, color='orange', left=data1, label='resistance') ax = pylab.gca() self.labels = labels ax.set_yticks([x + 0.5 for x in ind]) ax.set_yticklabels(labels, fontsize=12) xticks = ax.get_xticks() ax.set_xticklabels( [int(x) if divmod(x,1)[1] == 0 else "" for x in xticks]) pylab.grid() pylab.title(r"Top %s %s(s) most frequently " % (top, title_tag) + \ "\nassociated with drug response", fontsize=self.settings.fontsize/1.2) pylab.xlabel(r'Number of significant associations (FDR %s %s %s) ' % ("$>$", self.settings.FDR_threshold, "$\%$"), fontsize=18) M = max(data1+data2) #ax.set_xticks() #ax.set_xticklabels(labels, fontsize=fontsize) ax.set_xlim([0, M+1]) pylab.legend(loc='lower right') try:pylab.tight_layout() except:pass def get_significant_hits(self, show=True): """Return a summary of significant hits :param show: show a plot with the distribution of significant hits .. 
todo:: to finalise """ fdrs = range(5, 50+1, 5) significants = [] significant_meaningful = [] strong_hits = [] full_strong_hits = [] MC1 = 1 MC2 = 2 mask2 = self.df['FEATURE_pos_logIC50_MEAN'] < MC1 mask3 = self.df['FEATURE_pos_logIC50_MEAN'] < MC2 mask4 = self.df['FEATURE_neg_logIC50_MEAN'] < MC1 mask5 = self.df['FEATURE_neg_logIC50_MEAN'] < MC2 maskMC = mask2 + mask3 + mask4 + mask5 for fdr in fdrs: # significant hits res = self.df['ANOVA_FEATURE_FDR'] < fdr significants.append(res.sum()) # meaningful hits indices = np.logical_and(self.df['ANOVA_FEATURE_FDR'] < fdr, maskMC) significant_meaningful.append(indices.sum()) # meaningful strong hits mask1 = self.df.ix[indices]['FEATURE_pos_Glass_delta'] >= 1 mask2 = self.df.ix[indices]['FEATURE_neg_Glass_delta'] >= 1 strong_hits.append(np.logical_or(mask1, mask2).sum()) # meaningful full strong hits mask1 = self.df.ix[indices]['FEATURE_pos_Glass_delta'] >= 1 mask2 = self.df.ix[indices]['FEATURE_neg_Glass_delta'] >= 1 full_strong_hits.append(np.logical_and(mask1, mask2).sum()) data = {'significants': significants, 'full_strong_hits': full_strong_hits, 'strong_hits': strong_hits, 'significant_meaningful': significant_meaningful} df = pd.DataFrame(data, columns = ['significants', 'significant_meaningful', 'strong_hits', 'full_strong_hits'], index=fdrs) df.columns = ['1) significant', '2) 1 + meaningful', '3) 2 + strong', '4) 2+ very strong'] if show is True: pylab.clf() ax = pylab.gca() df.plot(kind='bar', width=.8, color=['r', 'gray', 'orange', 'black'], rot=0, ax=ax) pylab.grid() # original is 'aquamarine4','cyan2','cornflowerblue ','aquamarine'), return df def __str__(self): self.df.info() return "" def create_html_associations(self): """Create an HTML page for each significant association The name of the output HTML file is **<association id>.html** where association id is stored in :attr:`df`. 
""" if self.verbose: print("Creating individual HTML pages for each significant association") df = self.get_significant_set() drugs = df['DRUG_ID'].values features = df['FEATURE'].values assocs = df['ASSOC_ID'].values fdrs = df['ANOVA_FEATURE_FDR'].values N = len(df) pb = Progress(N) html = Association(self, drug='dummy', feature='dummy', fdr='dummy', company=self.company) for i in range(N): html.drug = drugs[i] html.feature = features[i] if str(assocs[i]).startswith("a"): html._filename = str(assocs[i]) + '.html' else: html._filename = "a" + str(assocs[i]) + '.html' html.fdr = fdrs[i] html.assoc_id = assocs[i] #html._init_report() # since we have one shared instance html.create_report(onweb=False) if self.settings.animate: pb.animate(i+1) if self.settings.animate: print("\n") def create_html_features(self): """Create an HTML page for each significant feature""" df = self.get_significant_set() groups = df.groupby('FEATURE') if self.verbose: print("Creating individual HTML pages for each feature") N = len(groups.indices.keys()) pb = Progress(N) for i, feature in enumerate(groups.indices.keys()): # get the indices and therefore subgroup subdf = groups.get_group(feature) html = HTMLOneFeature(self, self.df, subdf, feature) html.create_report(onweb=False) if self.settings.animate: pb.animate(i+1) if self.settings.animate: print("\n") def create_html_drugs(self): """Create an HTML page for each drug""" # group by drugs all_drugs = list(self.df['DRUG_ID'].unique()) df = self.get_significant_set() groups = df.groupby('DRUG_ID') if self.verbose: print("Creating individual HTML pages for each drug") N = len(groups.indices.keys()) N = len(all_drugs) pb = Progress(N) for i, drug in enumerate(all_drugs): # enumerate(groups.indices.keys()): # get the indices and therefore subgroup if drug in groups.groups.keys(): subdf = groups.get_group(drug) else: subdf = {} html = HTMLOneDrug(self, self.df, subdf, drug) html.create_report(onweb=False) if self.settings.animate: 
pb.animate(i+1) if self.settings.animate: print("\n") def create_html_main(self, onweb=False): """Create HTML main document (summary)""" self._set_sensible_df() if self.verbose: print("Creating main HTML page in directory %s" % (self.settings.directory)) ReportMain(directory=self.settings.directory, verbose=self.verbose) buffer_ = self.settings.savefig self.settings.savefig = True html = HTMLPageMain(self, 'index.html') html._init_report() # created the directory html.create_report(onweb=onweb) self.settings.savefig = buffer_ def create_html_manova(self, onweb=True): """Create summary table with all significant hits :param onweb: open browser with the created HTML page. """ df = self.get_significant_set() page = HTMLPageMANOVA(self, df, self.company) page.create_report(onweb) def create_html_pages(self, onweb=True): """Create all HTML pages""" self.create_html_main(onweb=onweb) self.create_html_manova(onweb=False) self.create_html_drugs() self.create_html_features() self.create_html_associations() def onweb(self): from easydev import onweb onweb(self.settings.directory + os.sep + 'index.html')