def compute_average(self, model_number=None):
    """Compute the average and update the cnograph accordingly

    :param int model_number: model_number as shown by :attr:`df.index`
        if not provided, the average is taken. The special value ``'cv'``
        selects the cross-validation model.
    """
    # Select the model whose scores annotate the graph edges.
    if model_number is None:
        model = self.get_average_model()
    elif model_number == 'cv':
        model = self.get_cv_model()
    else:
        # .loc replaces the deprecated pandas .ix accessor (label-based)
        model = self.df.loc[model_number]

    # Largest score, used to normalise the pen width. Loop-invariant, so
    # compute it once instead of once per edge as before.
    M = float(model.max())

    # This is to set the average and label and penwidth
    # TODO: could be simplified using Reaction ?
    for edge in self.cnograph.edges(data=True):
        link = edge[2]['link']
        if and_symbol not in edge[0] and and_symbol not in edge[1]:
            # plain edges are stored as "A=B", inhibitions as "!A=B"
            if link == "-":
                name = "!" + edge[0] + "=" + edge[1]
            else:
                name = edge[0] + "=" + edge[1]
            value = model[name]
        elif and_symbol in edge[0]:
            value = model[edge[0]]
        elif and_symbol in edge[1]:
            value = model[edge[1]]
        else:
            # defensive: should be unreachable given the branches above
            raise ValueError("Edge (%s, %s) could not be interpreted" %
                             (edge[0], edge[1]))
        self.cnograph.edge[edge[0]][edge[1]]["label"] = precision(value)
        self.cnograph.edge[edge[0]][edge[1]]["average"] = precision(value)
        # pen width scaled assuming values lie between 0 and 1
        self.cnograph.edge[edge[0]][edge[1]]["penwidth"] = precision(value, 2) * 5 / M
def __init__(self, drugid, caller):
    """Create the HTML report object for a single drug.

    :param drugid: drug identifier (cast to int)
    :param caller: analysis object providing ``output_dir``, ``prefix``
        and ``method`` attributes
    """
    self.drug = int(drugid)
    self.caller = caller
    filename = "drug_{0}.html".format(self.drug)
    super(HTMLOneDrug, self).__init__(
        directory=caller.output_dir,
        filename=filename,
        template_filename='regression.html',
        init_report=False)
    self.title = 'Single Drug analysis (%s)' % self.drug
    self.params = {"drugid": self.drug}

    self.filename_template = self.caller.prefix + "%(name)s_" + "%s." % self.drug
    results_filename = self.filename_template % {"name": "results"} + "json"
    with open(results_filename, "r") as fh:
        data = json.loads(fh.read())

    # Round selected metrics for display; keep the raw value when rounding
    # fails (e.g. missing key or None). `except Exception` replaces the
    # bare excepts so KeyboardInterrupt/SystemExit still propagate.
    try:
        data["bayes"] = easydev.precision(data['bayes'], 3)
    except Exception:
        pass
    try:
        data["alpha"] = easydev.precision(data['alpha'], 3)
    except Exception:
        pass

    self.params.update(data)
    self.params['method'] = self.caller.method
    self.jinja['sections'] = []
def __init__(self, drugid, caller):
    """Create the HTML report object for a single drug.

    :param drugid: drug identifier (cast to int)
    :param caller: analysis object providing ``output_dir``,
        ``prefix_data`` and ``method`` attributes
    """
    self.drug = int(drugid)
    self.caller = caller
    filename = "drug_{0}.html".format(self.drug)
    super(HTMLOneDrug, self).__init__(directory=caller.output_dir,
        filename=filename, template_filename='regression.html',
        init_report=False)
    self.title = 'Single Drug analysis (%s)' % self.drug
    self.params = {"drugid": self.drug}

    filename_template = self.caller.prefix_data + "%(name)s_" + "%s." % self.drug
    results_filename = filename_template % {"name": "results"} + "json"
    with open(results_filename, "r") as fh:
        data = json.loads(fh.read())

    # Round selected metrics for display; the copy-pasted try blocks are
    # folded into one loop. Keep the raw value when rounding fails
    # (missing key, None, ...); `except Exception` replaces the bare
    # excepts so KeyboardInterrupt/SystemExit still propagate.
    for key, digits in (("bayes", 3), ("alpha", 5), ("Rp", 4)):
        try:
            data[key] = easydev.precision(data[key], digits)
        except Exception:
            pass

    self.params.update(data)
    self.params['method'] = self.caller.method
    self.jinja['sections'] = []
    self.jinja['goback'] = True
def plot(self, filename, vmin=None, vmax=None, cmap='jet_r'):
    """Plot ``-log10`` of :attr:`results` as an image and save it to *filename*.

    :param filename: output image path passed to ``pylab.savefig``
    :param vmin: lower bound of the color scale (optional)
    :param vmax: upper bound of the color scale (optional)
    :param cmap: matplotlib colormap name

    Rows below :attr:`_start_y` are skipped. X ticks are labelled in
    seconds (scaled by :attr:`duration`), Y ticks in Hz (scaled by
    :attr:`sampling`).
    """
    pylab.clf()
    pylab.imshow(-np.log10(self.results[self._start_y:,:]), origin="lower",
        aspect="auto", cmap=cmap, vmin=vmin, vmax=vmax)
    pylab.colorbar()

    # Fix xticks: 5 evenly spaced positions, relabelled as seconds
    XMAX = float(self.results.shape[1])  # The max integer on xaxis
    xpos = list(range(0, int(XMAX), int(XMAX/5)))
    xx = [precision(this) for this in np.array(xpos) / XMAX * self.duration]
    pylab.xticks(xpos, xx, fontsize=16)

    # Fix yticks: 5 evenly spaced positions, relabelled as Hz
    YMAX = float(self.results.shape[0])  # The max integer on xaxis
    ypos = list(range(0, int(YMAX), int(YMAX/5)))
    yy = [int(this) for this in np.array(ypos) / YMAX * self.sampling]
    pylab.yticks(ypos, yy, fontsize=16)

    #pylab.yticks([1000,2000,3000,4000], [5500,11000,16500,22000], fontsize=16)
    #pylab.title("%s echoes" % filename.replace(".png", ""), fontsize=25)
    pylab.xlabel("Time (seconds)", fontsize=25)
    pylab.ylabel("Frequence (Hz)", fontsize=25)
    pylab.tight_layout()
    pylab.savefig(filename)
def _get_html_stats(self):
    """Build an HTML fragment summarising BWA mapping statistics.

    Reads ``bwa_mem_stats.json`` from :attr:`directory` and reports the
    Phix contamination percentage plus unpaired/duplicated counts.
    """
    from sequana.tools import StatsBAM2Mapped
    from easydev import precision
    data = StatsBAM2Mapped(self.directory + "bwa_mem_stats.json").data
    html = "Reads with Phix: %s %%<br>" % precision(data['contamination'], 3)

    # add HTML table (R2 columns only when paired-end data is present)
    if "R2_mapped" in data.keys():
        df = pd.DataFrame({
            'R1': [data['R1_mapped'], data['R1_unmapped']],
            'R2': [data['R2_mapped'], data['R2_unmapped']]})
    else:
        df = pd.DataFrame({
            'R1': [data['R1_mapped'], data['R1_unmapped']]})
    df.index = ['mapped', 'unmapped']

    datatable = DataTable(df, "bwa_bam")
    datatable.datatable.datatable_options = {
        'scrollX': '300px',
        'pageLength': 15,
        'scrollCollapse': 'true',
        'dom': 'irtpB',
        "paging": "false",
        'buttons': ['copy', 'csv']}
    js = datatable.create_javascript_function()
    html_tab = datatable.create_datatable(float_format='%.3g')
    # NOTE(review): html_tab and js are computed but never appended, so the
    # table never reaches the output — confirm whether the commented line
    # below should be restored.
    #html += "{} {}".format(html_tab, js)
    html += "Unpaired: %s <br>" % data['unpaired']
    html += "duplicated: %s <br>" % data['duplicated']
    return html
def _get_html_summary_section(self):
    """Build the HTML line reporting the Phix contamination percentage."""
    from easydev import precision
    summary = self._get_summary()
    contamination = precision(summary['contamination'], 3)
    #html += "Unpaired: %s <br>" % data['unpaired']
    #html += "duplicated: %s <br>" % data['duplicated']
    return "Percentage of reads found with Phix: %s %%<br>" % contamination
def prec(x):
    """Round *x* to the configured display precision.

    Returns *x* unchanged when rounding is not possible (e.g. nan/inf).
    """
    try:
        # this may fail if for instance x is nan or inf
        x = easydev.precision(x, self.pd_options['precision'])
        return x
    except Exception:
        # narrowed from a bare except so that KeyboardInterrupt and
        # SystemExit are no longer swallowed
        return x
def _get_html_summary_section(self):
    """Return an HTML snippet with the percentage of Phix-contaminated reads."""
    from easydev import precision
    data = self._get_summary()
    pct = precision(data['contamination'], 3)
    html = "Percentage of reads found with Phix: %s %%<br>" % pct
    #html += "Unpaired: %s <br>" % data['unpaired']
    #html += "duplicated: %s <br>" % data['duplicated']
    return html
def multi_mapping(self, fr="ID", to="KEGG_ID", query="P13368",
                  frmt="tab", Nmax=100):
    """Calls mapping several times and concatenates results

    .. deprecated: 1.3.1 you can now use :meth:`mapping` even for long
        queries since we are now using a POST request, which allows
        arbitrary length of entries.
    """
    self.logging.warning(
        "deprecated in version 1.3.1. Use mapping instead")
    if isinstance(query, list) is False:
        query = [query]
    # drop duplicated entries before querying
    unique_entry_names = list(set(query))

    if len(unique_entry_names) > Nmax:
        unique_entry_names = list(unique_entry_names)
        self.logging.info(
            "There are more than %s unique species. Using multi stage uniprot mapping" % Nmax)
        mapping = {}
        # we need to split
        # this is a hack rigt now but could be put inside bioservices
        N, rest = divmod(len(unique_entry_names), Nmax)
        if rest > 0:
            N += 1
        for i in range(0, N):
            # window [i1, i2) of at most Nmax entries, clipped at the end
            i1 = i * Nmax
            i2 = (i + 1) * Nmax
            if i2 > len(unique_entry_names):
                i2 = len(unique_entry_names)
            query = ",".join(unique_entry_names[i1:i2])
            this_mapping = self.mapping(fr=fr, to=to, query=query)
            # merge this chunk's results into the accumulated mapping
            for k, v in this_mapping.items():
                mapping[k] = v
            from easydev import precision
            self.logging.info(
                str(precision((i + 1.) / N * 100., 2)) + "%% completed")
    else:
        # small enough: a single mapping call suffices
        #query=",".join([x+"_" + species for x in unique_entry_names])
        query = ",".join(unique_entry_names)
        mapping = self.mapping(fr=fr, to=to, query=query)
    return mapping
def to_html(self):
    """Render the mapping statistics stored in :attr:`data` as HTML."""
    stats = self.data
    # add HTML table (R2 column only for paired-end data)
    columns = {'R1': [stats['R1_mapped'], stats['R1_unmapped']]}
    if "R2_mapped" in stats.keys():
        columns['R2'] = [stats['R2_mapped'], stats['R2_unmapped']]
    df = pd.DataFrame(columns)
    df.index = ['mapped', 'unmapped']
    parts = [
        "Reads with Phix: %s %%<br>" % precision(stats['contamination'], 3),
        "Unpaired: %s <br>" % stats['unpaired'],
        "duplicated: %s <br>" % stats['duplicated'],
    ]
    return "".join(parts)
def _get_df_with_taxon(self, dbname):
    """Return a dataframe of taxon counts and percentages.

    :param dbname: database directory; if it contains ``annotations.csv``,
        the dataframe is enriched with ena/gi/description columns.
    :return: dataframe sorted by decreasing percentage
    """
    # line 14500
    # >gi|331784|gb|K01711.1|MEANPCG[331784] Measles virus (strain Edmonston), complete genome
    df = self.get_taxonomy_biokit([int(x) for x in self.taxons.index])
    df['count'] = self.taxons.values
    df.reset_index(inplace=True)

    # Append a row for reads that could not be classified (taxon -1).
    # .loc replaces the deprecated/removed pandas .ix accessor.
    newrow = len(df)
    df.loc[newrow] = "Unclassified"
    df.loc[newrow, 'count'] = self.unclassified
    df.loc[newrow, 'index'] = -1
    df.rename(columns={"index": "taxon"}, inplace=True)
    df["percentage"] = df["count"] / df["count"].sum() * 100

    # Now get back all annotations from the database itself.
    filename = dbname + os.sep + "annotations.csv"
    if os.path.exists(filename):
        annotations = pd.read_csv(filename)
        annotations.set_index("taxon", inplace=True)

        # reindex keeps the old .ix semantics: taxons absent from the
        # annotations (e.g. -1) yield NaN rows instead of a KeyError
        df2 = annotations.reindex(df.taxon)[['ena', 'gi', 'description']]
        # There are duplicates somehow. let us keep the first one for now
        df2 = df2.reset_index().drop_duplicates(
            subset="taxon", keep="first").set_index("taxon")
        self.df2 = df2
        self.df1 = df.set_index("taxon")
        df = pd.merge(self.df1, df2, left_index=True, right_index=True)
        df.reset_index(inplace=True)
        # put key columns first and the free-text description last
        starter = ['percentage', 'name', 'ena', 'taxon', "gi", 'count']
        df = df[starter + [
            x for x in df.columns
            if x not in starter and x != "description"
        ] + ["description"]]
        df['gi'] = [int(x) for x in df['gi'].fillna(-1)]
        from easydev import precision
        df['percentage'] = [str(precision(x, 2)) for x in df['percentage']]
    else:
        starter = ['taxon', 'count', 'percentage']
        df = df[starter + [x for x in df.columns if x not in starter]]

    df.sort_values(by="percentage", inplace=True, ascending=False)
    return df
def to_html(self):
    """Render the mapping statistics stored in :attr:`data` as HTML."""
    # TODO hardcoded word phix here ?
    stats = self.data
    html = "Reads with Phix: %s %%<br>" % precision(stats['contamination'], 3)

    # add HTML table; include R2 only when the run was paired-end
    frame = {'R1': [stats['R1_mapped'], stats['R1_unmapped']]}
    if "R2_mapped" in stats.keys():
        frame['R2'] = [stats['R2_mapped'], stats['R2_unmapped']]
    df = pd.DataFrame(frame)
    df.index = ['mapped', 'unmapped']

    html += "Unpaired: %s <br>" % stats['unpaired']
    html += "duplicated: %s <br>" % stats['duplicated']
    return html
def _get_df_with_taxon(self, dbname):
    """Return a dataframe of taxon counts and percentages.

    :param dbname: database directory; if it contains ``annotations.csv``,
        the dataframe is enriched with ena/gi/description columns.
    :return: dataframe sorted by decreasing percentage
    """
    # line 14500
    # >gi|331784|gb|K01711.1|MEANPCG[331784] Measles virus (strain Edmonston), complete genome
    df = self.get_taxonomy_biokit([int(x) for x in self.taxons.index])
    df['count'] = self.taxons.values
    df.reset_index(inplace=True)

    # Append a row for unclassified reads (taxon -1).
    # .loc replaces the deprecated/removed pandas .ix accessor.
    newrow = len(df)
    df.loc[newrow] = "Unclassified"
    df.loc[newrow, 'count'] = self.unclassified
    df.loc[newrow, 'index'] = -1
    df.rename(columns={"index": "taxon"}, inplace=True)
    df["percentage"] = df["count"] / df["count"].sum() * 100

    # Now get back all annotations from the database itself.
    filename = dbname + os.sep + "annotations.csv"
    if os.path.exists(filename):
        annotations = pd.read_csv(filename)
        annotations.set_index("taxon", inplace=True)

        # reindex keeps the old .ix semantics: taxons absent from the
        # annotations (e.g. -1) yield NaN rows instead of a KeyError
        df2 = annotations.reindex(df.taxon)[['ena', 'gi', 'description']]
        # There are duplicates somehow. let us keep the first one for now
        df2 = df2.reset_index().drop_duplicates(subset="taxon",
            keep="first").set_index("taxon")
        self.df2 = df2
        self.df1 = df.set_index("taxon")
        df = pd.merge(self.df1, df2, left_index=True, right_index=True)
        df.reset_index(inplace=True)
        # put key columns first and the free-text description last
        starter = ['percentage', 'name', 'ena', 'taxon', "gi", 'count']
        df = df[starter + [x for x in df.columns
            if x not in starter and x != "description"] + ["description"]]
        df['gi'] = [int(x) for x in df['gi'].fillna(-1)]
        from easydev import precision
        df['percentage'] = [str(precision(x, 2)) for x in df['percentage']]
    else:
        starter = ['taxon', 'count', 'percentage']
        df = df[starter + [x for x in df.columns if x not in starter]]

    df.sort_values(by="percentage", inplace=True, ascending=False)
    return df
def multi_mapping(self, fr="ID", to="KEGG_ID", query="P13368",
        frmt="tab", Nmax=100):
    """Calls mapping several times and concatenates results

    .. deprecated: 1.3.1 you can now use :meth:`mapping` even for long
        queries since we are now using a POST request, which allows
        arbitrary length of entries.
    """
    self.logging.warning("deprecated in version 1.3.1. Use mapping instead")
    if isinstance(query, list) is False:
        query = [query]
    # drop duplicated entries before querying
    unique_entry_names = list(set(query))

    if len(unique_entry_names) > Nmax:
        unique_entry_names = list(unique_entry_names)
        self.logging.info("There are more than %s unique species. Using multi stage uniprot mapping" % Nmax)
        mapping = {}
        # we need to split
        # this is a hack rigt now but could be put inside bioservices
        N, rest = divmod(len(unique_entry_names), Nmax)
        if rest>0:
            N+=1
        for i in range(0,N):
            # window [i1, i2) of at most Nmax entries, clipped at the end
            i1 = i*Nmax
            i2 = (i+1)*Nmax
            if i2>len(unique_entry_names):
                i2 = len(unique_entry_names)
            query=",".join(unique_entry_names[i1:i2])
            this_mapping = self.mapping(fr=fr, to=to, query=query)
            # merge this chunk's results into the accumulated mapping
            for k,v in this_mapping.items():
                mapping[k] = v
            from easydev import precision
            self.logging.info(str(precision((i+1.)/N*100., 2)) + "%% completed")
    else:
        # small enough: a single mapping call suffices
        #query=",".join([x+"_" + species for x in unique_entry_names])
        query=",".join(unique_entry_names)
        mapping = self.mapping(fr=fr, to=to, query=query)
    return mapping
def _volcano_plot(self, data, title=''):
    """Main volcano plot function called by other methods such as
    volcano_plot_all

    :param data: dataframe with at least the 'color', 'pvalue',
        'signed_effect', 'markersize', 'Drug', 'Feature' and 'FDR' columns
    :param title: figure title; underscores are rendered as spaces
    """
    # This functio is a bit complicated because it does create a few tricky
    # plots
    # It creates a volcano plot, which is the easy part
    # Then, it creates tooltips for the user interface in an IPython
    # shell using a callback to 'onpick' function coded here below
    # !! There seem to bes a memory leak in this function due to matplotlib
    # This is not easy to track down and should have no impact now that
    # ANOVAReport using JS instead of matplotlib

    # replace infinite values so log10 and axis limits stay finite
    data = data.replace(np.inf, 0)
    data = data.replace(-np.inf, 0)

    colors = list(data['color'].values)
    pvalues = data['pvalue'].values
    signed_effects = data['signed_effect'].values
    markersize = data['markersize'].values

    Y = -np.log10(list(pvalues))  # should be cast to list ?

    num = 1
    #pylab.close(num)
    fig = pylab.figure(num=1)
    fig.clf()
    ax = fig.add_subplot(111)
    # NOTE(review): set_axis_bgcolor was removed in matplotlib 2.0 in
    # favour of set_facecolor — confirm the supported matplotlib version.
    ax.set_axis_bgcolor('#EEEEEE')
    ax.cla()
    # TODO signed effects may be inf why ?
    # round for display/tooltips
    X = [easydev.precision(x, digit=2) for x in signed_effects]
    Y = [easydev.precision(y, digit=2) for y in Y]

    # Using scatter() is slow as compared to plot()
    # However, plot cannot take different sizes/colors
    scatter = ax.scatter(X, Y, s=markersize, alpha=0.3, c=colors,
        linewidth=1, picker=True)
    scatter.set_zorder(11)

    m = abs(signed_effects.min())
    M = abs(signed_effects.max())
    pylab.xlabel("Signed effect size", fontsize=self.settings.fontsize)
    pylab.ylabel('-log10(pvalues)', fontsize=self.settings.fontsize)
    # symmetric x-range around zero with a 10% margin
    l = max([m, M]) * 1.1
    pylab.xlim([-l, l])
    ax.grid(color='white', linestyle='solid')

    # some aliases
    fdr = self.settings.FDR_threshold
    # clamp the threshold to the smallest observed q-value
    if fdr < self.df[self._colname_qvalue].min():
        fdr = self.df[self._colname_qvalue].min()
    fdrs = sorted(self.settings.volcano_additional_FDR_lines)
    fdrs = fdrs[::-1]  # reverse sorting

    styles = ['--', ':', '-.']
    if self.settings.volcano_FDR_interpolation is True:
        get_pvalue_from_fdr = self._get_pvalue_from_fdr_interp
    else:
        get_pvalue_from_fdr = self._get_pvalue_from_fdr

    # horizontal line at the p-value matching the FDR threshold
    pvalue = get_pvalue_from_fdr(fdr)
    ax.axhline(-np.log10(pvalue), linestyle='--', lw=2,
        color='red', alpha=1, label="FDR %s " % fdr + " %")

    # extra FDR lines, only when they fall inside the observed q-value range
    for i, this in enumerate(fdrs):
        if this < self.df[self._colname_qvalue].min() or\
            this > self.df[self._colname_qvalue].max():
                continue
        pvalue = get_pvalue_from_fdr(this)
        ax.axhline(-np.log10(pvalue), linestyle=styles[i],
            color='red', alpha=1, label="FDR %s " % this + " %")

    pylab.ylim([0, pylab.ylim()[1]*1.2])  # times 1.2 to put the legend

    ax.axvline(0, color='gray', alpha=0.8, lw=2)
    axl = pylab.legend(loc='best')
    axl.set_zorder(10)  # in case there is a circle behind the legend.

    #self.ax = ax
    #self.axx = ax.twinx()
    #self.common_ticks = ax.get_yticks()
    #self.common_ylim = ax.get_ylim()
    #pvals = self.df[self._colname_pvalue]
    #y1 = pvals.min()
    #y2 = pvals.max()
    #fdr1 = self._get_fdr_from_pvalue_interp(y1)
    #fdr2 = self._get_fdr_from_pvalue_interp(y2-2e-15) # make sure it exists
    #self.axx.set_ylim([fdr2, fdr1])
    #self.axx.set_ylabel('FDR \%', fontsize=self.settings.fontsize)

    # For the static version
    title_handler = pylab.title("%s" % str(title).replace("_", " "),
        fontsize=self.settings.fontsize/1.2)
    labels = []

    # This code allows the ipython user to click on the matplotlib figure
    # to get information about the drug and feature of a given circles.
    def onpick(event):
        # event.ind holds the indices of the picked points; use the first
        ind = event.ind[0]
        try:
            title = str(str(data.ix[ind]['Drug'])) + " / " + str(data.ix[ind].Feature)
            title += "\nFDR=" + "%.4e" % data.ix[ind]['FDR']
            title_handler.set_text(title.replace("_", " "))
        except:
            print('Failed to create new title on click')
            print(data.ix[ind].T)
        fig.canvas.draw()

    # keep track on the id for further memory release
    # For more info search for "matplotlib memory leak mpl_connect"
    self.cid = fig.canvas.mpl_connect('pick_event', onpick)

    # for the JS version
    # TODO: for the first 1 to 2000 entries ?
    labels = []
    self.data = data
    for i, row in data[['Drug', 'Feature', 'FDR']].iterrows():
        template = """
<table border="1" class="dataframe">
  <tbody>
    <tr>
      <th>Drug</th>
      <td>%(Drug)s</td>
    </tr>
    <tr>
      <th>Feature</th>
      <td>%(Feature)s</td>
    </tr>
    <tr>
      <th>FDR</th>
      <td>%(FDR)s</td>
    </tr>
  </tbody>
</table>""" % row.to_dict()
        labels.append(template)
        # this is more elegant but slower
        #label = row.to_frame()
        #label.columns = ['Row {0}'.format(i)]
        #labels.append(str(label.to_html(header=False)))

    self.scatter = scatter
    self.current_fig = fig

    # not sure is this is required. could be a memory leak here
    import gc
    gc.collect()
def _add_patches(self, df, method, fill, ax, diagonal=True):
    """Draw one glyph per cell of the square matrix *df* on *ax*.

    :param df: square dataframe of values expected in [-1, 1]
    :param method: one of 'ellipse', 'square', 'rectangle', 'color',
        'circle', 'number'/'text', 'pie'
    :param fill: 'lower'/'upper' to draw a single triangle; anything else
        draws the full matrix
    :param ax: matplotlib axes to draw on
    :param diagonal: if False, the diagonal cells are skipped
    """
    width, height = df.shape
    labels = (df.columns)

    patches = []
    colors = []
    for x in range(width):
        for y in range(height):
            if fill == 'lower' and x > y:
                continue
            elif fill == 'upper' and x < y:
                continue
            if diagonal is False and x == y:
                continue

            # map the value from [-1, 1] to [0, 1] for the colormap;
            # .iloc replaces the deprecated pandas .ix (positional access)
            datum = (df.iloc[x, y] + 1.) / 2.
            d = df.iloc[x, y]
            d_abs = np.abs(d)
            #c = self.pvalues[x, y]
            rotate = -45 if d > 0 else +45
            #cmap = self.poscm if d >= 0 else self.negcm
            if method in ['ellipse', 'square', 'rectangle', 'color']:
                if method == 'ellipse':
                    func = Ellipse
                    patch = func((x, y), width=1 * self.shrink,
                                 height=(self.shrink - d_abs * self.shrink),
                                 angle=rotate)
                else:
                    func = Rectangle
                    w = h = d_abs * self.shrink
                    #FIXME shrink must be <=1
                    offset = (1 - w) / 2.
                    if method == 'color':
                        w = 1
                        h = 1
                        offset = 0
                    patch = func((x + offset - .5, y + offset - .5),
                                 width=w, height=h, angle=0)
                if self.edgecolor:
                    patch.set_edgecolor(self.edgecolor)
                #patch.set_facecolor(cmap(d_abs))
                colors.append(datum)
                if d_abs > 0.05:
                    patch.set_linestyle('dotted')
                #ax.add_artist(patch)
                patches.append(patch)
                #FIXME edgecolor is always printed
            elif method == 'circle':
                patch = Circle((x, y), radius=d_abs * self.shrink / 2.)
                if self.edgecolor:
                    patch.set_edgecolor(self.edgecolor)
                #patch.set_facecolor(cmap(d_abs))
                colors.append(datum)
                if d_abs > 0.05:
                    patch.set_linestyle('dotted')
                #ax.add_artist(patch)
                patches.append(patch)
            elif method in ['number', 'text']:
                from easydev import precision
                # BUGFIX: the original `if d<0 ... elif d>0` left edgecolor
                # undefined when d == 0, raising NameError; treat zero as
                # positive, matching the sibling implementation.
                if d < 0:
                    edgecolor = 'red'
                else:
                    edgecolor = 'blue'
                ax.text(x, y, precision(d, 2), color=edgecolor,
                        fontsize=self.fontsize,
                        horizontalalignment='center', weight='bold',
                        alpha=d_abs, withdash=False)
            elif method == 'pie':
                # two wedges: a filled arc proportional to |d| and its rest
                S = 360 * d_abs
                patch = [
                    Wedge((x, y), 1 * self.shrink / 2., -90, S - 90),
                    Wedge((x, y), 1 * self.shrink / 2., S - 90, 360 - 90),
                ]
                #patch[0].set_facecolor(cmap(d_abs))
                #patch[1].set_facecolor('white')
                colors.append(datum)
                colors.append(0.5)
                if self.edgecolor:
                    patch[0].set_edgecolor(self.edgecolor)
                    patch[1].set_edgecolor(self.edgecolor)
                #ax.add_artist(patch[0])
                #ax.add_artist(patch[1])
                patches.append(patch[0])
                patches.append(patch[1])

    if len(patches):
        col1 = PatchCollection(patches, array=np.array(colors), cmap=self.cm)
        ax.add_collection(col1)
        self.collection = col1
def diagnostics(self):
    """Return summary of the analysis (dataframe)"""
    # populates self.sensible_df / self.resistant_df used below
    self._set_sensible_df()

    df = pd.DataFrame({'text': [], 'value': []})

    n_features = len(self.gdsc.features.df.columns)
    # the first `shift` columns are metadata, not genomic features
    n_features -= self.gdsc.features.shift
    n_drugs = len(self.df[self._colname_drug_id].unique())

    N = float(n_drugs * n_features)
    if N == 0:
        ratio = 0
    else:
        ratio = float(self.n_tests) / (N) * 100
    try:
        ratio = easydev.precision(ratio, digit=2)
    except:
        # Fixme: this is a hack for the infinite values but should not
        # happen...
        ratio = 0

    msg = "Type of analysis"
    df = self._df_append(df, [msg, self.settings.analysis_type])

    msg = "Total number of possible drug/feature associations"
    df = self._df_append(df, [msg, int(N)])
    msg = "Total number of ANOVA tests performed"
    df = self._df_append(df, [msg, self.n_tests])
    msg = "Percentage of tests performed"
    df = self._df_append(df, [msg, ratio])

    # trick to have an empty line
    df = self._df_append(df, ["", ""])
    msg = "Total number of tested drugs"
    df = self._df_append(df, [msg, n_drugs])
    msg = "Total number of genomic features used"
    df = self._df_append(df, [msg, n_features])

    msg = "Total number of screened cell lines"
    df = self._df_append(df, [msg, self.n_celllines])

    msg = "MicroSatellite instability included as factor"
    msi = self.settings.include_MSI_factor
    df = self._df_append(df, [msg, msi])

    # trick to have an empty line
    df = self._df_append(df, ["", ""])
    nsens = len(self.sensible_df)
    nres = len(self.resistant_df)
    msg = "Total number of significant associations"
    df = self._df_append(df, [msg, nsens + nres])
    msg = " - sensitive"
    df = self._df_append(df, [msg, nsens])
    msg = " - resistant"
    df = self._df_append(df, [msg, nres])

    msg = "p-value significance threshold"
    df = self._df_append(df, [msg, self.settings.pvalue_threshold])

    msg = "FDR significance threshold"
    df = self._df_append(df, [msg, self.settings.FDR_threshold])

    p1, p2 = self._get_pval_range()
    msg = 'Range of significant p-values'
    value = "[{:.4}, {:.4}]".format(p1, p2)
    df = self._df_append(df, [msg, value])

    f1, f2 = self._get_fdr_range()
    msg = "Range of significant % FDRs"
    value = '[{:.4} {:.4}]'.format(f1, f2)
    df = self._df_append(df, [msg, value])
    return df
def _add_patches(self, df, method, fill, ax, diagonal=True):
    """Draw one glyph per cell of the square matrix *df* on *ax*.

    :param df: square dataframe of values expected in [-1, 1]
    :param method: one of 'ellipse', 'square', 'rectangle', 'color',
        'circle', 'number'/'text', 'pie'
    :param fill: 'lower'/'upper' to draw a single triangle; anything else
        draws the full matrix
    :param ax: matplotlib axes to draw on
    :param diagonal: if False, the diagonal cells are skipped
    """
    width, height = df.shape
    labels = (df.columns)

    patches = []
    colors = []
    # range replaces the Python-2-only xrange, matching the sibling
    # implementation of this method
    for x in range(width):
        for y in range(height):
            if fill == 'lower' and x > y:
                continue
            elif fill == 'upper' and x < y:
                continue
            if diagonal is False and x == y:
                continue

            # map the value from [-1, 1] to [0, 1] for the colormap;
            # .iloc replaces the deprecated pandas .ix (positional access)
            datum = (df.iloc[x, y] + 1.) / 2.
            d = df.iloc[x, y]
            d_abs = np.abs(d)
            #c = self.pvalues[x, y]
            rotate = -45 if d > 0 else +45
            #cmap = self.poscm if d >= 0 else self.negcm
            if method in ['ellipse', 'square', 'rectangle', 'color']:
                if method == 'ellipse':
                    func = Ellipse
                    patch = func((x, y), width=1 * self.shrink,
                                 height=(self.shrink - d_abs * self.shrink),
                                 angle=rotate)
                else:
                    func = Rectangle
                    w = h = d_abs * self.shrink
                    #FIXME shrink must be <=1
                    offset = (1 - w) / 2.
                    if method == 'color':
                        w = 1
                        h = 1
                        offset = 0
                    patch = func((x + offset - .5, y + offset - .5),
                                 width=w, height=h, angle=0)
                if self.edgecolor:
                    patch.set_edgecolor(self.edgecolor)
                #patch.set_facecolor(cmap(d_abs))
                colors.append(datum)
                if d_abs > 0.05:
                    patch.set_linestyle('dotted')
                #ax.add_artist(patch)
                patches.append(patch)
                #FIXME edgecolor is always printed
            elif method == 'circle':
                patch = Circle((x, y), radius=d_abs * self.shrink / 2.)
                if self.edgecolor:
                    patch.set_edgecolor(self.edgecolor)
                #patch.set_facecolor(cmap(d_abs))
                colors.append(datum)
                if d_abs > 0.05:
                    patch.set_linestyle('dotted')
                #ax.add_artist(patch)
                patches.append(patch)
            elif method in ['number', 'text']:
                from easydev import precision
                # negative values in red, zero/positive in blue
                if d < 0:
                    edgecolor = 'red'
                else:
                    edgecolor = 'blue'
                ax.text(x, y, precision(d, 2), color=edgecolor,
                        fontsize=self.fontsize,
                        horizontalalignment='center', weight='bold',
                        alpha=d_abs, withdash=False)
            elif method == 'pie':
                # two wedges: a filled arc proportional to |d| and its rest
                S = 360 * d_abs
                patch = [
                    Wedge((x, y), 1 * self.shrink / 2., -90, S - 90),
                    Wedge((x, y), 1 * self.shrink / 2., S - 90, 360 - 90),
                ]
                #patch[0].set_facecolor(cmap(d_abs))
                #patch[1].set_facecolor('white')
                colors.append(datum)
                colors.append(0.5)
                if self.edgecolor:
                    patch[0].set_edgecolor(self.edgecolor)
                    patch[1].set_edgecolor(self.edgecolor)
                #ax.add_artist(patch[0])
                #ax.add_artist(patch[1])
                patches.append(patch[0])
                patches.append(patch[1])

    if len(patches):
        col1 = PatchCollection(patches, array=np.array(colors), cmap=self.cm)
        ax.add_collection(col1)
        self.collection = col1
def _volcano_plot(self, data, title=''):
    """Main volcano plot function called by other methods such as
    volcano_plot_all

    :param data: dataframe with at least the 'color', 'pvalue',
        'signed_effect', 'markersize', 'Drug', 'Feature' and 'FDR' columns
    :param title: figure title; underscores are rendered as spaces
    """
    # This functio is a bit complicated because it does create a few tricky
    # plots
    # It creates a volcano plot, which is the easy part
    # Then, it creates tooltips for the user interface in an IPython
    # shell using a callback to 'onpick' function coded here below
    # finally, it creates a Javascript connection using mpld3 that
    # will allow the creation of a JS version of the plot.
    # !! There is a memory leak in this function due to matplotlib
    # This is not easy to track down.
    # You have to call clf() to make sure the content is erase.
    # One reason for the memory leak is that it is called in the
    # Report to loop over all drugs and then all featuers.
    # To see the memory leak, you will need to call the
    # volcano_plot_all_drugs function (or volcano_plot_all_features).
    colors = list(data['color'].values)
    pvalues = data['pvalue'].values
    signed_effects = data['signed_effect'].values
    markersize = data['markersize'].values

    Y = -np.log10(list(pvalues))  # should be cast to list ?

    num = 1
    #pylab.close(num)
    fig = pylab.figure(num=1)
    fig.clf()
    ax = fig.add_subplot(111)
    # NOTE(review): set_axis_bgcolor was removed in matplotlib 2.0 in
    # favour of set_facecolor — confirm the supported matplotlib version.
    ax.set_axis_bgcolor('#EEEEEE')
    ax.cla()
    # round for display/tooltips
    X = [easydev.precision(x, digit=2) for x in signed_effects]
    Y = [easydev.precision(y, digit=2) for y in Y]

    # Using scatter() is slow as compared to plot()
    # However, plot cannot take different sizes/colors
    scatter = ax.scatter(X, Y, s=markersize, alpha=0.3, c=colors,
        linewidth=1, picker=True)
    scatter.set_zorder(11)

    m = abs(signed_effects.min())
    M = abs(signed_effects.max())
    pylab.xlabel("Signed effect size", fontsize=self.settings.fontsize)
    pylab.ylabel('-log10(pvalues)', fontsize=self.settings.fontsize)
    # symmetric x-range around zero with a 10% margin
    l = max([m, M]) * 1.1
    pylab.xlim([-l, l])
    ax.grid(color='white', linestyle='solid')

    # some aliases
    fdr = self.settings.FDR_threshold
    fdrs = sorted(self.settings.volcano_additional_FDR_lines)
    fdrs = fdrs[::-1]  # reverse sorting

    styles = ['--', ':', '-.']
    if self.settings.volcano_FDR_interpolation is True:
        get_pvalue_from_fdr = self._get_pvalue_from_fdr_interp
    else:
        get_pvalue_from_fdr = self._get_pvalue_from_fdr

    # horizontal line at the p-value matching the FDR threshold
    pvalue = get_pvalue_from_fdr(fdr)
    ax.axhline(-np.log10(pvalue), linestyle='--', lw=2,
        color='red', alpha=1, label="FDR %s " % fdr + " \%")

    # extra FDR lines, only when they fall inside the observed q-value range
    for i, this in enumerate(fdrs):
        if this < self.df[self._colname_qvalue].min() or\
            this > self.df[self._colname_qvalue].max():
                continue
        pvalue = get_pvalue_from_fdr(this)
        ax.axhline(-np.log10(pvalue), linestyle=styles[i],
            color='red', alpha=1, label="FDR %s " % this + " \%")

    pylab.ylim([0, pylab.ylim()[1] * 1.2])  # times 1.2 to put the legend

    ax.axvline(0, color='gray', alpha=0.8, lw=2)
    axl = pylab.legend(loc='best')
    axl.set_zorder(10)  # in case there is a circle behind the legend.

    #self.ax = ax
    #self.axx = ax.twinx()
    #self.common_ticks = ax.get_yticks()
    #self.common_ylim = ax.get_ylim()
    #pvals = self.df[self._colname_pvalue]
    #y1 = pvals.min()
    #y2 = pvals.max()
    #fdr1 = self._get_fdr_from_pvalue_interp(y1)
    #fdr2 = self._get_fdr_from_pvalue_interp(y2-2e-15) # make sure it exists
    #self.axx.set_ylim([fdr2, fdr1])
    #self.axx.set_ylabel('FDR \%', fontsize=self.settings.fontsize)

    # For the static version
    title_handler = pylab.title("%s" % title.replace("_", " "),
        fontsize=self.settings.fontsize / 1.2)
    labels = []

    # This code allows the ipython user to click on the matplotlib figure
    # to get information about the drug and feature of a given circles.
    def onpick(event):
        # event.ind holds the indices of the picked points; use the first
        ind = event.ind[0]
        try:
            title = str(data.ix[ind]['Drug']) + " / " + str(
                data.ix[ind].Feature)
            title += "\nFDR=" + "%.4e" % data.ix[ind]['FDR']
            title_handler.set_text(title.replace("_", " "))
        except:
            print('Failed to create new title on click')
            print(data.ix[ind].T)
        fig.canvas.draw()

    # keep track on the id for further memory release
    # For more info search for "matplotlib memory leak mpl_connect"
    self.cid = fig.canvas.mpl_connect('pick_event', onpick)

    # for the JS version
    # TODO: for the first 1 to 2000 entries ?
    labels = []
    self.data = data
    for i, row in data[['Drug', 'Feature', 'FDR']].iterrows():
        template = """
<table border="1" class="dataframe">
  <tbody>
    <tr>
      <th>Drug</th>
      <td>%(Drug)s</td>
    </tr>
    <tr>
      <th>Feature</th>
      <td>%(Feature)s</td>
    </tr>
    <tr>
      <th>FDR</th>
      <td>%(FDR)s</td>
    </tr>
  </tbody>
</table>""" % row.to_dict()
        labels.append(template)
        # this is more elegant but slower
        #label = row.to_frame()
        #label.columns = ['Row {0}'.format(i)]
        #labels.append(str(label.to_html(header=False)))

    css = """
    svg.mpld3-figure { border: 2px black solid;margin:10px;}
    table{ font-size:0.8em; }
    th { color: #ffffff; background-color: #aaaaaa; }
    td { color: blue; background-color: #cccccc; }"""

    # best-effort: the interactive JS tooltips require the optional mpld3
    # package; skip silently when unavailable or failing
    try:
        import mpld3
        tooltip = mpld3.plugins.PointHTMLTooltip(scatter, labels=labels,
            css=css)
        mpld3.plugins.connect(fig, tooltip)
    except:
        print("Issue with javascript version of the volcano plot. Skipped")

    self.scatter = scatter
    self.current_fig = fig

    # not sure is this is required. could be a memory leak here
    import gc
    gc.collect()
def diagnostics(self):
    """Return summary of the analysis (dataframe)"""
    # populates self.sensible_df / self.resistant_df used below
    self._set_sensible_df()

    df = pd.DataFrame({'text': [], 'value': []})

    n_features = len(self.gdsc.features.df.columns)
    # the first `shift` columns are metadata, not genomic features
    n_features -= self.gdsc.features.shift
    n_drugs = len(self.df[self._colname_drug_id].unique())

    N = float(n_drugs * n_features)
    if N == 0:
        ratio = 0
    else:
        ratio = float(self.n_tests)/(N) * 100
    try:
        ratio = easydev.precision(ratio, digit=2)
    except:
        # Fixme: this is a hack for the infinite values but should not
        # happen...
        ratio = 0

    msg = "Type of analysis"
    df = self._df_append(df, [msg, self.settings.analysis_type])

    msg = "Total number of possible drug/feature associations"
    df = self._df_append(df, [msg, int(N)])
    msg = "Total number of ANOVA tests performed"
    df = self._df_append(df, [msg, self.n_tests])
    msg = "Percentage of tests performed"
    df = self._df_append(df, [msg, ratio])

    # trick to have an empty line
    df = self._df_append(df, ["", ""])
    msg = "Total number of tested drugs"
    df = self._df_append(df, [msg, n_drugs])
    msg = "Total number of genomic features used"
    df = self._df_append(df, [msg, n_features])

    msg = "Total number of screened cell lines"
    df = self._df_append(df, [msg, self.n_celllines])

    msg = "MicroSatellite instability included as factor"
    msi = self.settings.include_MSI_factor
    df = self._df_append(df, [msg, msi])

    # trick to have an empty line
    df = self._df_append(df, ["", ""])
    nsens = len(self.sensible_df)
    nres = len(self.resistant_df)
    msg = "Total number of significant associations"
    df = self._df_append(df, [msg, nsens+nres])
    msg = " - sensitive"
    df = self._df_append(df, [msg, nsens])
    msg = " - resistant"
    df = self._df_append(df, [msg, nres])

    msg = "p-value significance threshold"
    df = self._df_append(df, [msg, self.settings.pvalue_threshold])

    msg = "FDR significance threshold"
    df = self._df_append(df, [msg, self.settings.FDR_threshold])

    p1, p2 = self._get_pval_range()
    msg = 'Range of significant p-values'
    value = "[{:.4}, {:.4}]".format(p1, p2)
    df = self._df_append(df, [msg, value])

    f1, f2 = self._get_fdr_range()
    msg = "Range of significant % FDRs"
    value = '[{:.4} {:.4}]'.format(f1, f2)
    df = self._df_append(df, [msg, value])
    return df