def _create_report(self, onweb=True):
    """Fill the jinja context with the association table and boxplot images.

    Sets ``self.jinja['association_table']`` (HTML table built from the
    dataframe returned by :meth:`run`) and ``self.jinja['boxplots']``
    (one ``<img>`` tag per boxplot).

    :param onweb: accepted for interface compatibility; not used here.
    """
    # run() returns the results; per the original comment it also
    # generates the pictures referenced below.
    results = self.run()

    # Table describing the association
    table = ANOVAResults(results).get_html_table(
        escape=False, header=True, index=False)
    self.jinja['association_table'] = table

    # The main boxplot is always included; the others depend on settings.
    settings = self.factory.settings
    prefixes = ['images/ODOF_all']
    if settings.include_MSI_factor:
        prefixes.append('ODOF_msi')
    if settings.analysis_type == 'PANCAN':
        prefixes.append('ODOF_tissue')

    img_tags = []
    for prefix in prefixes:
        image = "{0}_{1}____{2}.png".format(prefix, self.drug, self.feature)
        img_tags.append(
            '<img alt="association {0}" src="{0}">\n'.format(image))
    self.jinja['boxplots'] = "".join(img_tags)
def _create_report(self, onweb=True):
    """Fill the jinja context: hit count, association table, volcano data.

    :param onweb: accepted for interface compatibility; not used here.
    """
    n_hits = len(self.subdf)
    self.jinja['N_hits'] = n_hits

    # The table is only rendered when there is at least one hit.
    if n_hits > 0:
        table = ANOVAResults(self.subdf).get_html_table(
            escape=False, header=True, index=False)
        self.jinja['association_table'] = table

    # create_pictures() returns the data stored under 'volcano_jsdata'.
    self.jinja['volcano_jsdata'] = self.create_pictures()
def _create_report(self, onweb=True):
    """Fill the jinja context with the association table and JS boxplots.

    The table is built with ``add_href=self.add_href``. The
    ``boxplot_*_jsdata`` entries are set according to the factory
    settings; ``self.jinja['boxplots']`` is always set to an empty string.

    :param onweb: accepted for interface compatibility; not used here.
    """
    # Results first (run()), then the one-drug/one-feature data.
    results = self.run()
    odof = self.factory._get_one_drug_one_feature_data(
        self.drug, self.feature)

    # Table describing the association
    self.jinja['association_table'] = ANOVAResults(results).get_html_table(
        escape=False, header=True, index=False, add_href=self.add_href)

    # Javascript version of the boxplots
    boxplots = BoxPlotsJS(odof)
    self.jinja["boxplot_all_jsdata"] = boxplots.get_html_association()

    settings = self.factory.settings
    if settings.include_MSI_factor:
        self.jinja["boxplot_msi_jsdata"] = boxplots.get_html_msi()
    if settings.include_media_factor:
        self.jinja["boxplot_media_jsdata"] = boxplots.get_html_media()
    if settings.analysis_type == 'PANCAN':
        self.jinja["boxplot_tissue_jsdata"] = boxplots.get_html_tissue()

    # No static image section in this version.
    self.jinja['boxplots'] = ""
def _create_report(self, onweb=True):
    """Fill the jinja context with the association table and JS boxplots.

    The ``boxplot_*_jsdata`` entries are set according to the factory
    settings; ``self.jinja['boxplots']`` is always set to an empty string.

    :param onweb: accepted for interface compatibility; not used here.
    """
    # Results first (run()), then the one-drug/one-feature data.
    results = self.run()
    odof = self.factory._get_one_drug_one_feature_data(
        self.drug, self.feature)

    # Table describing the association
    self.jinja['association_table'] = ANOVAResults(results).get_html_table(
        escape=False, header=True, index=False)

    # Javascript version of the boxplots
    boxplots = BoxPlotsJS(odof)
    self.jinja["boxplot_all_jsdata"] = boxplots.get_html_association()

    settings = self.factory.settings
    if settings.include_MSI_factor:
        self.jinja["boxplot_msi_jsdata"] = boxplots.get_html_msi()
    if settings.include_media_factor:
        self.jinja["boxplot_media_jsdata"] = boxplots.get_html_media()
    if settings.analysis_type == 'PANCAN':
        self.jinja["boxplot_tissue_jsdata"] = boxplots.get_html_tissue()

    # No static image section in this version.
    self.jinja['boxplots'] = ""
def __init__(self, gdsc, results=None, sep="\t", drug_decode=None,
             verbose=True):
    """.. rubric:: Constructor

    :param gdsc: the instance with which you created the results to report
    :param results: the results returned by :meth:`ANOVA.anova_all`. If
        not provided, the ANOVA is run on the fly.
    :param sep: not used by this constructor.
    :param drug_decode: optional input handed to :class:`DrugDecode`
        (a file, according to the original comment). When given, it takes
        precedence over ``gdsc.drug_decode``.
    :param verbose: stored in :attr:`verbose`.
    """
    self.verbose = verbose
    self.figtools = Savefig(verbose=False)
    self.gdsc = gdsc

    if results is None:
        results = gdsc.anova_all()
    self.df = ANOVAResults(results).df  # this does a copy and sanity check

    # Make sure the DRUG are integers
    self.df.DRUG_ID = self.df.DRUG_ID.astype(int)

    # Start from default settings and overwrite with those of gdsc
    self.settings = ANOVASettings()
    for k, v in gdsc.settings.items():
        self.settings[k] = v

    self._colname_drug_id = 'DRUG_ID'
    self.varname_pval = 'ANOVA_FEATURE_pval'
    self.varname_qval = 'ANOVA_FEATURE_FDR'

    # maybe there was no drug_decode in the gdsc parameter,
    # so a user may have provided a file, in which case, we need
    # to update the content of the drug decoder.
    if len(gdsc.drug_decode) == 0 and drug_decode is None:
        # The two sentences were previously glued together without a space.
        warnings.warn("No drug name or target will be populated. "
                      "You may want to provide a DRUG_DECODE file.")
        self.drug_decode = DrugDecode()
    elif drug_decode is not None:
        # Read a file
        self.drug_decode = DrugDecode(drug_decode)
    else:
        # Copy from gdsc instance
        self.drug_decode = DrugDecode(gdsc.drug_decode)
    self.df = self.drug_decode.drug_annotations(self.df)

    # Infinite values are set to zero. This stays best effort, but no
    # longer swallows KeyboardInterrupt/SystemExit as a bare except did.
    try:
        self.df = self.df.replace({np.inf: 0, -np.inf: 0})
    except Exception:
        pass

    # create some data
    self._set_sensible_df()

    self.company = None
def _create_report(self, onweb=True):
    """Create the pictures and fill the jinja context for this feature.

    Sets ``N_hits``, ``association_table`` (only when there are hits) and
    ``image_filename`` in ``self.jinja``.

    :param onweb: accepted for interface compatibility; not used here.
    """
    self.create_pictures()

    n_hits = len(self.subdf)
    self.jinja['N_hits'] = n_hits
    if n_hits > 0:
        table = ANOVAResults(self.subdf).get_html_table(
            escape=False, header=True, index=False)
        self.jinja['association_table'] = table

    # image section (no file extension is appended here)
    image = "images/volcano_{0}".format(self.feature)
    self.jinja['image_filename'] = image
def _create_report(self, onweb=True):
    """Generate pictures, then populate the template variables.

    ``self.jinja`` receives the number of hits, the HTML association table
    (skipped when ``self.subdf`` is empty) and the volcano image filename.

    :param onweb: accepted for interface compatibility; not used here.
    """
    self.create_pictures()

    hits = self.subdf
    self.jinja['N_hits'] = len(hits)
    if len(hits) > 0:
        anova_results = ANOVAResults(hits)
        self.jinja['association_table'] = anova_results.get_html_table(
            escape=False, header=True, index=False)

    # image section (no file extension is appended here)
    self.jinja['image_filename'] = "images/volcano_{0}".format(self.feature)
def load_results(self):
    """Load ``ALL/<tcga>/OUTPUT/results.csv`` for every entry of ``self.tcga``.

    Each name is printed, and the resulting :class:`ANOVAResults` is stored
    in ``self.results`` under that name.
    """
    for name in self.tcga:
        print(name)
        path = os.sep.join(['ALL', name, 'OUTPUT', 'results.csv'])
        self.results[name] = ANOVAResults(path)
def _create_report(self, onweb=True):
    """Fill the jinja context for a drug page.

    Placeholder values are set for the drug description fields, followed
    by the hit count, the association table (only when there are hits) and
    the volcano data returned by :meth:`create_pictures`.

    :param onweb: accepted for interface compatibility; not used here.
    """
    # Drug description fields: placeholders only
    placeholders = [
        ('synonyms', ''),
        ('brand_name', ''),
        ('conc_min', '?'),
        ('conc_max', '?'),
    ]
    for key, value in placeholders:
        self.jinja[key] = value

    # Table section
    n_hits = len(self.subdf)
    self.jinja['N_hits'] = n_hits
    if n_hits > 0:
        table = ANOVAResults(self.subdf).get_html_table(
            escape=False, header=True, index=False)
        self.jinja['association_table'] = table

    # image section
    self.jinja['volcano_jsdata'] = self.create_pictures()
def __init__(self, gdsc, results=None, sep="\t", drug_decode=None,
             verbose=True):
    """.. rubric:: Constructor

    :param gdsc: the instance with which you created the results to report
    :param results: the results returned by :meth:`ANOVA.anova_all`. If
        not provided, the ANOVA is run on the fly.
    :param sep: not used by this constructor.
    :param drug_decode: optional input handed to :class:`DrugDecode`
        (a file, according to the original comment). When given, it takes
        precedence over ``gdsc.drug_decode``.
    :param verbose: stored in :attr:`verbose`.
    """
    self.verbose = verbose
    self.figtools = Savefig(verbose=False)
    self.gdsc = gdsc

    if results is None:
        results = gdsc.anova_all()
    self.df = ANOVAResults(results).df  # this does a copy and sanity check

    # Make sure the DRUG are integers
    self.df.DRUG_ID = self.df.DRUG_ID.astype(int)

    # Start from default settings and overwrite with those of gdsc
    self.settings = ANOVASettings()
    for k, v in gdsc.settings.items():
        self.settings[k] = v

    self._colname_drug_id = 'DRUG_ID'
    self.varname_pval = 'ANOVA_FEATURE_pval'
    self.varname_qval = 'ANOVA_FEATURE_FDR'

    # maybe there was no drug_decode in the gdsc parameter,
    # so a user may have provided a file, in which case, we need
    # to update the content of the drug decoder.
    if len(gdsc.drug_decode) == 0 and drug_decode is None:
        # The two sentences were previously glued together without a space.
        warnings.warn("No drug name or target will be populated. "
                      "You may want to provide a DRUG_DECODE file.")
        self.drug_decode = DrugDecode()
    elif drug_decode is not None:
        # Read a file
        self.drug_decode = DrugDecode(drug_decode)
    else:
        # Copy from gdsc instance
        self.drug_decode = DrugDecode(gdsc.drug_decode)
    self.df = self.drug_decode.drug_annotations(self.df)

    # Infinite values are set to zero. This stays best effort, but no
    # longer swallows KeyboardInterrupt/SystemExit as a bare except did.
    try:
        self.df = self.df.replace({np.inf: 0, -np.inf: 0})
    except Exception:
        pass

    # create some data
    self._set_sensible_df()

    self.company = None
def __init__(self, gdsc, results, sep="\t", drug_decode=None):
    """.. rubric:: Constructor

    :param gdsc: the instance with which you created the results to report
    :param results: the results returned by :meth:`ANOVA.anova_all`
    :param sep: not used by this constructor.
    :param drug_decode: optional input handed to :class:`DrugDecode`
        (a file, according to the original comment). When given, it takes
        precedence over ``gdsc.drug_decode``.
    """
    self.figtools = Savefig()
    self.gdsc = gdsc

    self.df = ANOVAResults(results).df  # this does a copy and sanity check

    # Start from default settings and overwrite with those of gdsc
    self.settings = ANOVASettings()
    for k, v in gdsc.settings.items():
        self.settings[k] = v

    self._colname_drug_id = 'DRUG_ID'
    self.varname_pval = 'ANOVA_FEATURE_pval'
    self.varname_qval = 'ANOVA_FEATURE_FDR'

    # maybe there was no drug_decode in the gdsc parameter,
    # so a user may have provided a file, in which case, we need
    # to update the content of the drug decoder.
    if len(gdsc.drug_decode) == 0 and drug_decode is None:
        # The two sentences were previously glued together without a space.
        warnings.warn("No drug name or target will be populated. "
                      "You may want to provide a DRUG_DECODE file.")
        self.drug_decode = DrugDecode()
    elif drug_decode is not None:
        # Read a file
        self.drug_decode = DrugDecode(drug_decode)
    else:
        # Copy from gdsc instance
        self.drug_decode = DrugDecode(gdsc.drug_decode)
    self.df = self.drug_decode.drug_annotations(self.df)

    # create some data
    self._set_sensible_df()

    # just to create the directory
    ReportMAIN(directory=self.settings.directory)
def __init__(self, gdsc, df):
    """.. rubric:: constructor

    :param gdsc: object whose ``settings.directory`` is used as output
        directory and whose ``settings.analysis_type`` is shown on the page.
    :param df: dataframe rendered (through :class:`ANOVAResults`) as the
        HTML table of the page.

    Both the output filename and the template are ``manova.html``.
    """
    settings = gdsc.settings
    super(HTMLPageMANOVA, self).__init__(
        filename='manova.html',
        directory=settings.directory,
        template_filename='manova.html')

    # Table is rendered with collapse_table=False
    anova_results = ANOVAResults(df)
    self.jinja['manova'] = anova_results.get_html_table(collapse_table=False)
    self.jinja['analysis_domain'] = gdsc.settings.analysis_type
def __init__(self, report, df, company):
    """.. rubric:: constructor

    :param report: object whose ``settings.directory`` (plus an
        ``associations`` sub-directory) is used as output directory and
        whose ``settings.analysis_type`` is shown on the page.
    :param df: dataframe rendered (through :class:`ANOVAResults`) as the
        HTML table of the page.
    :param company: value stored in the ``collaborator`` template variable.

    Both the output filename and the template are ``manova.html``.
    """
    output_dir = report.settings.directory + os.sep + "associations"
    super(HTMLPageMANOVA, self).__init__(
        filename='manova.html',
        directory=output_dir,
        template_filename='manova.html',
        init_report=False)

    # Table is rendered with collapse_table=False
    anova_results = ANOVAResults(df)
    self.jinja['manova'] = anova_results.get_html_table(collapse_table=False)
    self.jinja['analysis_domain'] = report.settings.analysis_type
    # The page sits one level below the report directory.
    self.jinja['resource_path'] = ".."
    self.jinja["collaborator"] = company
def create_data_packages_for_companies(self, companies=None):
    """Create one report package per company and per genomic-feature file.

    For each company and each file of ``self.gf_filenames``, the results
    are restricted to the drugs that are either web-released or owned by
    the company, the HTML report is created in ``<company>/<tcga>`` and
    the per-drug HTML pages and volcano images are copied from
    ``ALL/<tcga>``.

    DRUG_DECODE and IC50 inputs must be filtered to keep only
    WEBRELEASE=Y and owner.

    :param companies: a company name or a list of names. Defaults to
        ``self.companies``.
    """
    # Imported once here instead of at every iteration of the inner loop.
    from easydev import shellcmd, Progress

    if isinstance(companies, str):
        companies = [companies]
    if companies is None:
        companies = self.companies

    Ncomp = len(companies)
    for ii, company in enumerate(companies):
        print("\n\n========= Analysing company %s out of %s (%s)" %
              (ii + 1, Ncomp, company))
        self.mkdir(company)

        for gf_filename in sorted(self.gf_filenames):
            # TCGA name is taken from a filename of the form xxx_<tcga>.yyy
            tcga = gf_filename.split("_")[1].split('.')[0]
            print("---------------- for TCGA %s" % tcga)

            # Read the results previously computed
            try:
                results_df = self.results[tcga].df.copy()
            except Exception:
                results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                print("Downloading results from %s" % results_path)
                # NOTE(review): this is an ANOVAResults instance, wrapped
                # again just below -- confirm the constructor accepts it.
                results_df = ANOVAResults(results_path)
            results = ANOVAResults(results_df)

            # Get a DrugDecode for that company
            drug_decode_company = self.drug_decode.df.query(
                "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
            # Transform into a proper DrugDecode class for safety
            drug_decode_company = DrugDecode(drug_decode_company)
            allowed = drug_decode_company.df.index

            # Filter results using the new drug decode. Boolean selection
            # with .loc replaces .ix, which was removed in pandas 1.0.
            drug_ids_in_results = get_drug_id(results.df.DRUG_ID)
            mask = [x in allowed for x in drug_ids_in_results]
            results.df = results.df.loc[mask]

            # Just to create an instance with the subset of drug_decode
            # and correct settings. This is also used to store
            # the entire input data set. So, we must remove all drugs
            # not relevant for the analysis of this company
            an = ANOVA(self.ic50_filename, gf_filename, drug_decode_company)

            # Column-wise boolean selection replaces DataFrame.select,
            # which was removed in pandas 1.0.
            ic50 = an.ic50.df
            keep = [get_drug_id(drug) in allowed for drug in ic50.columns]
            an.ic50.df = ic50.loc[:, keep]

            an.settings = ANOVASettings(**self.settings)
            an.init()
            an.settings.directory = company + os.sep + tcga
            an.settings.analysis_type = tcga

            self.report = ANOVAReport(an, results)
            self.report.settings.analysis_type = tcga
            self.report.create_html_main(False)
            self.report.create_html_manova(False)
            if self.debug is False:
                self.report.create_html_features()
                self.report.create_html_associations()

                # For now, we just copy all DRUG images from
                # the analysis made in ALL
                print("\nCopying drug files")
                drug_ids = results.df.DRUG_ID.unique()
                pb = Progress(len(drug_ids))
                for i, drug_id in enumerate(drug_ids):
                    # copy the HTML
                    filename = "%s.html" % drug_id
                    source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                    dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep)
                    cmd = "cp %s%s %s" % (source, filename, dest)
                    shellcmd(cmd, verbose=False)

                    # copy the images
                    filename = "volcano_%s.*" % drug_id
                    source = "ALL%s%s%simages%s" % (os.sep, tcga, os.sep,
                                                    os.sep)
                    dest = "%s%s%s%simages%s" % (company, os.sep, tcga,
                                                 os.sep, os.sep)
                    cmd = "cp %s%s %s" % (source, filename, dest)
                    shellcmd(cmd, verbose=False)
                    pb.animate(i + 1)
def create_summary_pages(self, main_directory='ALL'):
    """Create ``index.html`` summarising every analysis of *main_directory*.

    The css/js/images resources are copied into *main_directory* when
    missing. One summary row is then built for each sub-directory (except
    the resource directories) from its ``OUTPUT/drugs_summary.csv`` and
    ``INPUT/DRUG_DECODE.csv`` files. Sub-directories without a readable
    ``drugs_summary.csv`` get a row filled with ``None``.

    :param main_directory: directory that contains one sub-directory per
        analysis.
    :return: the summary dataframe (one row per analysis).
    """
    # create directories and copy relevant files
    self.mkdir(main_directory + os.sep + 'images')
    self.mkdir(main_directory + os.sep + 'css')
    self.mkdir(main_directory + os.sep + 'js')

    from gdsctools import gdsctools_data

    # (sub-directory, filenames, prefix inside the gdsctools data)
    resources = [
        ('css', ['gdsc.css', 'github-gist.css'], ''),
        ('js', ['highlight.pack.js'], ''),
        ('images', ['EBI_logo.png', 'sanger-logo.png'], 'images' + os.sep),
    ]
    for subdir, filenames, data_prefix in resources:
        for filename in filenames:
            target = os.sep.join([main_directory, subdir, filename])
            if os.path.isfile(target) is False:
                shutil.copy(gdsctools_data(data_prefix + filename), target)

    # Scan main_directory itself; 'ALL' used to be hard-coded here.
    directories = glob.glob(main_directory + os.sep + '*')
    directories = [x for x in directories if os.path.isdir(x)]

    summary = []
    for directory in sorted(directories):
        tcga = os.path.basename(directory)
        # Resource directories are not analyses. 'js' is created above and
        # used to end up as a bogus row in the summary.
        if tcga in ('css', 'images', 'js'):
            continue

        # number of hits
        path = directory + os.sep + 'OUTPUT' + os.sep
        try:
            hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
        except Exception:
            summary.append([tcga] + [None] * 5)
            continue
        total_hits = hits.total.sum()
        drug_involved = get_drug_id(hits['Unnamed: 0'].unique())

        # NOTE(review): the results are loaded but not used in the summary;
        # kept so a missing or invalid results.csv is still reported.
        ANOVAResults(path + 'results.csv')

        path = directory + os.sep + 'INPUT' + os.sep
        drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
        info = drug_decode.get_info()

        # Label-based selection (.ix was removed in pandas 1.0)
        webrelease = drug_decode.df.loc[drug_involved].WEBRELEASE
        drug_inv_public = sum(webrelease == 'Y')
        drug_inv_prop = sum(webrelease != 'Y')

        summary.append([
            tcga, total_hits, drug_inv_prop, info['N_prop'],
            drug_inv_public, info['N_public']
        ])

    # Columns are given at construction so that an empty summary yields an
    # empty table instead of a length-mismatch error.
    columns = [
        'Analysis name', 'Number of hits',
        'Number of involved proprietary compounds', 'out of',
        'Number of involved public', 'out of'
    ]
    df = pd.DataFrame(summary, columns=columns)

    # FIXME include css and images of logo
    # FIXME save in the proper directory
    self.html_page = ReportMAIN(directory=main_directory,
                                filename='index.html',
                                template_filename='datapack_summary.html')

    # Let us use our HTMLTable to add the HTML references
    from gdsctools.report import HTMLTable
    self.html_table = HTMLTable(df)
    self.html_table.add_href('Analysis name', newtab=True, url=None,
                             suffix='/index.html')
    self.html_page.jinja['data_table'] = self.html_table.to_html()
    self.html_page.write()
    return df
def create_data_packages_for_companies(self, companies=None):
    """Create one report package per company and per genomic-feature file.

    For each company and each file of ``self.gf_filenames``, the results
    are restricted to the drugs that are either web-released or owned by
    the company, the HTML report is created in ``<company>/<tcga>`` and
    the per-drug HTML pages and volcano images are copied from
    ``ALL/<tcga>``.

    DRUG_DECODE and IC50 inputs must be filtered to keep only
    WEBRELEASE=Y and owner.

    :param companies: a company name or a list of names. Defaults to
        ``self.companies``.
    """
    # Imported once here instead of at every iteration of the inner loop.
    from easydev import shellcmd, Progress

    if isinstance(companies, str):
        companies = [companies]
    if companies is None:
        companies = self.companies

    Ncomp = len(companies)
    for ii, company in enumerate(companies):
        print("\n\n========= Analysing company %s out of %s (%s)" %
              (ii+1, Ncomp, company))
        self.mkdir(company)

        for gf_filename in sorted(self.gf_filenames):
            # TCGA name is taken from a filename of the form xxx_<tcga>.yyy
            tcga = gf_filename.split("_")[1].split('.')[0]
            print("---------------- for TCGA %s" % tcga)

            # Read the results previously computed
            try:
                results_df = self.results[tcga].df.copy()
            except Exception:
                results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                print("Downloading results from %s" % results_path)
                # NOTE(review): this is an ANOVAResults instance, wrapped
                # again just below -- confirm the constructor accepts it.
                results_df = ANOVAResults(results_path)
            results = ANOVAResults(results_df)

            # Get a DrugDecode for that company
            drug_decode_company = self.drug_decode.df.query(
                "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
            # Transform into a proper DrugDecode class for safety
            drug_decode_company = DrugDecode(drug_decode_company)
            company_drugs = drug_decode_company.df.index

            # Filter results using the new drug decode. Boolean selection
            # with .loc replaces .ix, which was removed in pandas 1.0.
            drug_ids_in_results = get_drug_id(results.df.DRUG_ID)
            mask = [x in company_drugs for x in drug_ids_in_results]
            results.df = results.df.loc[mask]

            # Just to create an instance with the subset of drug_decode
            # and correct settings. This is also used to store
            # the entire input data set. So, we must remove all drugs
            # not relevant for the analysis of this company
            an = ANOVA(self.ic50_filename, gf_filename, drug_decode_company)

            # Column-wise boolean selection replaces DataFrame.select,
            # which was removed in pandas 1.0.
            ic50 = an.ic50.df
            keep = [get_drug_id(drug) in company_drugs
                    for drug in ic50.columns]
            an.ic50.df = ic50.loc[:, keep]

            an.settings = ANOVASettings(**self.settings)
            an.init()
            an.settings.directory = company + os.sep + tcga
            an.settings.analysis_type = tcga

            self.report = ANOVAReport(an, results)
            self.report.settings.analysis_type = tcga
            self.report.create_html_main(False)
            self.report.create_html_manova(False)
            if self.debug is False:
                self.report.create_html_features()
                self.report.create_html_associations()

                # For now, we just copy all DRUG images from
                # the analysis made in ALL
                print("\nCopying drug files")
                drug_ids = results.df.DRUG_ID.unique()
                pb = Progress(len(drug_ids))
                for i, drug_id in enumerate(drug_ids):
                    # copy the HTML
                    filename = "%s.html" % drug_id
                    source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                    dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep)
                    cmd = "cp %s%s %s" % (source, filename, dest)
                    shellcmd(cmd, verbose=False)

                    # copy the images
                    filename = "volcano_%s.*" % drug_id
                    source = "ALL%s%s%simages%s" % (os.sep, tcga, os.sep,
                                                    os.sep)
                    dest = "%s%s%s%simages%s" % (company, os.sep, tcga,
                                                 os.sep, os.sep)
                    cmd = "cp %s%s %s" % (source, filename, dest)
                    shellcmd(cmd, verbose=False)
                    pb.animate(i+1)
class ANOVAReport(object):
    """Class used to interpret the results and create final HTML report

    Results is a data structure returned by :meth:`ANOVA.anova_all`.

    ::

        from gdsctools import *

        # Perform the analysis itself to get a set of results (dataframe)
        an = ANOVA(ic50_test)
        results = an.anova_all()

        # now, we can create the report.
        r = ANOVAReport(gdsc=an, results=results)

        # we can tune some settings
        r.settings.pvalue_threshold = 0.001
        r.settings.FDR_threshold = 28
        r.settings.directory = 'testing'
        r.create_html_pages()

    .. rubric:: Significant association

    An association is significant if

      - The field *ANOVA_FEATURE_FDR* must be < FDR_threshold
      - The field *ANOVA_FEATURE_pval* must be < pvalue_threshold

    It is then labelled **sensible** if *FEATURE_delta_MEAN_IC50* is below 0,
    otherwise it is **resistant**.
    """

    def __init__(self, gdsc, results=None, sep="\t", drug_decode=None,
                 verbose=True):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to
            report
        :param results: the results returned by :meth:`ANOVA.anova_all`.
            If not provided, the ANOVA is run on the fly.
        :param sep: not used by this constructor.
        :param drug_decode: optional input handed to :class:`DrugDecode`
            (a file, according to the original comment). When given, it
            takes precedence over ``gdsc.drug_decode``.
        :param verbose: stored in :attr:`verbose`.
        """
        self.verbose = verbose
        self.figtools = Savefig(verbose=False)
        self.gdsc = gdsc

        if results is None:
            results = gdsc.anova_all()
        # this does a copy and sanity check
        self.df = ANOVAResults(results).df

        # Make sure the DRUG are integers
        self.df.DRUG_ID = self.df.DRUG_ID.astype(int)

        # Start from default settings and overwrite with those of gdsc
        self.settings = ANOVASettings()
        for k, v in gdsc.settings.items():
            self.settings[k] = v

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # maybe there was no drug_decode in the gdsc parameter,
        # so a user may have provided a file, in which case, we need
        # to update the content of the drug decoder.
        if len(gdsc.drug_decode) == 0 and drug_decode is None:
            # The two sentences used to be glued together without a space.
            warnings.warn("No drug name or target will be populated. "
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        elif drug_decode is not None:
            # Read a file
            self.drug_decode = DrugDecode(drug_decode)
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)

        # Infinite values are set to zero. This stays best effort, but no
        # longer swallows KeyboardInterrupt/SystemExit as a bare except did.
        try:
            self.df = self.df.replace({np.inf: 0, -np.inf: 0})
        except Exception:
            pass

        # create some data
        self._set_sensible_df()

        self.company = None

        # just to create the directory
        # ReportMain(directory=self.settings.directory, verbose=self.verbose)

    def _get_ndrugs(self):
        return len(self.df[self._colname_drug_id].unique())
    n_drugs = property(_get_ndrugs, doc="return number of drugs")

    def _get_ntests(self):
        return len(self.df.index)
    n_tests = property(_get_ntests)

    def _get_ncelllines(self):
        return len(self.gdsc.features.df.index)
    n_celllines = property(_get_ncelllines,
                           doc="return number of cell lines")

    def _df_append(self, df, data):
        """Append *data* as a new last row of *df* (in place); return *df*"""
        count = len(df)
        df.loc[count] = data
        return df

    def diagnostics(self):
        """Return summary of the analysis (dataframe)

        The dataframe has two columns, ``text`` and ``value``, and one row
        per diagnostic. Rows with empty strings are used as separators.
        """
        self._set_sensible_df()

        n_features = len(self.gdsc.features.df.columns)
        n_features -= self.gdsc.features.shift
        n_drugs = len(self.df[self._colname_drug_id].unique())

        # Percentage of the possible drug/feature pairs that were tested
        N = float(n_drugs * n_features)
        if N == 0:
            ratio = 0
        else:
            ratio = float(self.n_tests) / N * 100
        try:
            ratio = easydev.precision(ratio, digit=2)
        except Exception:
            # Fixme: this is a hack for the infinite values but should not
            # happen...
            ratio = 0

        nsens = len(self.sensible_df)
        nres = len(self.resistant_df)
        p1, p2 = self._get_pval_range()
        f1, f2 = self._get_fdr_range()

        empty_line = ["", ""]  # trick to have an empty line
        rows = [
            ["Type of analysis", self.settings.analysis_type],
            ["Total number of possible drug/feature associations", int(N)],
            ["Total number of ANOVA tests performed", self.n_tests],
            ["Percentage of tests performed", ratio],
            empty_line,
            ["Total number of tested drugs", n_drugs],
            ["Total number of genomic features used", n_features],
            ["Total number of screened cell lines", self.n_celllines],
            ["MicroSatellite instability included as factor",
             self.settings.include_MSI_factor],
            empty_line,
            ["Total number of significant associations", nsens + nres],
            [" - sensitive", nsens],
            [" - resistant", nres],
            ["p-value significance threshold",
             self.settings.pvalue_threshold],
            ["FDR significance threshold", self.settings.FDR_threshold],
            ['Range of significant p-values',
             "[{:.4}, {:.4}]".format(p1, p2)],
            ["Range of significant % FDRs",
             '[{:.4} {:.4}]'.format(f1, f2)],
        ]

        df = pd.DataFrame({'text': [], 'value': []})
        for row in rows:
            df = self._df_append(df, row)
        return df

    def _get_pval_range(self):
        """Get pvalues range of the significant hits

        The range is computed on the rows of :attr:`sensible_df` and
        :attr:`resistant_df`. Taking the first N rows of :attr:`df`, as
        done previously, is only correct when the dataframe happens to be
        sorted with the significant hits first.

        :return: (min, max) tuple; (0., 0.) if there is no significant hit
        """
        nsens = len(self.sensible_df)
        nres = len(self.resistant_df)
        if nsens + nres == 0:
            return 0., 0.
        name = self.varname_pval
        data = pd.concat([self.sensible_df[name], self.resistant_df[name]])
        return data.min(), data.max()

    def _get_fdr_range(self):
        """Get FDR range of the significant hits

        Only the FDR threshold is applied here.

        :return: (min, max) tuple; (0., 0.) if no FDR is below the threshold
        """
        name = self.varname_qval
        data = self.df[name][(self.df[name] < self.settings.FDR_threshold)]
        if len(data) == 0:
            return 0., 0.
        return data.min(), data.max()

    def _set_sensible_df(self):
        """Set :attr:`sensible_df` and :attr:`resistant_df` from :attr:`df`

        Both require the FDR and the p-value to be below their thresholds.
        They differ by the sign of FEATURE_delta_MEAN_IC50.
        """
        fdr_ok = self.df['ANOVA_FEATURE_FDR'] < self.settings.FDR_threshold
        pval_ok = (self.df['ANOVA_FEATURE_pval'] <
                   self.settings.pvalue_threshold)
        significant = fdr_ok & pval_ok

        delta = self.df['FEATURE_delta_MEAN_IC50']
        # select sensible data set
        self.sensible_df = self.df[significant & (delta < 0)]
        # select resistant data set
        self.resistant_df = self.df[significant & (delta >= 0)]

    def get_significant_set(self):
        """Return significant hits (resistant and sensible)"""
        # a property that is long to compute
        # and may change if FDR changes.
        self._set_sensible_df()
        df = pd.concat([self.sensible_df, self.resistant_df])
        try:
            df.sort_values('ASSOC_ID', inplace=True)
        except AttributeError:
            # pandas versions that predate sort_values
            df.sort('ASSOC_ID', inplace=True)
        return df

    def _get_data(self, df_count_sensible, df_count_resistant):
        """Merge sensible and resistant counts into one sorted dataframe

        :param df_count_sensible: result of a groupby/count on the
            sensible hits
        :param df_count_resistant: same for the resistant hits
        :return: dataframe with columns 'sens assoc', 'res assoc' and
            'total', sorted by decreasing total and then by index
        """
        # we can drop all columns except one, which is renamed
        df1 = df_count_sensible['ASSOC_ID'].to_frame('sens assoc')
        df2 = df_count_resistant['ASSOC_ID'].to_frame('res assoc')

        # Outer join so that groups present in only one of them are kept
        df_count = df1.join(df2, how='outer')
        # and set NA to zero
        df_count.fillna(0, inplace=True)

        # let us add a new column with the total
        df_count['total'] = df_count['sens assoc'] + df_count['res assoc']

        # we want to sort by 'total' column and if equality by the name,
        # which is the index. So let us add the index temporarily as
        # a column, sort, and remove 'name' column afterwards
        df_count['name'] = df_count.index
        try:
            df_count.sort_values(by=['total', 'name'],
                                 ascending=[False, True], inplace=True)
        except AttributeError:
            # pandas versions that predate sort_values
            df_count.sort(columns=['total', 'name'],
                          ascending=[False, True], inplace=True)
        df_count.drop('name', axis=1, inplace=True)
        return df_count

    def get_drug_summary_data(self):
        """Return dataframe with drug summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()

        # group by drug
        colname = self._colname_drug_id
        df_count_sensible = self.sensible_df.groupby(colname).count()
        df_count_resistant = self.resistant_df.groupby(colname).count()

        return self._get_data(df_count_sensible, df_count_resistant)

    def drug_summary(self, top=50, fontsize=15, filename=None):
        """Return dataframe with significant drugs and plot figure

        :param fontsize: not used
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
        """
        df_count = self.get_drug_summary_data()

        if len(df_count):
            self._plot(df_count, 'drug', top)
            self.figtools.directory = self.settings.directory
            self.figtools.savefig(filename, size_inches=(12, 14),
                                  bbox_inches='tight')
        return df_count

    def get_feature_summary_data(self):
        """Return dataframe with feature summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()
        df_count_sensible = self.sensible_df.groupby('FEATURE').count()
        df_count_resistant = self.resistant_df.groupby('FEATURE').count()
        return self._get_data(df_count_sensible, df_count_resistant)

    def feature_summary(self, filename=None, top=50, fontsize=15):
        """Return dataframe with significant features and plot figure

        :param fontsize: not used
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
        """
        df_count = self.get_feature_summary_data()

        if len(df_count) > 0:
            self._plot(df_count, 'feature', top)
            self.figtools.directory = self.settings.directory
            # NOTE(review): drug_summary passes size_inches here; confirm
            # which keyword Savefig.savefig expects.
            self.figtools.savefig(filename, set_inches=(12, 14),
                                  bbox_inches='tight')
        return df_count

    def _plot(self, df_count, title_tag, top):
        """Used by drug_summary and feature_summary to plot the bar plot"""
        top = min(top, len(df_count))

        df = df_count.iloc[0:top][[u'sens assoc', u'res assoc']]
        labels = list(df.index)

        # add drug name
        if title_tag == 'drug' and len(self.drug_decode) > 0:
            for i, label in enumerate(labels):
                name = self.drug_decode.get_name(label)
                if name is not None:
                    labels[i] = "{}-{}".format(label, name)

        labels = [str(x).replace('_', ' ') for x in labels]
        # restrict size to first 30 characters
        labels = [x[0:30] for x in labels]

        # First row at the top: positions are in reverse order
        ind = list(range(len(labels)))
        ind.reverse()

        data1 = df['sens assoc'].values
        data2 = df['res assoc'].values

        pylab.figure(1)
        pylab.clf()
        pylab.barh(ind, data1, height=0.8, color='purple',
                   label='sensitivity')
        # resistance bars are stacked on the right of the sensitivity bars
        pylab.barh(ind, data2, height=0.8, color='orange', left=data1,
                   label='resistance')
        ax = pylab.gca()
        self.labels = labels
        ax.set_yticks([x + 0.5 for x in ind])
        ax.set_yticklabels(labels, fontsize=12)

        # Only integer ticks get a label
        xticks = ax.get_xticks()
        ax.set_xticklabels(
            [int(x) if divmod(x, 1)[1] == 0 else "" for x in xticks])

        pylab.grid()
        pylab.title(r"Top %s %s(s) most frequently " % (top, title_tag) +
                    "\nassociated with drug response",
                    fontsize=self.settings.fontsize/1.2)
        # Significant associations are those with an FDR *below* the
        # threshold (see _set_sensible_df), hence "<" in the label.
        pylab.xlabel(r'Number of significant associations (FDR %s %s %s) '
                     % ("$<$", self.settings.FDR_threshold, r"$\%$"),
                     fontsize=16)

        M = max(data1 + data2)
        ax.set_xlim([0, M + 1])
        pylab.legend(loc='lower right')
        try:
            pylab.tight_layout()
        except Exception:
            # layout adjustment is best effort only
            pass

    def get_significant_hits(self, show=True):
        """Return a summary of significant hits

        :param show: show a plot with the distribution of significant hits
        :return: dataframe indexed by FDR threshold (5 to 50 by step of 5)

        .. todo:: to finalise
        """
        fdrs = range(5, 50 + 1, 5)

        significants = []
        significant_meaningful = []
        strong_hits = []
        full_strong_hits = []

        MC1 = 1
        MC2 = 2
        mask2 = self.df['FEATURE_pos_logIC50_MEAN'] < MC1
        mask3 = self.df['FEATURE_pos_logIC50_MEAN'] < MC2
        mask4 = self.df['FEATURE_neg_logIC50_MEAN'] < MC1
        mask5 = self.df['FEATURE_neg_logIC50_MEAN'] < MC2
        # logical OR of the four boolean masks
        maskMC = mask2 | mask3 | mask4 | mask5

        for fdr in fdrs:
            # significant hits
            res = self.df['ANOVA_FEATURE_FDR'] < fdr
            significants.append(res.sum())

            # meaningful hits
            indices = np.logical_and(self.df['ANOVA_FEATURE_FDR'] < fdr,
                                     maskMC)
            significant_meaningful.append(indices.sum())

            subset = self.df.loc[indices]
            pos_strong = subset['FEATURE_pos_Glass_delta'] >= 1
            neg_strong = subset['FEATURE_neg_Glass_delta'] >= 1

            # meaningful strong hits (one of the two deltas)
            strong_hits.append(np.logical_or(pos_strong, neg_strong).sum())

            # meaningful full strong hits (both deltas)
            full_strong_hits.append(
                np.logical_and(pos_strong, neg_strong).sum())

        data = {
            'significants': significants,
            'full_strong_hits': full_strong_hits,
            'strong_hits': strong_hits,
            'significant_meaningful': significant_meaningful
        }

        df = pd.DataFrame(data, columns=[
            'significants', 'significant_meaningful', 'strong_hits',
            'full_strong_hits'
        ], index=fdrs)
        df.columns = [
            '1) significant', '2) 1 + meaningful', '3) 2 + strong',
            '4) 2+ very strong'
        ]

        if show is True:
            pylab.clf()
            ax = pylab.gca()
            df.plot(kind='bar', width=.8,
                    color=['r', 'gray', 'orange', 'black'], rot=0, ax=ax)
            pylab.grid()
        # original is 'aquamarine4','cyan2','cornflowerblue ','aquamarine'),
        return df

    def __str__(self):
        # DataFrame.info() prints the summary itself
        self.df.info()
        return ""

    def create_html_associations(self):
        """Create an HTML page for each significant association

        The name of the output HTML file is **<association id>.html**
        where association id is stored in :attr:`df`.
        """
        if self.verbose:
            print(
                "Creating individual HTML pages for each significant association"
            )
        df = self.get_significant_set()

        drugs = df['DRUG_ID'].values
        features = df['FEATURE'].values
        assocs = df['ASSOC_ID'].values
        fdrs = df['ANOVA_FEATURE_FDR'].values

        N = len(df)
        pb = Progress(N)

        # One shared instance is re-used for all associations
        html = Association(self, drug='dummy', feature='dummy', fdr='dummy',
                           company=self.company)

        for i in range(N):
            html.drug = drugs[i]
            html.feature = features[i]
            # The filename always starts with "a"
            assoc_name = str(assocs[i])
            if assoc_name.startswith("a"):
                html._filename = assoc_name + '.html'
            else:
                html._filename = "a" + assoc_name + '.html'
            html.fdr = fdrs[i]
            html.assoc_id = assocs[i]
            #html._init_report() # since we have one shared instance
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i + 1)
        if self.settings.animate:
            print("\n")

    def create_html_features(self):
        """Create an HTML page for each significant feature"""
        df = self.get_significant_set()
        groups = df.groupby('FEATURE')
        if self.verbose:
            print("Creating individual HTML pages for each feature")
        N = len(groups.indices.keys())
        pb = Progress(N)
        for i, feature in enumerate(groups.indices.keys()):
            # get the indices and therefore subgroup
            subdf = groups.get_group(feature)
            html = HTMLOneFeature(self, self.df, subdf, feature)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i + 1)
        if self.settings.animate:
            print("\n")

    def create_html_drugs(self):
        """Create an HTML page for each drug

        A page is created for every drug found in :attr:`df`, including
        those without any significant hit.
        """
        # group by drugs
        all_drugs = list(self.df['DRUG_ID'].unique())

        df = self.get_significant_set()
        groups = df.groupby('DRUG_ID')
        if self.verbose:
            print("Creating individual HTML pages for each drug")
        N = len(all_drugs)
        pb = Progress(N)
        for i, drug in enumerate(all_drugs):
            # get the indices and therefore subgroup
            if drug in groups.groups.keys():
                subdf = groups.get_group(drug)
            else:
                # no significant hit for this drug
                subdf = {}

            html = HTMLOneDrug(self, self.df, subdf, drug)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i + 1)
        if self.settings.animate:
            print("\n")

    def create_html_main(self, onweb=False):
        """Create HTML main document (summary)"""
        self._set_sensible_df()

        if self.verbose:
            print("Creating main HTML page in directory %s" %
                  (self.settings.directory))
        ReportMain(directory=self.settings.directory, verbose=self.verbose)

        # savefig is forced to True for this page and restored afterwards
        buffer_ = self.settings.savefig
        self.settings.savefig = True
        html = HTMLPageMain(self, 'index.html')
        html._init_report()  # created the directory
        html.create_report(onweb=onweb)
        self.settings.savefig = buffer_

    def create_html_manova(self, onweb=True):
        """Create summary table with all significant hits

        :param onweb: open browser with the created HTML page.
        """
        df = self.get_significant_set()
        page = HTMLPageMANOVA(self, df, self.company)
        page.create_report(onweb)

    def create_html_pages(self, onweb=True):
        """Create all HTML pages"""
        self.create_html_main(onweb=onweb)
        self.create_html_manova(onweb=False)
        self.create_html_drugs()
        self.create_html_features()
        self.create_html_associations()

    def onweb(self):
        """Call easydev's ``onweb`` on the main page (index.html)"""
        from easydev import onweb
        onweb(self.settings.directory + os.sep + 'index.html')
def add_settings(self):
    """Save the analysis inputs, settings, results and a re-run script.

    Writes the IC50 matrix, the genomic features and the DRUG_DECODE file
    into the ``INPUT`` sub-directory of ``self.directory``, the settings as
    a JSON file next to them, the full results table into
    ``OUTPUT/results.csv`` and a small ``code/rerun.py`` script under
    ``self.settings.directory``. The matching HTML snippets are stored in
    ``self.jinja`` under the keys ``ic50_file``, ``gf_file``,
    ``drug_decode`` and ``settings``.
    """
    # -------------------------------------- settings and INPUT files
    input_dir = self.directory + os.sep + 'INPUT'

    self.report.gdsc.ic50.to_csv(os.sep.join([input_dir, 'ANOVA_input.csv']))
    # The template receives the path relative to the report directory.
    self.jinja['ic50_file'] = os.sep.join(['INPUT', 'ANOVA_input.csv'])

    # the genomic features, which may be the default version or
    # one provided by the user. It may have been changed
    gf_filename = os.sep.join([input_dir, 'genomic_features.csv'])
    self.report.gdsc.features.to_csv(gf_filename)
    self.jinja['gf_file'] = (
        'Saved <a href="INPUT/genomic_features.csv">Genomic Features</a> '
        'file<br/> (possibly the default version).')

    # Always save DRUG_DECODE file even if empty
    # It may be interpreted in other pipelines or used for reproducibility
    self.report.drug_decode.to_csv(input_dir + os.sep + 'DRUG_DECODE.csv')
    html = 'Get <a href="INPUT/DRUG_DECODE.csv">Drug DECODE file</a>'
    if len(self.report.drug_decode) == 0:
        html += 'Note that DRUG_DECODE file was not provided (empty?).'
    self.jinja['drug_decode'] = html

    # Save settings as json file
    settings_path = os.sep.join([input_dir, 'settings.json'])
    self.settings.to_json(settings_path)
    self.jinja['settings'] = (
        'Get the settings as a <a href="INPUT/%s"> json file</a>.'
        % os.path.basename(settings_path))

    # Save all Results dataframe
    results_path = os.sep.join(
        [self.settings.directory, 'OUTPUT', 'results.csv'])
    ANOVAResults(self.report.df).to_csv(results_path)

    # Stand-alone script that re-runs the analysis from the saved inputs.
    # The template must keep its line breaks to be a valid Python script.
    code = """from gdsctools import *
import os

def getfile(filename, where='../INPUT'):
    return os.sep.join([where, filename])

# reback the IC50 and genomic features matrices
gdsc = ANOVA(getfile('%(ic50)s'), getfile('%(gf_filename)s'),
        getfile('DRUG_DECODE.csv'))
gdsc.settings.from_json(getfile('settings.json'))
gdsc.init()

# Analyse the data
results = gdsc.anova_all()

# Create the HTML report
r = ANOVAReport(gdsc, results)
r.create_html_pages(onweb=False)"""
    code = code % {
        'ic50': 'ANOVA_input.csv',
        'gf_filename': 'genomic_features.csv'
    }

    script_path = os.sep.join([self.settings.directory, 'code', 'rerun.py'])
    # Context manager guarantees the handle is closed even if write fails.
    with open(script_path, 'w') as fh:
        fh.write(code)
def create_data_packages_for_companies(self, companies=None):
    """Create a data package for each company found in the DrugDecode file.

    For every company and every genomic-feature file in
    ``self.gf_filenames`` an HTML report is built under
    ``<company_directory>/<company>/<tcga>``.

    :param companies: one company name (a string) or a list of names.
        When None, ``self.companies`` is used.
    :raises ValueError: if the resulting list of companies is empty.

    .. note:: DRUG_DECODE and IC50 inputs must be filtered to keep only
        WEBRELEASE=Y drugs and the drugs owned by the company.
    """
    # companies must be just one name (one string) or a list of strings
    # By default, takes all companies found in DrugDecode
    if isinstance(companies, str):
        companies = [companies]
    if companies is None:
        companies = self.companies

    if len(companies) == 0:
        raise ValueError("Could not find any companies in the DrugDecode file")

    # The main directory
    self.mkdir(self.company_directory)

    # Loop over all companies, retrieving the information built in the
    # analyse() method and selecting, for each TCGA, the information for
    # that company only (and public drugs)
    Ncomp = len(companies)
    for ii, company in enumerate(companies):
        print(purple("\n=========== Analysing company %s out of %s (%s)" %
                     (ii+1, Ncomp, company)))
        self.mkdir(self.company_directory + os.sep + company)

        # Handle each TCGA case separately
        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print(brown(" ------- building TCGA %s sub directory" % tcga))

            # Use the results kept in memory if any, otherwise fall back
            # on the results previously saved on disk (best effort).
            try:
                results_df = self.results[tcga].df.copy()
            except Exception:
                results_path = "%s/%s/OUTPUT/results.csv" % (
                    self.main_directory, tcga)
                results_df = ANOVAResults(results_path)

            # Make sure the results are formatted correctly
            results = ANOVAResults(results_df)

            # Get the DrugDecode information for that company only
            drug_decode_company = self.drug_decode.df.query(
                "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
            # Transform into a proper DrugDecode class for safety
            drug_decode_company = DrugDecode(drug_decode_company)
            allowed_drugs = drug_decode_company.df.index

            # Filter the results to keep only public drugs and that
            # company. Make sure the identifiers are integers.
            results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)
            mask = [x in allowed_drugs for x in results.df["DRUG_ID"]]
            # .loc replaces the .ix indexer, which newer pandas removed.
            results.df = results.df.loc[mask]

            # We read the IC50 again; the second reader is only tried
            # when the first one fails.
            try:
                self.ic50 = IC50(self.ic50_filename)
            except Exception:
                self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

            # And create an ANOVA instance. This is not to do the analysis
            # again but to hold various information
            an = ANOVA(self.ic50, gf_filename, drug_decode_company,
                       verbose=False)

            # Keep only the IC50 columns of the allowed drugs
            # (DataFrame.select was removed from newer pandas).
            keep = [drug in allowed_drugs for drug in an.ic50.df.columns]
            an.ic50.df = an.ic50.df.loc[:, keep]

            an.settings = ANOVASettings(**self.settings)
            an.init()
            an.settings.directory = os.sep.join(
                [self.company_directory, company, tcga])
            an.settings.analysis_type = tcga

            # Now we create the report
            self.report = ANOVAReport(an, results,
                                      drug_decode=drug_decode_company,
                                      verbose=self.verbose)
            self.report.company = company
            self.report.settings.analysis_type = tcga
            self.report.create_html_main(False)
            self.report.create_html_manova(False)
            self.report.create_html_features()
            self.report.create_html_drugs()
            self.report.create_html_associations()
def anova_one_drug(self, drug_id, animate=True, output='object'):
    """Compute the ANOVA of one drug across all selected features.

    Calls :meth:`anova_one_drug_one_feature` for each selected feature.

    :param str drug_id: a valid drug identifier (a column of the IC50
        dataframe).
    :param animate: shows the progress bar
    :param output: if set to 'object', the corrected dataframe is wrapped
        in an :class:`ANOVAResults` instance; any other value returns the
        dataframe itself.
    :return: a dataframe or an :class:`ANOVAResults` instance (see
        *output*). If no association could be computed, the empty
        dataframe is returned whatever *output* is.
    """
    # Skip the leading factor columns to keep only the features.
    n_factors = self.features.shift
    feature_matrix = self.features.df.copy()
    feature_matrix = feature_matrix[feature_matrix.columns[n_factors:]]

    # Keep only the features whose column sum is at least 3.
    # MSI could be used but is not, like in the original R code.
    # FIXME what about features with less than 3 zeros ?
    # TODO: MSI, tissues, name must always be kept
    enough = feature_matrix.sum(axis=0) >= 3
    selected_features = feature_matrix[feature_matrix.columns[enough]]

    # scan all features for a given drug
    assert drug_id in self.ic50.df.columns
    pb = Progress(len(selected_features.columns), 10)

    res = {}
    for i, feature in enumerate(selected_features.columns):
        # production=True: we require a dictionary rather than a
        # DataFrame for each call to anova_one_drug_one_feature
        this = self.anova_one_drug_one_feature(drug_id, feature,
                                               production=True)
        if this['ANOVA_FEATURE_pval'] is not None:
            res[feature] = this
        if animate is True:
            pb.animate(i + 1)

    # One row per feature once transposed.
    df = pd.DataFrame.from_records(res).T
    df = ANOVAResults().astype(df)
    if len(df) == 0:
        return df

    # append DRUG_NAME/DRUG_TARGET columns
    df = self.drug_decode.drug_annotations(df)

    # TODO: drop rows where ANOVA_FEATURE_PVAL is None
    df = self.add_pvalues_correction(df)
    if output != 'object':
        return df

    res = ANOVAResults(df, self.settings)
    res.settings = ANOVASettings(**self.settings)
    return res
def _create_summary_pages(self, main_directory, verbose=True, company=None):
    """Build the HTML summary page of all analyses found in a directory.

    Each sub-directory of *main_directory* whose name is in ``self.tcga``
    contributes one row: number of hits and number of involved
    proprietary/public compounds.

    :param main_directory: directory that contains one sub-directory per
        analysis.
    :param verbose: print each directory being processed.
    :param company: if provided, stored in the page as ``collaborator``.
    :return: the summary dataframe, sorted by decreasing number of hits.
    """
    # Read all directories in tissue_packages
    directories = glob.glob(main_directory + os.sep + '*')

    summary = []
    for directory in sorted(directories):
        tcga = directory.split(os.sep)[-1]
        if tcga not in self.tcga:
            continue
        if verbose:
            print(directory, tcga)

        # number of hits
        path = directory + os.sep + 'OUTPUT' + os.sep
        try:
            hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
        except Exception:
            # No usable drug summary for this analysis: keep an empty row.
            summary.append([tcga] + [None] * 5)
            continue
        total_hits = hits.total.sum()
        drug_involved = hits['Unnamed: 0'].unique()

        # The results are still loaded although not used below, as the
        # original code did (this fails early if they cannot be read).
        ANOVAResults(path + 'results.csv')

        # where to find the DRUG DECODE file. Should have been copied
        path = directory + os.sep + 'INPUT' + os.sep
        drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
        info = drug_decode.get_info()

        # .loc replaces the .ix indexer, which newer pandas removed.
        webrelease = drug_decode.df.loc[drug_involved].WEBRELEASE
        drug_inv_public = sum(webrelease == 'Y')
        drug_inv_prop = sum(webrelease != 'Y')

        summary.append([
            tcga, total_hits, drug_inv_prop, info['N_prop'],
            drug_inv_public, info['N_public']
        ])

    df = pd.DataFrame(summary)
    df.columns = [
        'Analysis name', 'Number of hits',
        'Number of involved proprietary compounds', 'out of',
        'Number of involved public', 'out of'
    ]

    try:
        df.sort_values(by="Number of hits", ascending=False, inplace=True)
    except AttributeError:
        # old pandas versions without DataFrame.sort_values
        df.sort("Number of hits", ascending=False, inplace=True)

    self.html_page = ReportMain(directory=main_directory,
                                filename='index.html',
                                template_filename='datapack_summary.html',
                                mode="summary")

    # Let us use our HTMLTable to add the HTML references
    self.html_table = HTMLTable(df)
    self.html_table.add_href('Analysis name', newtab=True, url=None,
                             suffix='/index.html')
    self.html_table.add_bgcolor('Number of hits')

    self.html_page.jinja['data_table'] = self.html_table.to_html(
        collapse_table=False)
    if company:
        self.html_page.jinja["collaborator"] = company
    self.html_page.write()

    return df
def create_data_packages_for_companies(self, companies=None):
    """Create a data package for each company found in the DrugDecode file.

    For every company and every genomic-feature file in
    ``self.gf_filenames`` an HTML report is built under
    ``<company_directory>/<company>/<tcga>``.

    :param companies: one company name (a string) or a list of names.
        When None, ``self.companies`` is used.
    :raises ValueError: if the resulting list of companies is empty.

    .. note:: DRUG_DECODE and IC50 inputs must be filtered to keep only
        WEBRELEASE=Y drugs and the drugs owned by the company.
    """
    # companies must be just one name (one string) or a list of strings
    # By default, takes all companies found in DrugDecode
    if isinstance(companies, str):
        companies = [companies]
    if companies is None:
        companies = self.companies

    if len(companies) == 0:
        raise ValueError(
            "Could not find any companies in the DrugDecode file")

    # The main directory
    self.mkdir(self.company_directory)

    # Loop over all companies, retrieving the information built in the
    # analyse() method and selecting, for each TCGA, the information for
    # that company only (and public drugs)
    Ncomp = len(companies)
    for ii, company in enumerate(companies):
        print(
            purple("\n=========== Analysing company %s out of %s (%s)" %
                   (ii + 1, Ncomp, company)))
        self.mkdir(self.company_directory + os.sep + company)

        # Handle each TCGA case separately
        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print(brown(" ------- building TCGA %s sub directory" % tcga))

            # Use the results kept in memory if any, otherwise fall back
            # on the results previously saved on disk (best effort).
            try:
                results_df = self.results[tcga].df.copy()
            except Exception:
                results_path = "%s/%s/OUTPUT/results.csv" % (
                    self.main_directory, tcga)
                results_df = ANOVAResults(results_path)

            # Make sure the results are formatted correctly
            results = ANOVAResults(results_df)

            # Get the DrugDecode information for that company only
            drug_decode_company = self.drug_decode.df.query(
                "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
            # Transform into a proper DrugDecode class for safety
            drug_decode_company = DrugDecode(drug_decode_company)
            allowed_drugs = drug_decode_company.df.index

            # Filter the results to keep only public drugs and that
            # company. Make sure the identifiers are integers.
            results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)
            mask = [x in allowed_drugs for x in results.df["DRUG_ID"]]
            # .loc replaces the .ix indexer, which newer pandas removed.
            results.df = results.df.loc[mask]

            # We read the IC50 again; the second reader is only tried
            # when the first one fails.
            try:
                self.ic50 = IC50(self.ic50_filename)
            except Exception:
                self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

            # And create an ANOVA instance. This is not to do the analysis
            # again but to hold various information
            an = ANOVA(self.ic50, gf_filename, drug_decode_company,
                       verbose=False)

            # Keep only the IC50 columns of the allowed drugs
            # (DataFrame.select was removed from newer pandas).
            keep = [drug in allowed_drugs for drug in an.ic50.df.columns]
            an.ic50.df = an.ic50.df.loc[:, keep]

            an.settings = ANOVASettings(**self.settings)
            an.init()
            an.settings.directory = os.sep.join(
                [self.company_directory, company, tcga])
            an.settings.analysis_type = tcga

            # Now we create the report
            self.report = ANOVAReport(an, results,
                                      drug_decode=drug_decode_company,
                                      verbose=self.verbose)
            self.report.company = company
            self.report.settings.analysis_type = tcga
            self.report.create_html_main(False)
            self.report.create_html_manova(False)
            self.report.create_html_features()
            self.report.create_html_drugs()
            self.report.create_html_associations()
def anova_all(self, animate=True, drugs=None, multicore=None):
    """Run all ANOVA tests for all drugs and all features.

    :param drugs: you may select a subset of drugs
    :param animate: shows the progress bar
    :param multicore: if set (truthy), the analysis is delegated to
        ``multicore_analysis`` and the per-drug buffer is not consulted.
    :return: an :class:`~gdsctools.anova_results.ANOVAResults`
        instance with the dataframe stored in an attribute
        called **df**. If the concatenated results are empty, the empty
        dataframe itself is returned.

    Calls :meth:`anova_one_drug` for each drug and concatenates all
    results together. Note that once all data are gathered,
    :meth:`add_pvalues_correction` is called to fill a new column with
    FDR corrections when the ``pvalue_correction_level`` setting is
    "global". An extra column named "ASSOC_ID" is also added with a
    unique identifier, numbered after sorting by ANOVA p-value.

    .. note:: A thorough comparison with version v17 gives the same FDR
        results (difference ~1e-6); Note however that the qvalue results
        differ by about 0.3% due to different smoothing in R and Python.
    """
    if self.verbose and len(self.individual_anova):
        print("Reusing some results from the buffer. "
              "To reset the buffer, call reset_buffer() method")

    # drop DRUG where number of IC50 (non-null) is below the threshold
    # axis=0 is default but we emphasize that sum is over
    # column (i.e. drug)
    vv = self.ic50.df.notnull().sum(axis=0)
    # FIXME: should be in one_drug_one_feature ??
    drug_names = vv.index[vv >= self.settings.minimum_nonna_ic50]

    # if user provided a list of drugs, use them:
    if drugs is not None:
        # todo: check validity of the drug names
        drug_names = drugs[:]

    pb = Progress(len(drug_names), 1)
    drug_names = list(drug_names)

    if animate is True:
        pb.animate(0)

    if multicore:
        # Note that here, we do not use the buffer
        multicore_analysis(self, drug_names, multicore)
    else:
        for i, drug_name in enumerate(drug_names):
            # Results already in the buffer are reused as is.
            if drug_name not in self.individual_anova:
                res = self.anova_one_drug(drug_name, animate=False,
                                          output="dataframe")
                self.individual_anova[drug_name] = res
            if animate is True:
                pb.animate(i + 1)
    print("\n")

    if len(self.individual_anova) == 0:
        return ANOVAResults()

    df = pd.concat(self.individual_anova, ignore_index=True)

    if len(df) == 0:
        return df

    # sort all data by ANOVA p-values
    try:
        df.sort_values("ANOVA_FEATURE_pval", inplace=True)
    except AttributeError:
        # old pandas versions without DataFrame.sort_values
        df.sort("ANOVA_FEATURE_pval", inplace=True)

    # all ANOVA have been computed individually for each drug and each
    # feature. Now, we need to compute the multiple testing corrections
    if self.settings.pvalue_correction_level == "global":
        df = self.add_pvalues_correction(df)

    # insert a unique identifier as first column
    df.insert(0, "ASSOC_ID", range(1, len(df) + 1))

    self.df = df
    # order the column names as defined in the __init__ method; a new
    # frame is built rather than resetting the index of a slice in place
    df = df[self.column_names].reset_index(drop=True)

    results = ANOVAResults()
    results.df = df
    results.settings = ANOVASettings(**self.settings)
    return results
def anova_one_drug(self, drug_id, animate=True, output="object"):
    """Compute the ANOVA of one drug across all selected features.

    Calls :meth:`anova_one_drug_one_feature` for each selected feature.

    :param str drug_id: a valid drug identifier (a column of the IC50
        dataframe).
    :param animate: shows the progress bar
    :param output: if set to "object", the corrected dataframe is wrapped
        in an :class:`ANOVAResults` instance; any other value returns the
        dataframe itself.
    :return: a dataframe or an :class:`ANOVAResults` instance (see
        *output*). If no association could be computed, the empty
        dataframe is returned whatever *output* is.
    """
    # Skip the leading factor columns to keep only the features.
    n_factors = self.features.shift
    feature_matrix = self.features.df.copy()
    feature_matrix = feature_matrix[feature_matrix.columns[n_factors:]]

    # Keep only the features whose column sum is at least 3.
    # MSI could be used but is not, like in the original R code.
    # FIXME what about features with less than 3 zeros ?
    # TODO: MSI, tissues, name must always be kept
    enough = feature_matrix.sum(axis=0) >= 3
    selected_features = feature_matrix[feature_matrix.columns[enough]]

    # scan all features for a given drug
    assert drug_id in self.ic50.df.columns
    pb = Progress(len(selected_features.columns), 10)

    res = {}
    for i, feature in enumerate(selected_features.columns):
        # production=True: we require a dictionary rather than a
        # DataFrame for each call to anova_one_drug_one_feature
        this = self.anova_one_drug_one_feature(drug_id, feature,
                                               production=True)
        if this["ANOVA_FEATURE_pval"] is not None:
            res[feature] = this
        if animate is True:
            pb.animate(i + 1)

    # One row per feature once transposed.
    df = pd.DataFrame.from_records(res).T
    df = ANOVAResults().astype(df)
    if len(df) == 0:
        return df

    # append DRUG_NAME/DRUG_TARGET columns
    df = self.drug_decode.drug_annotations(df)

    # TODO: drop rows where ANOVA_FEATURE_PVAL is None
    df = self.add_pvalues_correction(df)
    if output != "object":
        return df

    res = ANOVAResults(df)
    res.settings = ANOVASettings(**self.settings)
    return res
def _create_report(self, onweb=True):
    """Fill ``self.jinja`` with the content of the main summary page.

    Builds the textual summary, the volcano plot, the feature/drug summary
    CSV files, the drug, feature and COSMIC tables and finally saves the
    inputs, settings, results and a ``code/rerun.py`` script.

    :param onweb: not used in this method body.
    """
    # A summary table
    diag = self.report.diagnostics()
    pieces = []
    for _, row in diag.iterrows():
        if len(row.text) == 0 and len(row.value) == 0:
            pieces.append('----<br/>')
        else:
            pieces.append(row.text + ": " + str(row.value) + "<br/>")
    self.jinja['summary'] = ''.join(pieces)

    print('Creating volcano plots')
    # this can be pretty slow, so keep only a subset of the associations
    # (see VolcanoANOVA.selector) to get an idea of the distribution
    v = VolcanoANOVA(self.report.df, settings=self.settings)
    v.selector(v.df, 1500, 1500, inplace=True)
    v.volcano_plot_all()
    v.savefig_and_js("volcano_all_js")

    self.jinja['volcano'] = (
        ' <h3></h3> <a href="volcano_all_js.html"> '
        '<img alt="volcano plot for all associations" '
        'src="volcano_all_js.png"> </a> <br/> '
        '<p> A javascript version is available '
        '<a href="volcano_all_js.html">here</a> '
        '( or click on the image).</p> ')

    # MANOVA link
    N = len(self.report.get_significant_set())
    self.jinja['manova'] = (
        ' There were %(N)s significant associations found. '
        'All significant associations have been gatherered in the '
        'following link: <br/><a href="manova.html">manova results</a>. '
    ) % {'N': N}

    # feature summary
    df_features = self.report.feature_summary("feature_summary.png")
    filename = 'OUTPUT' + os.sep + 'features_summary.csv'
    df_features.to_csv(self.directory + os.sep + filename, sep=',')

    # drug summary: the list of untested drugs is currently not reported
    self.jinja['drug_not_tested'] = ""

    df_drugs = self.report.drug_summary(filename="drug_summary.png")
    get_name = self.report.drug_decode.get_name
    if len(self.report.drug_decode.df) > 0:
        df_drugs.index = [x + "-" + get_name(x) for x in df_drugs.index]
    filename = 'OUTPUT' + os.sep + 'drugs_summary.csv'
    df_drugs.to_csv(self.directory + os.sep + filename, sep=',')

    # --------------------------- Create table with links to all drugs
    groups = self.report.df.groupby('DRUG_ID')
    try:
        df = groups.mean()['ANOVA_FEATURE_FDR'].sort_values()
    except AttributeError:
        # old pandas versions without sort_values
        # note double brackets for python3.3
        df = groups.mean()[['ANOVA_FEATURE_FDR']].sort()
    df = df.reset_index()  # get back the Drug id in the dframe columns

    # let us add also the drug name
    df = self.report.drug_decode.drug_annotations(df)

    # let us also add number of associations computed
    df['Number of associations computed'] = [
        len(groups.groups[k]) for k in df.DRUG_ID]

    significant = self.report.get_significant_set().groupby(
        'DRUG_ID').groups
    df['hits'] = [
        len(significant[drug]) if drug in significant else 0
        for drug in df['DRUG_ID'].values
    ]

    table = HTMLTable(df, 'drugs')
    table.add_href('DRUG_ID')
    table.df.columns = [
        x.replace('ANOVA_FEATURE_FDR', 'mean FEATURE ANOVA FDR')
        for x in table.df.columns
    ]
    table.add_bgcolor('hits', mode='max',
                      cmap=cmap_builder('white', 'orange', 'red'))
    self.jinja['drug_table'] = table.to_html(escape=False, header=True,
                                             index=False)

    # ---------------------- Create full table with links to all features
    df = pd.DataFrame({'FEATURE': self.report.df['FEATURE'].unique()})
    try:
        df.sort_values(by='FEATURE', inplace=True)
    except AttributeError:
        # old pandas versions without DataFrame.sort_values
        df.sort('FEATURE', inplace=True)

    significant = self.report.get_significant_set().groupby(
        'FEATURE').groups
    df['hits'] = [
        len(significant[feature]) if feature in significant else 0
        for feature in df['FEATURE'].values
    ]

    table = HTMLTable(df, 'features')
    table.sort('hits', ascending=False)
    table.add_href('FEATURE')
    table.add_bgcolor('hits', mode='max',
                      cmap=cmap_builder('white', 'orange', 'red'))
    self.jinja['feature_table'] = table.to_html(escape=False, header=True,
                                                index=False)

    # -------------------------------------- COSMIC table for completeness
    colnames = self.report.gdsc.features._special_names
    df = self.report.gdsc.features.df[colnames]

    # TODO
    # add other columns if possible e.g., GDSC1, GDSC2, TCGA
    df = df.reset_index()
    table = HTMLTable(df)
    url = "http://cancer.sanger.ac.uk/cell_lines/sample/overview?id="
    table.add_href('COSMIC_ID', url=url, newtab=True)
    self.jinja['cosmic_table'] = table.to_html()

    # -------------------------------------- settings and INPUT files
    input_dir = self.directory + os.sep + 'INPUT'
    self.report.gdsc.ic50.to_csv(os.sep.join([input_dir, 'ANOVA_input.csv']))
    # The template receives the path relative to the report directory.
    self.jinja['ic50_file'] = os.sep.join(['INPUT', 'ANOVA_input.csv'])

    # the genomic features, which may be the default version or
    # one provided by the user. It may have been changed
    gf_filename = os.sep.join([input_dir, 'genomic_features.csv'])
    self.report.gdsc.features.to_csv(gf_filename)
    self.jinja['gf_file'] = (
        'Saved <a href="INPUT/genomic_features.csv">Genomic Features</a> '
        'file<br/> (possibly the default version).')

    # Always save DRUG_DECODE file even if empty
    # It may be interpreted in other pipelines or used for reproducibility
    self.report.drug_decode.to_csv(input_dir + os.sep + 'DRUG_DECODE.csv')
    html = 'Get <a href="INPUT/DRUG_DECODE.csv">Drug DECODE file</a>'
    if len(self.report.drug_decode) == 0:
        html += 'Note that DRUG_DECODE file was not provided (empty?).'
    self.jinja['drug_decode'] = html

    # Save settings as json file
    settings_path = os.sep.join([input_dir, 'settings.json'])
    self.settings.to_json(settings_path)
    self.jinja['settings'] = (
        'Get the settings as a <a href="INPUT/%s"> json file</a>.'
        % os.path.basename(settings_path))

    # Save all Results dataframe
    results_path = os.sep.join(
        [self.settings.directory, 'OUTPUT', 'results.csv'])
    ANOVAResults(self.report.df).to_csv(results_path)

    # Stand-alone script that re-runs the analysis from the saved inputs.
    # The template must keep its line breaks to be a valid Python script.
    code = """from gdsctools import *
import os

def getfile(filename, where='../INPUT'):
    return os.sep.join([where, filename])

# reback the IC50 and genomic features matrices
gdsc = ANOVA(getfile('%(ic50)s'), getfile('%(gf_filename)s'),
        getfile('DRUG_DECODE.csv'))
gdsc.settings.from_json(getfile('settings.json'))
gdsc.init()

# Analyse the data
results = gdsc.anova_all()

# Create the HTML report
r = ANOVAReport(gdsc, results)
r.create_html_pages(onweb=False)"""
    code = code % {
        'ic50': 'ANOVA_input.csv',
        'gf_filename': 'genomic_features.csv'
    }

    script_path = os.sep.join([self.settings.directory, 'code', 'rerun.py'])
    # Context manager guarantees the handle is closed even if write fails.
    with open(script_path, 'w') as fh:
        fh.write(code)
def anova_all(self, animate=True, drugs=None):
    """Run all ANOVA tests for all drugs and all features.

    :param drugs: you may select a subset of drugs
    :param animate: shows the progress bar
    :return: an :class:`~gdsctools.anova_results.ANOVAResults`
        instance with the dataframe stored in an attribute
        called **df**. If the concatenated results are empty, the empty
        dataframe itself is returned.

    Loops over all drugs calling :meth:`anova_one_drug` for each
    drug and concatenating all results together. Note that once all
    data are gathered, an extra column containing the FDR corrections
    is added to the dataframe using :meth:`add_pvalues_correction`
    method when the ``pvalue_correction_level`` setting is 'global'.
    An extra column named "ASSOC_ID" is also added with a unique
    identifier, numbered after sorting by ANOVA p-value.

    .. note:: A thorough comparison with version v17 gives the same FDR
        results (difference ~1e-6); Note however that the qvalue results
        differ by about 0.3% due to different smoothing in R and Python.
    """
    # drop DRUG where number of IC50 (non-null) is below the threshold
    # axis=0 is default but we emphasize that sum is over
    # column (i.e. drug)
    vv = self.ic50.df.notnull().sum(axis=0)
    # FIXME: should be in one_drug_one_feature ??
    drug_names = vv.index[vv >= self.settings.minimum_nonna_ic50]

    # if user provided a list of drugs, use them:
    if drugs is not None:
        # todo: check validity of the drug names
        drug_names = drugs[:]

    pb = Progress(len(drug_names), 1)
    drug_names = list(drug_names)
    # The drugs are processed in a random order.
    pylab.shuffle(drug_names)
    if animate is True:
        pb.animate(0)

    for i, drug_name in enumerate(drug_names):
        # Results already in the buffer are reused as is.
        if drug_name not in self.individual_anova:
            res = self.anova_one_drug(drug_name, animate=False,
                                      output='dataframe')
            self.individual_anova[drug_name] = res
        if animate is True:
            pb.animate(i + 1)
    print("\n")

    if len(self.individual_anova) == 0:
        return ANOVAResults()

    df = pd.concat(self.individual_anova, ignore_index=True)

    if len(df) == 0:
        return df

    # sort all data by ANOVA p-values
    try:
        df.sort_values('ANOVA_FEATURE_pval', inplace=True)
    except AttributeError:
        # old pandas versions without DataFrame.sort_values
        df.sort('ANOVA_FEATURE_pval', inplace=True)

    # all ANOVA have been computed individually for each drug and each
    # feature. Now, we need to compute the multiple testing corrections
    if self.settings.pvalue_correction_level == 'global':
        df = self.add_pvalues_correction(df)

    # insert a unique identifier as first column
    df.insert(0, 'ASSOC_ID', range(1, len(df) + 1))

    self.df = df
    # order the column names as defined in the __init__ method; a new
    # frame is built rather than resetting the index of a slice in place
    df = df[self.column_names].reset_index(drop=True)

    results = ANOVAResults()
    results.df = df
    results.settings = ANOVASettings(**self.settings)
    return results
def __init__(self, ic50, genomic_features=None, drug_decode=None,
             verbose=True, low_memory=True, set_media_factor=False):
    """.. rubric:: Constructor

    :param DataFrame IC50: a dataframe with the IC50. Rows should be
        the COSMIC identifiers and columns should be the Drug names
        (or identifiers)
    :param features: another dataframe with rows as in the IC50 matrix
        and columns as features. The first 3 columns must be named
        specifically to hold tissues, MSI (see format). If not provided,
        the default data set of the package is used.
    :param drug_decode: a 3 column CSV file with drug's name and targets
        see :mod:`readers` for more information.
    :param verbose: verbosity flag; it is tested as a boolean in this
        constructor.
    :param low_memory: stored in the ``low_memory`` setting.
    :param set_media_factor: if True and no MEDIA factor was found in
        the genomic features, the factor is populated.

    The attribute :attr:`settings` contains specific settings related
    to the analysis or visualisation.
    """
    self.verbose = verbose
    self._init_called = False

    # We first need to read the IC50 using a dedicated reader
    self.ic50 = readers.IC50(ic50)

    # Create a dictionary version of the data
    # to be accessed per drug where NA have already been
    # removed. Each drug is a dictionary with 2 keys:
    # Y for the data and indices for the cosmicID where
    # there is an IC50 measured.
    ic50_parse = self.ic50.df.copy().unstack().dropna()
    # .loc replaces the .ix indexer, which newer pandas removed.
    self.ic50_dict = {
        d: {'indices': ic50_parse.loc[d].index,
            'Y': ic50_parse.loc[d].values}
        for d in self.ic50.drugIds
    }

    # Reads features if provided, otherwise use a default data set
    if genomic_features is None:
        # Reads default version provided with the package
        self.features = readers.GenomicFeatures()
    else:
        self.features = readers.GenomicFeatures(genomic_features)

    if self.features.found_media is False and \
            set_media_factor is True:
        if self.verbose:
            print('Populating MEDIA Factor in the Genomic Feature matrix')
        self.features.fill_media_factor()

    #: a CSV with 3 columns used in the report
    self.read_drug_decode(drug_decode)

    # create the multiple testing factory used in anova_all()
    self.multiple_testing = MultipleTesting()

    # We prune the genomic features by setting the cosmic ids of
    # the features to be those of the cosmic ids of the IC50. See
    # readers module. This assignment prunes the features dataframe
    # automatically. This fails if a cosmic identifier is not
    # found in the features' cosmic ids, so let us catch the error
    # beforehand to give a warning and drop the unknown identifiers.
    unknowns = set(self.ic50.cosmicIds).difference(
        set(self.features.cosmicIds))

    if len(unknowns) > 0:
        print("WARNING: " +
              "%s cosmic identifiers in your IC50 " % len(unknowns) +
              "could not be found in the genomic feature matrix. " +
              "They will be dropped. Consider using a user-defined " +
              "genomic features matrix")
        self.ic50.drop_cosmic(list(unknowns))

    self.features.cosmicIds = self.ic50.cosmicIds

    #: an instance of :class:`~gdsctools.settings.ANOVASettings`
    self.settings = ANOVASettings()
    self.settings.low_memory = low_memory

    # alias to all column names to store results
    # cast to list (Python3).
    self.column_names = list(ANOVAResults().mapping.keys())

    # skip assoc_id for now
    self._odof_dict = {name: None for name in self.column_names}

    # a cache to store ANOVA results for each drug
    self.individual_anova = {}

    # must be called if ic50 or features are changed.
    self.init()
class ANOVAReport(object):
    """Class used to interpret the results and create final HTML report

    Results is a data structure returned by :meth:`ANOVA.anova_all`.

    ::

        from gdsctools import *

        # Perform the analysis itself to get a set of results (dataframe)
        an = ANOVA(ic50_test)
        results = an.anova_all()

        # now, we can create the report.
        r = ANOVAReport(gdsc=an, results=results)

        # we can tune some settings
        r.settings.pvalue_threshold = 0.001
        r.settings.FDR_threshold = 28
        r.settings.directory = 'testing'
        r.create_html_pages()

    .. rubric:: Significant association

    An association is significant if

      - The field *ANOVA_FEATURE_FDR* must be < FDR_threshold
      - The field *ANOVA_FEATURE_pval* must be < pvalue_threshold

    It is then labelled **sensible** if *FEATURE_delta_MEAN_IC50* is
    below 0, otherwise it is **resistant**.

    """
    def __init__(self, gdsc, results=None, sep="\t", drug_decode=None,
                 verbose=True):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to
            report
        :param results: the results returned by :meth:`ANOVA.anova_all`.
            If not provided, the ANOVA is run on the fly.
        :param sep: not used by the constructor; kept so that existing
            callers keep working.
        :param drug_decode: optional input given to :class:`DrugDecode`.
            When provided, it takes precedence over ``gdsc.drug_decode``.
        :param verbose: print progress messages while creating the pages.

        """
        self.verbose = verbose
        self.figtools = Savefig(verbose=False)
        self.gdsc = gdsc

        if results is None:
            results = gdsc.anova_all()
        # this does a copy and sanity check
        self.df = ANOVAResults(results).df

        # Make sure the DRUG are integers
        self.df.DRUG_ID = self.df.DRUG_ID.astype(int)

        # start from default settings and copy those of the gdsc instance
        self.settings = ANOVASettings()
        for k, v in gdsc.settings.items():
            self.settings[k] = v

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # maybe there was no drug_decode in the gdsc parameter,
        # so a user may have provided a file, in which case, we need
        # to update the content of the drug decoder.
        if len(gdsc.drug_decode) == 0 and drug_decode is None:
            warnings.warn("No drug name or target will be populated. "
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        elif drug_decode is not None:
            # Read a file
            self.drug_decode = DrugDecode(drug_decode)
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)

        # Infinite values are set to zero. This is a best-effort cleanup:
        # a failure is reported but must not prevent the report creation.
        try:
            self.df = self.df.replace({np.inf: 0, -np.inf: 0})
        except Exception as err:
            warnings.warn(
                "Could not replace infinite values in the results: %s" % err)

        # create some data (sensible_df and resistant_df attributes)
        self._set_sensible_df()

        self.company = None

        # just to create the directory
        # ReportMain(directory=self.settings.directory, verbose=self.verbose)

    def _get_ndrugs(self):
        return len(self.df[self._colname_drug_id].unique())
    n_drugs = property(_get_ndrugs, doc="return number of drugs")

    def _get_ntests(self):
        return len(self.df.index)
    n_tests = property(_get_ntests)

    def _get_ncelllines(self):
        return len(self.gdsc.features.df.index)
    n_celllines = property(_get_ncelllines,
                           doc="return number of cell lines")

    def _df_append(self, df, data):
        """Add *data* as a new row labelled ``len(df)`` and return *df*

        The dataframe is modified in place. ``.loc`` replaces the ``.ix``
        indexer, which was removed from pandas.
        """
        count = len(df)
        df.loc[count] = data
        return df

    def diagnostics(self):
        """Return summary of the analysis (dataframe)

        The returned dataframe has two columns (*text* and *value*) and
        one row per reported quantity. The sensible and resistant sets
        are recomputed first so that the current thresholds are used.
        """
        self._set_sensible_df()

        n_features = len(self.gdsc.features.df.columns)
        n_features -= self.gdsc.features.shift
        n_drugs = self.n_drugs

        N = float(n_drugs * n_features)
        if N == 0:
            ratio = 0
        else:
            ratio = float(self.n_tests) / N * 100

        try:
            ratio = easydev.precision(ratio, digit=2)
        except Exception:
            # Fixme: this is a hack for the infinite values but should not
            # happen...
            ratio = 0

        nsens = len(self.sensible_df)
        nres = len(self.resistant_df)
        p1, p2 = self._get_pval_range()
        f1, f2 = self._get_fdr_range()

        # The table is built in one go rather than row by row.
        # Rows made of two empty strings are a trick to have an empty line.
        rows = [
            ["Type of analysis", self.settings.analysis_type],
            ["Total number of possible drug/feature associations", int(N)],
            ["Total number of ANOVA tests performed", self.n_tests],
            ["Percentage of tests performed", ratio],
            ["", ""],
            ["Total number of tested drugs", n_drugs],
            ["Total number of genomic features used", n_features],
            ["Total number of screened cell lines", self.n_celllines],
            ["MicroSatellite instability included as factor",
             self.settings.include_MSI_factor],
            ["", ""],
            ["Total number of significant associations", nsens + nres],
            [" - sensitive", nsens],
            [" - resistant", nres],
            ["p-value significance threshold",
             self.settings.pvalue_threshold],
            ["FDR significance threshold", self.settings.FDR_threshold],
            ['Range of significant p-values',
             "[{:.4}, {:.4}]".format(p1, p2)],
            # same layout as the p-value range (comma-separated bounds)
            ["Range of significant % FDRs",
             "[{:.4}, {:.4}]".format(f1, f2)],
        ]
        return pd.DataFrame(rows, columns=['text', 'value'])

    def _get_pval_range(self):
        """Get pvalues range of the significant hits

        The p-values are read from the sensible and resistant sets
        themselves, so the result does not depend on the ordering or on
        the index of :attr:`df`.

        :return: (min, max) tuple; (0., 0.) if there is no significant hit
        """
        name = self.varname_pval
        pvalues = pd.concat([self.sensible_df[name],
                             self.resistant_df[name]])
        if len(pvalues) == 0:
            return 0., 0.
        return pvalues.min(), pvalues.max()

    def _get_fdr_range(self):
        """Get FDR range of the hits below the FDR threshold

        Only the FDR threshold is applied here (not the p-value one).

        :return: (min, max) tuple; (0., 0.) if no FDR is below threshold
        """
        fdr = self.df[self.varname_qval]
        data = fdr[fdr < self.settings.FDR_threshold]
        if len(data) == 0:
            return 0., 0.
        return data.min(), data.max()

    def _set_sensible_df(self):
        """Set the :attr:`sensible_df` and :attr:`resistant_df` attributes

        Both are subsets of :attr:`df` passing the FDR and p-value
        thresholds; the sign of FEATURE_delta_MEAN_IC50 tells them apart.
        """
        below_fdr = self.df['ANOVA_FEATURE_FDR'] < self.settings.FDR_threshold
        below_pval = (self.df['ANOVA_FEATURE_pval'] <
                      self.settings.pvalue_threshold)
        significant = below_fdr & below_pval
        delta = self.df['FEATURE_delta_MEAN_IC50']

        # select sensible data set
        self.sensible_df = self.df[significant & (delta < 0)]
        # select resistant data set
        self.resistant_df = self.df[significant & (delta >= 0)]

    def get_significant_set(self):
        """Return significant hits (resistant and sensible)"""
        # a property that is long to compute
        # and may change if FDR changes.
        self._set_sensible_df()
        df = pd.concat([self.sensible_df, self.resistant_df])
        try:
            df.sort_values('ASSOC_ID', inplace=True)
        except AttributeError:
            # older pandas versions do not provide sort_values
            df.sort('ASSOC_ID', inplace=True)
        return df

    def _get_data(self, df_count_sensible, df_count_resistant):
        """Merge sensible and resistant counts into one dataframe

        :param df_count_sensible: grouped counts of the sensible set
        :param df_count_resistant: grouped counts of the resistant set
        :return: dataframe with the columns 'sens assoc', 'res assoc' and
            'total', sorted by decreasing total and then by index.
        """
        # we can drop all columns except one, which is renamed as count.
        # rename() returns new objects so the inputs are left untouched.
        sens = df_count_sensible[['ASSOC_ID']].rename(
            columns={'ASSOC_ID': 'sens assoc'})
        res = df_count_resistant[['ASSOC_ID']].rename(
            columns={'ASSOC_ID': 'res assoc'})

        # outer join to keep entries found in only one of the two sets,
        # and set the resulting NA to zero
        df_count = sens.join(res, how='outer').fillna(0)

        # let us add a new column with the total
        df_count['total'] = df_count['sens assoc'] + df_count['res assoc']

        # we want to sort by 'total' column and if equality by the name,
        # which is the index. So let us add the index temporarily as
        # a column, sort, and remove 'name' column afterwards
        df_count['name'] = df_count.index
        try:
            df_count.sort_values(by=['total', 'name'],
                                 ascending=[False, True], inplace=True)
        except AttributeError:
            # older pandas versions do not provide sort_values
            df_count.sort(columns=['total', 'name'],
                          ascending=[False, True], inplace=True)

        df_count.drop('name', axis=1, inplace=True)
        return df_count

    def get_drug_summary_data(self):
        """Return dataframe with drug summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()

        # group by drug
        colname = self._colname_drug_id
        df_count_sensible = self.sensible_df.groupby(colname).count()
        df_count_resistant = self.resistant_df.groupby(colname).count()

        return self._get_data(df_count_sensible, df_count_resistant)

    def drug_summary(self, top=50, fontsize=15, filename=None):
        """Return dataframe with significant drugs and plot figure

        :param fontsize: currently not used
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
        """
        df_count = self.get_drug_summary_data()

        # nothing to plot (or save) if there is no significant hit
        if len(df_count):
            self._plot(df_count, 'drug', top)
            self.figtools.directory = self.settings.directory
            self.figtools.savefig(filename, size_inches=(12, 14),
                                  bbox_inches='tight')
        return df_count

    def get_feature_summary_data(self):
        """Return dataframe with feature summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()
        df_count_sensible = self.sensible_df.groupby('FEATURE').count()
        df_count_resistant = self.resistant_df.groupby('FEATURE').count()
        return self._get_data(df_count_sensible, df_count_resistant)

    def feature_summary(self, filename=None, top=50, fontsize=15):
        """Return dataframe with significant features and plot figure

        :param fontsize: currently not used
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
        """
        df_count = self.get_feature_summary_data()

        # nothing to plot (or save) if there is no significant hit
        if len(df_count) > 0:
            self._plot(df_count, 'feature', top)
            self.figtools.directory = self.settings.directory
            # same keyword as in drug_summary (size_inches)
            self.figtools.savefig(filename, size_inches=(12, 14),
                                  bbox_inches='tight')
        return df_count

    def _plot(self, df_count, title_tag, top):
        """Used by drug_summary and feature_summary to plot the bar plot

        :param df_count: dataframe returned by :meth:`_get_data`
        :param title_tag: 'drug' or 'feature'; used in the title. With
            'drug', the drug name is appended to the labels if known.
        :param top: max number of rows to show

        The labels are also stored in the :attr:`labels` attribute.
        """
        if top > len(df_count):
            top = len(df_count)

        df = df_count.iloc[0:top][[u'sens assoc', u'res assoc']]
        labels = list(df.index)

        # add drug name
        if title_tag == 'drug' and len(self.drug_decode) > 0:
            for i, label in enumerate(labels):
                name = self.drug_decode.get_name(label)
                if name is not None:
                    labels[i] = "{}-{}".format(label, name)

        labels = [str(x).replace('_', ' ') for x in labels]
        # restrict size to first 30 characters
        labels = [x[0:30] for x in labels]

        # first row at the top of the figure; valid for Python 2 and 3
        ind = list(reversed(range(len(labels))))

        data1 = df['sens assoc'].values
        data2 = df['res assoc'].values

        pylab.figure(1)
        pylab.clf()
        pylab.barh(ind, data1, height=0.8, color='purple',
                   label='sensitivity')
        pylab.barh(ind, data2, height=0.8, color='orange',
                   left=data1, label='resistance')
        ax = pylab.gca()
        self.labels = labels
        ax.set_yticks([x + 0.5 for x in ind])
        ax.set_yticklabels(labels, fontsize=12)

        # show integer ticks only
        xticks = ax.get_xticks()
        ax.set_xticklabels(
            [int(x) if divmod(x, 1)[1] == 0 else "" for x in xticks])
        pylab.grid()
        pylab.title(r"Top %s %s(s) most frequently " % (top, title_tag) +
                    "\nassociated with drug response",
                    fontsize=self.settings.fontsize/1.2)
        # significant associations have an FDR below the threshold
        pylab.xlabel(r'Number of significant associations (FDR %s %s %s) '
                     % ("$<$", self.settings.FDR_threshold, r"$\%$"),
                     fontsize=18)

        M = max(data1 + data2)
        ax.set_xlim([0, M+1])
        pylab.legend(loc='lower right')
        try:
            pylab.tight_layout()
        except Exception:
            # cosmetic only: keep the figure even if the layout
            # could not be tightened
            pass

    def get_significant_hits(self, show=True):
        """Return a summary of significant hits

        :param show: show a plot with the distribution of significant hits
        :return: dataframe indexed by FDR threshold (5 to 50 by step of 5)
            with the number of hits in each category.

        .. todo:: to finalise
        """
        fdrs = range(5, 50+1, 5)

        significants = []
        significant_meaningful = []
        strong_hits = []
        full_strong_hits = []

        MC1 = 1
        MC2 = 2
        pos_mean = self.df['FEATURE_pos_logIC50_MEAN']
        neg_mean = self.df['FEATURE_neg_logIC50_MEAN']
        # element-wise OR of the four conditions
        maskMC = ((pos_mean < MC1) | (pos_mean < MC2) |
                  (neg_mean < MC1) | (neg_mean < MC2))

        for fdr in fdrs:
            # significant hits
            below_fdr = self.df['ANOVA_FEATURE_FDR'] < fdr
            significants.append(below_fdr.sum())

            # meaningful hits
            indices = below_fdr & maskMC
            significant_meaningful.append(indices.sum())

            pos_strong = self.df.loc[indices, 'FEATURE_pos_Glass_delta'] >= 1
            neg_strong = self.df.loc[indices, 'FEATURE_neg_Glass_delta'] >= 1

            # meaningful strong hits
            strong_hits.append((pos_strong | neg_strong).sum())

            # meaningful full strong hits
            full_strong_hits.append((pos_strong & neg_strong).sum())

        data = {'significants': significants,
                'full_strong_hits': full_strong_hits,
                'strong_hits': strong_hits,
                'significant_meaningful': significant_meaningful}

        df = pd.DataFrame(data, columns=['significants',
                                         'significant_meaningful',
                                         'strong_hits',
                                         'full_strong_hits'],
                          index=fdrs)
        df.columns = ['1) significant', '2) 1 + meaningful',
                      '3) 2 + strong', '4) 2+ very strong']

        if show is True:
            pylab.clf()
            ax = pylab.gca()
            df.plot(kind='bar', width=.8,
                    color=['r', 'gray', 'orange', 'black'],
                    rot=0, ax=ax)
            pylab.grid()
        # original is 'aquamarine4','cyan2','cornflowerblue ','aquamarine'),
        return df

    def __str__(self):
        # df.info() prints the summary itself; nothing is left to return
        self.df.info()
        return ""

    def create_html_associations(self):
        """Create an HTML page for each significant association

        The name of the output HTML file is **<association id>.html**
        where association id is stored in :attr:`df`. The letter *a* is
        prepended if the identifier does not start with it.
        """
        if self.verbose:
            print("Creating individual HTML pages for each significant association")
        df = self.get_significant_set()

        drugs = df['DRUG_ID'].values
        features = df['FEATURE'].values
        assocs = df['ASSOC_ID'].values
        fdrs = df['ANOVA_FEATURE_FDR'].values

        pb = Progress(len(df))

        # one shared instance updated for each association
        html = Association(self, drug='dummy', feature='dummy',
                           fdr='dummy', company=self.company)

        for i, (drug, feature, assoc, fdr) in enumerate(
                zip(drugs, features, assocs, fdrs)):
            html.drug = drug
            html.feature = feature
            if str(assoc).startswith("a"):
                html._filename = str(assoc) + '.html'
            else:
                html._filename = "a" + str(assoc) + '.html'
            html.fdr = fdr
            html.assoc_id = assoc
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i+1)
        if self.settings.animate:
            print("\n")

    def create_html_features(self):
        """Create an HTML page for each significant feature"""
        df = self.get_significant_set()
        groups = df.groupby('FEATURE')
        if self.verbose:
            print("Creating individual HTML pages for each feature")
        pb = Progress(len(groups.indices))
        for i, feature in enumerate(groups.indices):
            # get the indices and therefore subgroup
            subdf = groups.get_group(feature)
            html = HTMLOneFeature(self, self.df, subdf, feature)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i+1)
        if self.settings.animate:
            print("\n")

    def create_html_drugs(self):
        """Create an HTML page for each drug

        A page is created for all drugs found in :attr:`df`, including
        those without any significant association.
        """
        # group by drugs
        all_drugs = list(self.df['DRUG_ID'].unique())

        df = self.get_significant_set()
        groups = df.groupby('DRUG_ID')
        if self.verbose:
            print("Creating individual HTML pages for each drug")
        pb = Progress(len(all_drugs))
        significant_drugs = groups.groups
        for i, drug in enumerate(all_drugs):
            # get the indices and therefore subgroup
            if drug in significant_drugs:
                subdf = groups.get_group(drug)
            else:
                # no significant association for this drug
                subdf = {}

            html = HTMLOneDrug(self, self.df, subdf, drug)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i+1)
        if self.settings.animate:
            print("\n")

    def create_html_main(self, onweb=False):
        """Create HTML main document (summary)

        :param onweb: forwarded to the ``create_report`` method of the
            main page.

        The *savefig* setting is forced to True while the page is created
        and set back to its previous value afterwards.
        """
        self._set_sensible_df()

        if self.verbose:
            print("Creating main HTML page in directory %s" %
                  (self.settings.directory))
        ReportMain(directory=self.settings.directory, verbose=self.verbose)

        buffer_ = self.settings.savefig
        self.settings.savefig = True
        try:
            html = HTMLPageMain(self, 'index.html')
            html._init_report()  # created the directory
            html.create_report(onweb=onweb)
        finally:
            # restore the user setting even if the creation failed
            self.settings.savefig = buffer_

    def create_html_manova(self, onweb=True):
        """Create summary table with all significant hits

        :param onweb: open browser with the created HTML page.

        """
        df = self.get_significant_set()
        page = HTMLPageMANOVA(self, df, self.company)
        page.create_report(onweb)

    def create_html_pages(self, onweb=True):
        """Create all HTML pages

        :param onweb: forwarded to :meth:`create_html_main` only; the
            MANOVA page is created with ``onweb=False``.
        """
        self.create_html_main(onweb=onweb)
        self.create_html_manova(onweb=False)
        self.create_html_drugs()
        self.create_html_features()
        self.create_html_associations()

    def onweb(self):
        """Call easydev's onweb on index.html of the settings directory"""
        from easydev import onweb
        onweb(self.settings.directory + os.sep + 'index.html')