예제 #1
0
    def _create_report(self, onweb=True):
        # generated pictures and results
        df = self.run()

        # Create the table and add it
        sign = ANOVAResults(df)

        html_table = sign.get_html_table(escape=False, header=True,
                index=False)

        self.jinja['association_table'] = html_table

        # Main boxplot always included
        prefix = 'images/ODOF_all'
        tag = "{0}_{1}____{2}.png".format(prefix, self.drug, self.feature)
        section = '<img alt="association {0}" src="{0}">\n'.format(tag)

        if self.factory.settings.include_MSI_factor:
            prefix = 'ODOF_msi'
            tag = "{0}_{1}____{2}.png".format(prefix, self.drug, self.feature)
            section += '<img alt="association {0}" src="{0}">\n'.format(tag)
        if self.factory.settings.analysis_type == 'PANCAN':
            prefix = 'ODOF_tissue'
            tag = "{0}_{1}____{2}.png".format(prefix, self.drug, self.feature)
            section += '<img alt="association {0}" src="{0}">\n'.format(tag)
        self.jinja['boxplots'] = section
예제 #2
0
    def _create_report(self, onweb=True):
        # generated pictures and results
        df = self.run()

        # Create the table and add it
        sign = ANOVAResults(df)

        html_table = sign.get_html_table(escape=False,
                                         header=True,
                                         index=False)

        self.jinja['association_table'] = html_table

        # Main boxplot always included
        prefix = 'images/ODOF_all'
        tag = "{0}_{1}____{2}.png".format(prefix, self.drug, self.feature)
        section = '<img alt="association {0}" src="{0}">\n'.format(tag)

        if self.factory.settings.include_MSI_factor:
            prefix = 'ODOF_msi'
            tag = "{0}_{1}____{2}.png".format(prefix, self.drug, self.feature)
            section += '<img alt="association {0}" src="{0}">\n'.format(tag)
        if self.factory.settings.analysis_type == 'PANCAN':
            prefix = 'ODOF_tissue'
            tag = "{0}_{1}____{2}.png".format(prefix, self.drug, self.feature)
            section += '<img alt="association {0}" src="{0}">\n'.format(tag)
        self.jinja['boxplots'] = section
예제 #3
0
 def _create_report(self, onweb=True):
     self.jinja['N_hits'] = len(self.subdf)
     if len(self.subdf) > 0:
         sign = ANOVAResults(self.subdf)
         html = sign.get_html_table(escape=False, header=True, index=False)
         self.jinja['association_table'] = html
     self.jinja['volcano_jsdata'] = self.create_pictures()
예제 #4
0
    def _create_report(self, onweb=True):
        # generated pictures and results
        df = self.run()
        odof = self.factory._get_one_drug_one_feature_data(
            self.drug, self.feature)

        # Create the table and add it
        sign = ANOVAResults(df)

        html_table = sign.get_html_table(escape=False,
                                         header=True,
                                         index=False,
                                         add_href=self.add_href)

        self.jinja['association_table'] = html_table

        # Javascript version
        bx = BoxPlotsJS(odof)
        self.jinja["boxplot_all_jsdata"] = bx.get_html_association()

        section = ""
        if self.factory.settings.include_MSI_factor:
            self.jinja["boxplot_msi_jsdata"] = bx.get_html_msi()
        if self.factory.settings.include_media_factor:
            self.jinja["boxplot_media_jsdata"] = bx.get_html_media()
        if self.factory.settings.analysis_type == 'PANCAN':
            self.jinja["boxplot_tissue_jsdata"] = bx.get_html_tissue()

        self.jinja['boxplots'] = section
예제 #5
0
    def _create_report(self, onweb=True):
        # generated pictures and results
        df = self.run()
        odof = self.factory._get_one_drug_one_feature_data(self.drug, 
                self.feature)

        # Create the table and add it
        sign = ANOVAResults(df)

        html_table = sign.get_html_table(escape=False, header=True,
                index=False)

        self.jinja['association_table'] = html_table

        # Javascript version
        bx = BoxPlotsJS(odof)
        self.jinja["boxplot_all_jsdata"] = bx.get_html_association()

        section = ""
        if self.factory.settings.include_MSI_factor:
            self.jinja["boxplot_msi_jsdata"] = bx.get_html_msi()
        if self.factory.settings.include_media_factor:
            self.jinja["boxplot_media_jsdata"] = bx.get_html_media()
        if self.factory.settings.analysis_type == 'PANCAN':
            self.jinja["boxplot_tissue_jsdata"] = bx.get_html_tissue()

        self.jinja['boxplots'] = section
예제 #6
0
 def _create_report(self, onweb=True):
     self.jinja['N_hits'] = len(self.subdf)
     if len(self.subdf) > 0:
         sign = ANOVAResults(self.subdf)
         html = sign.get_html_table(escape=False, 
                 header=True, index=False)
         self.jinja['association_table'] = html
     self.jinja['volcano_jsdata'] =  self.create_pictures()
예제 #7
0
    def __init__(self,
                 gdsc,
                 results=None,
                 sep="\t",
                 drug_decode=None,
                 verbose=True):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to report
        :param results: the results returned by :meth:`ANOVA.anova_all`. If
            not provided, the ANOVA is run on the fly.

        """
        self.verbose = verbose
        self.figtools = Savefig(verbose=False)
        self.gdsc = gdsc

        if results is None:
            results = gdsc.anova_all()
        self.df = ANOVAResults(results).df  # this does a copy and sanity check

        # Make sure the DRUG are integers
        self.df.DRUG_ID = self.df.DRUG_ID.astype(int)

        self.settings = ANOVASettings()
        for k, v in gdsc.settings.items():
            self.settings[k] = v

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # maybe there was not drug_decode in the gdsc parameter,
        # so a user may have provide a file, in which case, we need
        # to update the content of the dur_decoder.
        if len(gdsc.drug_decode) == 0 and drug_decode is None:
            warnings.warn("No drug name or target will be populated."
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        elif drug_decode is not None:
            # Read a file
            self.drug_decode = DrugDecode(drug_decode)
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)
        #if sum(self.df == np.inf).sum()>0:
        #    print("WARNING: infinite values were found in your results... Set to zero")
        try:
            self.df = self.df.replace({np.inf: 0, -np.inf: 0})
        except:
            pass

        # create some data
        self._set_sensible_df()

        self.company = None
예제 #8
0
    def _create_report(self, onweb=True):
        self.create_pictures()

        self.jinja['N_hits'] = len(self.subdf)
        if len(self.subdf) > 0:
            sign = ANOVAResults(self.subdf)
            html = sign.get_html_table(escape=False, 
                    header=True, index=False)
            self.jinja['association_table'] = html

        # image section
        self.jinja['image_filename'] = "images/volcano_{0}".format(self.feature)
예제 #9
0
    def _create_report(self, onweb=True):
        self.create_pictures()

        self.jinja['N_hits'] = len(self.subdf)
        if len(self.subdf) > 0:
            sign = ANOVAResults(self.subdf)
            html = sign.get_html_table(escape=False, header=True, index=False)
            self.jinja['association_table'] = html

        # image section
        self.jinja['image_filename'] = "images/volcano_{0}".format(
            self.feature)
예제 #10
0
 def load_results(self):
     """Find the files results.csv in all TCGA directories"""
     for tcga in self.tcga:
         print(tcga)
         self.results[tcga] = ANOVAResults('ALL' + os.sep + tcga + os.sep +
                                           'OUTPUT' + os.sep +
                                           'results.csv')
예제 #11
0
    def _create_report(self, onweb=True):
        #self.create_pictures()

        # add the table
        self.jinja['synonyms'] = ''
        self.jinja['brand_name'] = ''
        self.jinja['conc_min'] = '?'
        self.jinja['conc_max'] = '?'

        # Table section
        self.jinja['N_hits'] = len(self.subdf)
        if len(self.subdf) > 0:
            sign = ANOVAResults(self.subdf)
            html = sign.get_html_table(escape=False, header=True, index=False)
            self.jinja['association_table'] = html

        # image section
        self.jinja['volcano_jsdata'] = self.create_pictures()
예제 #12
0
    def _create_report(self, onweb=True):
        #self.create_pictures()

        # add the table
        self.jinja['synonyms'] = ''
        self.jinja['brand_name'] = ''
        self.jinja['conc_min'] = '?'
        self.jinja['conc_max'] = '?'

        # Table section
        self.jinja['N_hits'] = len(self.subdf)
        if len(self.subdf) > 0:
            sign = ANOVAResults(self.subdf)
            html = sign.get_html_table(escape=False, 
                    header=True, index=False)
            self.jinja['association_table'] = html

        # image section
        self.jinja['volcano_jsdata'] =  self.create_pictures()
예제 #13
0
    def __init__(self, gdsc, results=None, sep="\t", drug_decode=None, 
            verbose=True):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to report
        :param results: the results returned by :meth:`ANOVA.anova_all`. If
            not provided, the ANOVA is run on the fly.

        """
        self.verbose = verbose
        self.figtools = Savefig(verbose=False)
        self.gdsc = gdsc

        if results is None:
            results = gdsc.anova_all()
        self.df = ANOVAResults(results).df # this does a copy and sanity check


        # Make sure the DRUG are integers
        self.df.DRUG_ID = self.df.DRUG_ID.astype(int)


        self.settings = ANOVASettings()
        for k, v in gdsc.settings.items():
            self.settings[k] = v

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # maybe there was not drug_decode in the gdsc parameter,
        # so a user may have provide a file, in which case, we need
        # to update the content of the dur_decoder.
        if len(gdsc.drug_decode) == 0 and drug_decode is None:
            warnings.warn("No drug name or target will be populated."
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        elif drug_decode is not None:
            # Read a file
            self.drug_decode = DrugDecode(drug_decode)
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)
        #if sum(self.df == np.inf).sum()>0:
        #    print("WARNING: infinite values were found in your results... Set to zero")
        try:
            self.df = self.df.replace({np.inf:0, -np.inf:0})
        except:
            pass

        # create some data
        self._set_sensible_df()

        self.company = None
예제 #14
0
    def __init__(self, gdsc, results, sep="\t", drug_decode=None):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to report
        :param results: the results returned by :meth:`ANOVA.anova_all`

        """
        self.figtools = Savefig()
        self.gdsc = gdsc
        self.df = ANOVAResults(results).df  # this does a copy and sanity check

        self.settings = ANOVASettings()
        for k, v in gdsc.settings.items():
            self.settings[k] = v

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # maybe there was not drug_decode in the gdsc parameter,
        # so a user may have provide a file, in which case, we need
        # to update the content of the dur_decoder.
        if len(gdsc.drug_decode) == 0 and drug_decode is None:
            warnings.warn("No drug name or target will be populated."
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        elif drug_decode is not None:
            # Read a file
            self.drug_decode = DrugDecode(drug_decode)
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)

        # create some data
        self._set_sensible_df()

        # just to create the directory
        ReportMAIN(directory=self.settings.directory)
예제 #15
0
    def __init__(self, gdsc, df):
        """.. rubric:: constructor

        :param : an ANOVA instance.
        :param directory: where to save the file

        The HTML filename is stored in the :attr:`filename`, which can
        be changes (default is manova.html)
        """
        super(HTMLPageMANOVA, self).__init__(filename='manova.html',
                                             directory=gdsc.settings.directory,
                                             template_filename='manova.html')

        html = ANOVAResults(df).get_html_table(collapse_table=False)

        self.jinja['manova'] = html
        self.jinja['analysis_domain'] = gdsc.settings.analysis_type
예제 #16
0
    def __init__(self, report, df, company):
        """.. rubric:: constructor

        :param : an ANOVA instance.
        :param directory: where to save the file

        The HTML filename is stored in the :attr:`filename`, which can
        be changes (default is manova.html)
        """
        super(HTMLPageMANOVA, self).__init__(
            filename='manova.html',
            directory=report.settings.directory + os.sep + "associations",
            template_filename='manova.html',
            init_report=False)

        html = ANOVAResults(df).get_html_table(collapse_table=False)

        self.jinja['manova'] = html
        self.jinja['analysis_domain'] = report.settings.analysis_type
        self.jinja['resource_path'] = ".."
        self.jinja["collaborator"] = company
예제 #17
0
    def __init__(self, gdsc, results, sep="\t", drug_decode=None):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to report
        :param results: the results returned by :meth:`ANOVA.anova_all`

        """
        self.figtools = Savefig()
        self.gdsc = gdsc
        self.df = ANOVAResults(results).df # this does a copy and sanity check

        self.settings = ANOVASettings()
        for k, v in gdsc.settings.items():
            self.settings[k] = v

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # maybe there was not drug_decode in the gdsc parameter,
        # so a user may have provide a file, in which case, we need
        # to update the content of the dur_decoder.
        if len(gdsc.drug_decode) == 0 and drug_decode is None:
            warnings.warn("No drug name or target will be populated."
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        elif drug_decode is not None:
            # Read a file
            self.drug_decode = DrugDecode(drug_decode)
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)

        # create some data
        self._set_sensible_df()

        # just to create the directory
        ReportMAIN(directory=self.settings.directory)
예제 #18
0
    def create_data_packages_for_companies(self, companies=None):
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print("\n\n========= Analysing company %s out of %s (%s)" %
                  (ii + 1, Ncomp, company))
            self.mkdir(company)
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print("---------------- for TCGA %s" % tcga)

                # Read the results previously computed
                try:
                    results_df = self.results[tcga].df.copy()
                except:
                    results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                    print("Downloading results from %s" % results_path)
                    results_df = ANOVAResults(results_path)

                results = ANOVAResults(results_df)

                # Get a DrugDecode for that company
                drug_decode_company = self.drug_decode.df.query(
                    "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)

                # filter results using the new drug decode
                drug_ids_in_results = get_drug_id(results.df.DRUG_ID)

                mask = [
                    True if x in drug_decode_company.df.index else False
                    for x in drug_ids_in_results
                ]

                results.df = results.df.ix[mask]

                # Just to create an instance with the subset of drug_decode
                # and correct settings. This is also used to store
                # the entire input data set. So, we must remove all drugs
                # not relevant for the analysis of this company
                an = ANOVA(self.ic50_filename, gf_filename,
                           drug_decode_company)

                def drug_to_keep(drug):
                    to_keep = get_drug_id(drug) in drug_decode_company.df.index
                    return to_keep

                an.ic50.df = an.ic50.df.select(drug_to_keep, axis=1)

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = company + os.sep + tcga
                an.settings.analysis_type = tcga
                self.report = ANOVAReport(an, results)
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)

                if self.debug is False:
                    self.report.create_html_features()
                    self.report.create_html_associations()

                    # For now, we just copy all DRUG images from
                    # the analysis made in ALL
                    from easydev import shellcmd, Progress
                    print("\nCopying drug files")
                    drug_ids = results.df.DRUG_ID.unique()
                    pb = Progress(len(drug_ids))
                    for i, drug_id in enumerate(drug_ids):
                        # copy the HTML
                        filename = "%s.html" % drug_id
                        source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                        dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep)
                        cmd = "cp %s%s %s" % (source, filename, dest)
                        shellcmd(cmd, verbose=False)
                        #copy the images
                        filename = "volcano_%s.*" % drug_id
                        source = "ALL%s%s%simages%s" % (os.sep, tcga, os.sep,
                                                        os.sep)
                        dest = "%s%s%s%simages%s" % (company, os.sep, tcga,
                                                     os.sep, os.sep)
                        cmd = "cp %s%s %s" % (source, filename, dest)
                        shellcmd(cmd, verbose=False)
                        pb.animate(i + 1)
예제 #19
0
    def create_summary_pages(self, main_directory='ALL'):
        # Read in ALL all directories

        # create directories and copy relevant files
        self.mkdir(main_directory + os.sep + 'images')
        self.mkdir(main_directory + os.sep + 'css')
        self.mkdir(main_directory + os.sep + 'js')
        from gdsctools import gdsctools_data
        for filename in ['gdsc.css', 'github-gist.css']:
            target = os.sep.join([main_directory, 'css', filename])
            if os.path.isfile(target) is False:
                filename = gdsctools_data(filename)
                shutil.copy(filename, target)
        for filename in ['highlight.pack.js']:
            target = os.sep.join([main_directory, 'js', filename])
            if os.path.isfile(target) is False:
                filename = gdsctools_data(filename)
                shutil.copy(filename, target)
        for filename in ['EBI_logo.png', 'sanger-logo.png']:
            target = os.sep.join([main_directory, 'images', filename])
            if os.path.isfile(target) is False:
                dire = 'data' + os.sep + 'images'
                filename = gdsctools_data("images" + os.sep + filename)
                shutil.copy(filename, target)
        directories = glob.glob('ALL' + os.sep + '*')
        directories = [x for x in directories if os.path.isdir(x)]

        summary = []
        for directory in sorted(directories):

            tcga = directory.split(os.sep)[1]
            if tcga in ['css', 'images']:
                continue

            # number of hits
            path = directory + os.sep + 'OUTPUT' + os.sep
            try:
                hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
            except:
                summary.append([tcga] + [None] * 5)
                continue
            total_hits = hits.total.sum()

            drug_involved = get_drug_id(hits['Unnamed: 0'].unique())

            results = ANOVAResults(path + 'results.csv')
            if len(results) > 0:
                drug_ids = get_drug_id(results.df.DRUG_ID.unique())
            else:
                drug_ids = []

            path = directory + os.sep + 'INPUT' + os.sep
            drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
            info = drug_decode.get_info()

            webrelease = drug_decode.df.ix[drug_involved].WEBRELEASE
            drug_inv_public = sum(webrelease == 'Y')
            drug_inv_prop = sum(webrelease != 'Y')

            summary.append([
                tcga, total_hits, drug_inv_prop, info['N_prop'],
                drug_inv_public, info['N_public']
            ])
        df = pd.DataFrame(summary)
        df.columns = [
            'Analysis name', 'Number of hits',
            'Number of involved proprietary compounds', 'out of',
            'Number of involved public', 'out of'
        ]

        # FIXME include css and images of logo
        # FIXME save in the proper directory
        output_dir = main_directory + os.sep + '..' + os.sep
        output_file = output_dir + os.sep + 'index.html'
        self.html_page = ReportMAIN(directory='ALL',
                                    filename='index.html',
                                    template_filename='datapack_summary.html')

        # Let us use our HTMLTable to add the HTML references
        from gdsctools.report import HTMLTable
        self.html_table = HTMLTable(df)
        self.html_table.add_href('Analysis name',
                                 newtab=True,
                                 url=None,
                                 suffix='/index.html')
        #html_table.add_bgcolor('Number of hits')

        self.html_page.jinja['data_table'] = self.html_table.to_html()
        self.html_page.write()

        return df
예제 #20
0
    def create_data_packages_for_companies(self, companies=None):
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print("\n\n========= Analysing company %s out of %s (%s)" %
                    (ii+1, Ncomp, company))
            self.mkdir(company)
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print("---------------- for TCGA %s" % tcga)

                # Read the results previously computed
                try:
                    results_df = self.results[tcga].df.copy()
                except:
                    results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                    print("Downloading results from %s" % results_path)
                    results_df = ANOVAResults(results_path)

                results = ANOVAResults(results_df)

                # Get a DrugDecode for that company
                drug_decode_company = self.drug_decode.df.query(
                        "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)

                # filter results using the new drug decode
                drug_ids_in_results = get_drug_id(results.df.DRUG_ID)

                mask = [True if x in drug_decode_company.df.index else False
                        for x in drug_ids_in_results]

                results.df = results.df.ix[mask]

                # Just to create an instance with the subset of drug_decode
                # and correct settings. This is also used to store
                # the entire input data set. So, we must remove all drugs
                # not relevant for the analysis of this company
                an = ANOVA(self.ic50_filename, gf_filename, drug_decode_company)

                def drug_to_keep(drug):
                    to_keep = get_drug_id(drug) in drug_decode_company.df.index
                    return to_keep
                an.ic50.df = an.ic50.df.select(drug_to_keep, axis=1) 

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = company + os.sep + tcga
                an.settings.analysis_type = tcga
                self.report = ANOVAReport(an, results)
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)

                if self.debug is False:
                    self.report.create_html_features()
                    self.report.create_html_associations()

                    # For now, we just copy all DRUG images from 
                    # the analysis made in ALL 
                    from easydev import shellcmd, Progress
                    print("\nCopying drug files")
                    drug_ids = results.df.DRUG_ID.unique()
                    pb = Progress(len(drug_ids))
                    for i, drug_id in enumerate(drug_ids):
                        # copy the HTML
                        filename = "%s.html" % drug_id
                        source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                        dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep )
                        cmd = "cp %s%s %s" % (source, filename, dest )
                        shellcmd(cmd, verbose=False)
                        #copy the images
                        filename = "volcano_%s.*" % drug_id
                        source = "ALL%s%s%simages%s" % (os.sep, tcga,
                                os.sep, os.sep)
                        dest = "%s%s%s%simages%s" % (company, os.sep,
                                tcga, os.sep , os.sep)
                        cmd = "cp %s%s %s" % (source, filename, dest )
                        shellcmd(cmd, verbose=False)
                        pb.animate(i+1)
예제 #21
0
class ANOVAReport(object):
    """Class used to interpret the results and create final HTML report

    Results is a data structure returned by :meth:`ANOVA.anova_all`.

    ::

        from gdsctools import *

        # Perform the analysis itself to get a set of results (dataframe)
        an = ANOVA(ic50_test)
        results = an.anova_all()

        # now, we can create the report.
        r = ANOVAReport(gdsc=an, results=results)

        # we can tune some settings
        r.settings.pvalue_threshold = 0.001
        r.settings.FDR_threshold = 28
        r.settings.directory = 'testing'
        r.create_html_pages()


    .. rubric:: Significant association

    An association is significant if

      - The field *ANOVA_FEATURE_FDR* must be < FDR_threshold
      - The field *ANOVA_FEATURE_pval* must be < pvalue_threshold

    It is then labelled **sensible** if *FEATURE_delta_MEAN_IC50* is 
    below 0, otherwise it is **resistant**.


    """
    def __init__(self,
                 gdsc,
                 results=None,
                 sep="\t",
                 drug_decode=None,
                 verbose=True):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to report
        :param results: the results returned by :meth:`ANOVA.anova_all`. If
            not provided, the ANOVA is run on the fly.

        """
        self.verbose = verbose
        self.figtools = Savefig(verbose=False)
        self.gdsc = gdsc

        if results is None:
            results = gdsc.anova_all()
        self.df = ANOVAResults(results).df  # this does a copy and sanity check

        # Make sure the DRUG are integers
        self.df.DRUG_ID = self.df.DRUG_ID.astype(int)

        self.settings = ANOVASettings()
        for k, v in gdsc.settings.items():
            self.settings[k] = v

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # maybe there was not drug_decode in the gdsc parameter,
        # so a user may have provide a file, in which case, we need
        # to update the content of the dur_decoder.
        if len(gdsc.drug_decode) == 0 and drug_decode is None:
            warnings.warn("No drug name or target will be populated."
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        elif drug_decode is not None:
            # Read a file
            self.drug_decode = DrugDecode(drug_decode)
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)
        #if sum(self.df == np.inf).sum()>0:
        #    print("WARNING: infinite values were found in your results... Set to zero")
        try:
            self.df = self.df.replace({np.inf: 0, -np.inf: 0})
        except:
            pass

        # create some data
        self._set_sensible_df()

        self.company = None

        # just to create the directory
        # ReportMain(directory=self.settings.directory, verbose=self.verbose)

    def _get_ndrugs(self):
        return len(self.df[self._colname_drug_id].unique())

    n_drugs = property(_get_ndrugs, doc="return number of drugs")

    def _get_ntests(self):
        return len(self.df.index)

    n_tests = property(_get_ntests)

    def _get_ncelllines(self):
        return len(self.gdsc.features.df.index)

    n_celllines = property(_get_ncelllines, doc="return number of cell lines")

    def _df_append(self, df, data):
        count = len(df)
        df.loc[count] = data
        return df

    def diagnostics(self):
        """Return summary of the analysis (dataframe)"""
        self._set_sensible_df()

        df = pd.DataFrame({'text': [], 'value': []})

        n_features = len(self.gdsc.features.df.columns)
        n_features -= self.gdsc.features.shift
        n_drugs = len(self.df[self._colname_drug_id].unique())

        N = float(n_drugs * n_features)

        if N == 0:
            ratio = 0
        else:
            ratio = float(self.n_tests) / (N) * 100

        try:
            ratio = easydev.precision(ratio, digit=2)
        except:
            # Fixme: this is a hack for the infinite values but should not
            # happen...
            ratio = 0

        msg = "Type of analysis"
        df = self._df_append(df, [msg, self.settings.analysis_type])

        msg = "Total number of possible drug/feature associations"
        df = self._df_append(df, [msg, int(N)])
        msg = "Total number of ANOVA tests performed"
        df = self._df_append(df, [msg, self.n_tests])
        msg = "Percentage of tests performed"
        df = self._df_append(df, [msg, ratio])

        # trick to have an empty line
        df = self._df_append(df, ["", ""])

        msg = "Total number of tested drugs"
        df = self._df_append(df, [msg, n_drugs])
        msg = "Total number of genomic features used"
        df = self._df_append(df, [msg, n_features])

        msg = "Total number of screened cell lines"
        df = self._df_append(df, [msg, self.n_celllines])

        msg = "MicroSatellite instability included as factor"
        msi = self.settings.include_MSI_factor
        df = self._df_append(df, [msg, msi])

        # trick to have an empty line
        df = self._df_append(df, ["", ""])
        nsens = len(self.sensible_df)
        nres = len(self.resistant_df)
        msg = "Total number of significant associations"
        df = self._df_append(df, [msg, nsens + nres])
        msg = " - sensitive"
        df = self._df_append(df, [msg, nsens])
        msg = " - resistant"
        df = self._df_append(df, [msg, nres])

        msg = "p-value significance threshold"
        df = self._df_append(df, [msg, self.settings.pvalue_threshold])

        msg = "FDR significance threshold"
        df = self._df_append(df, [msg, self.settings.FDR_threshold])

        p1, p2 = self._get_pval_range()
        msg = 'Range of significant p-values'
        value = "[{:.4}, {:.4}]".format(p1, p2)
        df = self._df_append(df, [msg, value])

        f1, f2 = self._get_fdr_range()
        msg = "Range of significant % FDRs"
        value = '[{:.4} {:.4}]'.format(f1, f2)
        df = self._df_append(df, [msg, value])
        return df

    def _get_pval_range(self):
        """Get pvalues range of the significant hits"""
        nsens = len(self.sensible_df)
        nres = len(self.resistant_df)
        N = nsens + nres
        if N == 0:
            return 0., 0.
        name = self.varname_pval
        data = self.df[name].iloc[0:N]
        m, M = data.min(), data.max()
        return m, M

    def _get_fdr_range(self):
        """Get FDR range of the significant hits"""
        name = self.varname_qval
        data = self.df[name][(self.df[name] < self.settings.FDR_threshold)]
        if len(data) == 0:
            return 0., 0.
        m, M = data.min(), data.max()
        return m, M

    def _set_sensible_df(self):
        # just an alias
        logand = np.logical_and

        # select sensible data set
        mask1 = self.df['ANOVA_FEATURE_FDR'] < self.settings.FDR_threshold
        mask2 = self.df['ANOVA_FEATURE_pval'] < self.settings.pvalue_threshold
        mask3 = self.df['FEATURE_delta_MEAN_IC50'] < 0
        self.sensible_df = self.df[logand(logand(mask1, mask2), mask3)]

        # select resistant data set
        mask3 = self.df['FEATURE_delta_MEAN_IC50'] >= 0
        self.resistant_df = self.df[logand(logand(mask1, mask2), mask3)]

    def get_significant_set(self):
        """Return significant hits (resistant and sensible)"""
        # a property that is long to compute
        # and may change if FDR changes.
        self._set_sensible_df()
        df = pd.concat([self.sensible_df, self.resistant_df])
        try:
            df.sort_values('ASSOC_ID', inplace=True)
        except:
            df.sort('ASSOC_ID', inplace=True)
        return df

    def _get_data(self, df_count_sensible, df_count_resistant):
        # we can drop all columns except one, which is renamed as count
        df1 = df_count_sensible['ASSOC_ID']
        df1.name = 'sens assoc'
        df2 = df_count_resistant['ASSOC_ID']
        df2.name = 'res assoc'

        # Now, we join the two TimeSeries (note that above, we selected only
        # one column so the dataframe was downcast to time series)
        df_count = pd.DataFrame(df1).join(pd.DataFrame(df2), how='outer')
        # and set NA to zero
        df_count.fillna(0, inplace=True)
        # let us add a new column with the total
        df_count['total'] = df_count['sens assoc'] + df_count['res assoc']

        # we want to sort by 'total' column and is equality by the name,
        # which is the index. So let us add the index temporarily as
        # a column, sort, and remove 'name' column afterwards
        df_count['name'] = df_count.index
        try:
            df_count.sort_values(by=['total', 'name'],
                                 ascending=[False, True],
                                 inplace=True)
        except:
            df_count.sort(columns=['total', 'name'],
                          ascending=[False, True],
                          inplace=True)

        df_count.drop('name', axis=1, inplace=True)
        return df_count

    def get_drug_summary_data(self):
        """Return dataframe with drug summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()

        # group by drug
        colname = self._colname_drug_id
        df_count_sensible = self.sensible_df.groupby(colname).count()
        df_count_resistant = self.resistant_df.groupby(colname).count()

        df_count = self._get_data(df_count_sensible, df_count_resistant)
        return df_count

    def drug_summary(self, top=50, fontsize=15, filename=None):
        """Return dataframe with significant drugs and plot figure

        :param fontsize:
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
        """

        df_count = self.get_drug_summary_data()

        if len(df_count):
            self._plot(df_count, 'drug', top)
            fig = pylab.gcf()
            self.figtools.directory = self.settings.directory
            self.figtools.savefig(filename,
                                  size_inches=(12, 14),
                                  bbox_inches='tight')
        return df_count

    def get_feature_summary_data(self):
        """Return dataframe with feature summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()
        df_count_sensible = self.sensible_df.groupby('FEATURE').count()
        df_count_resistant = self.resistant_df.groupby('FEATURE').count()
        df_count = self._get_data(df_count_sensible, df_count_resistant)
        return df_count

    def feature_summary(self, filename=None, top=50, fontsize=15):
        """Return dataframe with significant features and plot figure

        :param fontsize:
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
        """
        df_count = self.get_feature_summary_data()

        if len(df_count) > 0:
            self._plot(df_count, 'feature', top)
            #fig = pylab.gcf()
            self.figtools.directory = self.settings.directory
            self.figtools.savefig(filename,
                                  set_inches=(12, 14),
                                  bbox_inches='tight')
        return df_count

    def _plot(self, df_count, title_tag, top):
        """Used by drug_summary and feature_summary to plot the
        bar plot"""
        if top > len(df_count):
            top = len(df_count)

        df = df_count.iloc[0:top][[u'sens assoc', u'res assoc']]
        labels = list(df.index)
        # add drug name
        if len(self.drug_decode) > 0:
            for i, label in enumerate(labels):
                if title_tag == 'drug':
                    name = self.drug_decode.get_name(label)
                    if name is not None:
                        labels[i] = "{}-{}".format(labels[i], name)
                else:
                    pass

        labels = [str(x).replace('_', ' ') for x in labels]
        # restrict size to first 30 characters
        labels = [x[0:30] for x in labels]
        ind = range(0, len(labels))
        # reverse does not exist with python3
        try:
            ind.reverse()
        except:
            ind = list(ind)
            ind.reverse()
        data1 = df['sens assoc'].values
        data2 = df['res assoc'].values
        pylab.figure(1)
        pylab.clf()
        p1 = pylab.barh(ind,
                        data1,
                        height=0.8,
                        color='purple',
                        label='sensitivity')
        p2 = pylab.barh(ind,
                        data2,
                        height=0.8,
                        color='orange',
                        left=data1,
                        label='resistance')
        ax = pylab.gca()
        self.labels = labels
        ax.set_yticks([x + 0.5 for x in ind])
        ax.set_yticklabels(labels, fontsize=12)

        xticks = ax.get_xticks()
        ax.set_xticklabels(
            [int(x) if divmod(x, 1)[1] == 0 else "" for x in xticks])

        pylab.grid()
        pylab.title(r"Top %s %s(s) most frequently " % (top, title_tag) + \
                    "\nassociated with drug  response",
                    fontsize=self.settings.fontsize/1.2)
        pylab.xlabel(r'Number of significant associations (FDR %s %s %s) ' %
                     ("$>$", self.settings.FDR_threshold, "$\%$"),
                     fontsize=16)

        M = max(data1 + data2)
        #ax.set_xticks()
        #ax.set_xticklabels(labels, fontsize=fontsize)
        ax.set_xlim([0, M + 1])
        pylab.legend(loc='lower right')
        try:
            pylab.tight_layout()
        except:
            pass

    def get_significant_hits(self, show=True):
        """Return a summary of significant hits

        :param show: show a plot with the distribution of significant hits

        .. todo:: to finalise
        """
        fdrs = range(5, 50 + 1, 5)

        significants = []
        significant_meaningful = []
        strong_hits = []
        full_strong_hits = []

        MC1 = 1
        MC2 = 2
        mask2 = self.df['FEATURE_pos_logIC50_MEAN'] < MC1
        mask3 = self.df['FEATURE_pos_logIC50_MEAN'] < MC2
        mask4 = self.df['FEATURE_neg_logIC50_MEAN'] < MC1
        mask5 = self.df['FEATURE_neg_logIC50_MEAN'] < MC2
        maskMC = mask2 + mask3 + mask4 + mask5

        for fdr in fdrs:
            # significant hits
            res = self.df['ANOVA_FEATURE_FDR'] < fdr
            significants.append(res.sum())

            # meaningful hits
            indices = np.logical_and(self.df['ANOVA_FEATURE_FDR'] < fdr,
                                     maskMC)
            significant_meaningful.append(indices.sum())

            # meaningful strong hits
            mask1 = self.df.loc[indices]['FEATURE_pos_Glass_delta'] >= 1
            mask2 = self.df.loc[indices]['FEATURE_neg_Glass_delta'] >= 1
            strong_hits.append(np.logical_or(mask1, mask2).sum())

            # meaningful full strong hits
            mask1 = self.df.loc[indices]['FEATURE_pos_Glass_delta'] >= 1
            mask2 = self.df.loc[indices]['FEATURE_neg_Glass_delta'] >= 1
            full_strong_hits.append(np.logical_and(mask1, mask2).sum())

        data = {
            'significants': significants,
            'full_strong_hits': full_strong_hits,
            'strong_hits': strong_hits,
            'significant_meaningful': significant_meaningful
        }

        df = pd.DataFrame(data,
                          columns=[
                              'significants', 'significant_meaningful',
                              'strong_hits', 'full_strong_hits'
                          ],
                          index=fdrs)
        df.columns = [
            '1) significant', '2) 1 + meaningful', '3) 2 + strong',
            '4) 2+ very strong'
        ]

        if show is True:
            pylab.clf()
            ax = pylab.gca()
            df.plot(kind='bar',
                    width=.8,
                    color=['r', 'gray', 'orange', 'black'],
                    rot=0,
                    ax=ax)
            pylab.grid()
        # original is 'aquamarine4','cyan2','cornflowerblue    ','aquamarine'),
        return df

    def __str__(self):
        self.df.info()
        return ""

    def create_html_associations(self):
        """Create an HTML page for each significant association

        The name of the output HTML file is **<association id>.html**
        where association id is stored in :attr:`df`.

        """
        if self.verbose:
            print(
                "Creating individual HTML pages for each significant association"
            )
        df = self.get_significant_set()

        drugs = df['DRUG_ID'].values
        features = df['FEATURE'].values
        assocs = df['ASSOC_ID'].values
        fdrs = df['ANOVA_FEATURE_FDR'].values

        N = len(df)
        pb = Progress(N)

        html = Association(self,
                           drug='dummy',
                           feature='dummy',
                           fdr='dummy',
                           company=self.company)

        for i in range(N):
            html.drug = drugs[i]
            html.feature = features[i]
            if str(assocs[i]).startswith("a"):
                html._filename = str(assocs[i]) + '.html'
            else:
                html._filename = "a" + str(assocs[i]) + '.html'
            html.fdr = fdrs[i]
            html.assoc_id = assocs[i]
            #html._init_report() # since we have one shared instance
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i + 1)
        if self.settings.animate: print("\n")

    def create_html_features(self):
        """Create an HTML page for each significant feature"""
        df = self.get_significant_set()
        groups = df.groupby('FEATURE')
        if self.verbose:
            print("Creating individual HTML pages for each feature")
        N = len(groups.indices.keys())
        pb = Progress(N)
        for i, feature in enumerate(groups.indices.keys()):
            # get the indices and therefore subgroup
            subdf = groups.get_group(feature)
            html = HTMLOneFeature(self, self.df, subdf, feature)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i + 1)
        if self.settings.animate: print("\n")

    def create_html_drugs(self):
        """Create an HTML page for each drug"""
        # group by drugs
        all_drugs = list(self.df['DRUG_ID'].unique())

        df = self.get_significant_set()
        groups = df.groupby('DRUG_ID')
        if self.verbose:
            print("Creating individual HTML pages for each drug")
        N = len(groups.indices.keys())
        N = len(all_drugs)
        pb = Progress(N)
        for i, drug in enumerate(all_drugs):
            # enumerate(groups.indices.keys()):
            # get the indices and therefore subgroup
            if drug in groups.groups.keys():
                subdf = groups.get_group(drug)
            else:
                subdf = {}

            html = HTMLOneDrug(self, self.df, subdf, drug)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i + 1)
        if self.settings.animate: print("\n")

    def create_html_main(self, onweb=False):
        """Create HTML main document (summary)"""
        self._set_sensible_df()

        if self.verbose:
            print("Creating main HTML page in directory %s" %
                  (self.settings.directory))
        ReportMain(directory=self.settings.directory, verbose=self.verbose)

        buffer_ = self.settings.savefig
        self.settings.savefig = True
        html = HTMLPageMain(self, 'index.html')
        html._init_report()  # created the directory
        html.create_report(onweb=onweb)
        self.settings.savefig = buffer_

    def create_html_manova(self, onweb=True):
        """Create summary table with all significant hits

        :param onweb: open browser with the created HTML page.

        """
        df = self.get_significant_set()
        page = HTMLPageMANOVA(self, df, self.company)
        page.create_report(onweb)

    def create_html_pages(self, onweb=True):
        """Create all HTML pages"""
        self.create_html_main(onweb=onweb)
        self.create_html_manova(onweb=False)
        self.create_html_drugs()
        self.create_html_features()
        self.create_html_associations()

    def onweb(self):
        from easydev import onweb
        onweb(self.settings.directory + os.sep + 'index.html')
예제 #22
0
    def add_settings(self):
        # -------------------------------------- settings and INPUT files
        input_dir = self.directory + os.sep + 'INPUT'
        filename = 'ANOVA_input.csv'
        filename = os.sep.join([input_dir, filename])
        self.report.gdsc.ic50.to_csv(filename)
        filename = os.sep.join(['INPUT', 'ANOVA_input.csv'])
        self.jinja['ic50_file'] = filename

        # the genomic features, which may be the default version
        # one provided by the user. It may have been changed
        gf_filename = os.sep.join([input_dir, 'genomic_features.csv'])
        self.report.gdsc.features.to_csv(gf_filename)
        html = """Saved <a href="INPUT/genomic_features.csv">Genomic
                  Features</a> file<br/> (possibly the default
                  version)."""
        self.jinja['gf_file'] = html

        # Always save DRUG_DECODE file even if empty
        # It may be be interpreted in other pipeline or for reproducibility
        output_filename = input_dir + os.sep + 'DRUG_DECODE.csv'
        self.report.drug_decode.to_csv(output_filename)
        html = 'Get <a href="INPUT/DRUG_DECODE.csv">Drug DECODE file</a>'
        if len(self.report.drug_decode) == 0:
            html += 'Note that DRUG_DECODE file was not provided (empty?).'
        self.jinja['drug_decode'] = html

        # Save settings as json file
        filename = os.sep.join([input_dir, 'settings.json'])
        self.settings.to_json(filename)
        filename = os.path.basename(filename)
        self.jinja['settings'] = \
                """Get the settings as a <a href="INPUT/%s">
                json file</a>.""" % filename

        # Save all Results dataframe
        filename = os.sep.join(
            [self.settings.directory, 'OUTPUT', 'results.csv'])
        ANOVAResults(self.report.df).to_csv(filename)

        code = """from gdsctools import *
import os

def getfile(filename, where='../INPUT'):
    return os.sep.join([where, filename])

# reback the IC50 and genomic features matrices
gdsc = ANOVA(getfile('%(ic50)s'), getfile('%(gf_filename)s'),
        getfile('DRUG_DECODE.csv'))
gdsc.settings.from_json(getfile('settings.json'))
gdsc.init()

# Analyse the data
results = gdsc.anova_all()

# Create the HTML report
r = ANOVAReport(gdsc, results)
r.create_html_pages(onweb=False)"""
        code = code % {
            'ic50': 'ANOVA_input.csv',
            'gf_filename': 'genomic_features.csv'
        }

        filename = os.sep.join([self.settings.directory, 'code', 'rerun.py'])
        fh = open(filename, 'w')
        fh.write(code)
        fh.close()
예제 #23
0
    def create_data_packages_for_companies(self, companies=None):
        """Creates a data package for each company found in the DrugDecode file
        """
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################

        # companies must be just one name (one string) or a list of strings
        # By default, takes all companies found in DrugDecode
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        if len(companies) == 0:
            raise ValueError("Could not find any companies in the DrugDecode file")

        # The main directory
        self.mkdir(self.company_directory)

        # Loop over all companies, retrieving information built
        # in analyse() method, selecting for each TCGA all information
        # for that company only (and public drugs)
        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print(purple("\n=========== Analysing company %s out of %s (%s)" %
                    (ii+1, Ncomp, company)))
            self.mkdir(self.company_directory + os.sep + company)

            # Handle each TCGA case separately
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print(brown("  ------- building TCGA %s sub directory" % tcga))

                # Read the results previously computed either
                try:
                    results_df = self.results[tcga].df.copy()
                except:
                    results_path = "%s/%s/OUTPUT/results.csv" % (self.main_directory, tcga)
                    results_df = ANOVAResults(results_path)


                # MAke sure the results are formatted correctly
                results = ANOVAResults(results_df)

                # Get the DrugDecode information for that company only
                drug_decode_company = self.drug_decode.df.query(
                        "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)

                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)

                # Filter the results to keep only public drugs and that
                # company. Make sure this is integers
                results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)

                mask = [True if x in drug_decode_company.df.index else False
                        for x in results.df.DRUG_ID]

                results.df = results.df.ix[mask]

                # We read the IC50 again
                try:
                    self.ic50 = IC50(self.ic50_filename)
                except:
                    self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

                # And create an ANOVA instance. This is not to do the analyse
                # again but to hold various information
                an = ANOVA(self.ic50, gf_filename, drug_decode_company,
                    verbose=False)

                def drug_to_keep(drug):
                    to_keep = drug in drug_decode_company.df.index
                    return to_keep
                an.ic50.df = an.ic50.df.select(drug_to_keep, axis=1)

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = self.company_directory + os.sep + company + os.sep + tcga
                an.settings.analysis_type = tcga

                # Now we create the report
                self.report = ANOVAReport(an, results,
                        drug_decode=drug_decode_company,
                        verbose=self.verbose)
                self.report.company = company
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)
                self.report.create_html_features()
                self.report.create_html_drugs()
                self.report.create_html_associations()
예제 #24
0
파일: anova.py 프로젝트: Donnyvdm/gdsctools
    def anova_one_drug(self, drug_id, animate=True, output='object'):
        """Computes ANOVA for a given drug across all features

        :param str drug_id: a valid drug identifier.
        :param animate: shows the progress bar
        :return: a dataframe

        Calls :meth:`anova_one_drug_one_feature` for each feature.
        """
        # drop first and second columns that are made of strings
        # works under python2 but not python 3. Assume that the 2 first
        #columns are the sample name and tissue feature
        # Then, we keep only cases with at least 3 features.
        # MSI could be used but is not like in original R code.
        features = self.features.df.copy()
        # need to skip the FACTOR to keep only features
        shift = self.features.shift

        features = features[features.columns[shift:]]
        # FIXME what about features with less than 3 zeros ?
        mask = features.sum(axis=0) >= 3

        # TODO: MSI, tissues, name must always be kept
        #
        selected_features = features[features.columns[mask]]

        # scan all features for a given drug
        assert drug_id in self.ic50.df.columns
        N = len(selected_features.columns)
        pb = Progress(N, 10)
        res = {}
        #
        for i, feature in enumerate(selected_features.columns):
            # production True, means we do not want to create a DataFrame
            # for each call to the anova_one_drug_one_feature function
            # Instead, we require dictionaries
            this = self.anova_one_drug_one_feature(drug_id,
                                                   feature,
                                                   production=True)
            if this['ANOVA_FEATURE_pval'] is not None:
                res[feature] = this
            if animate is True:
                pb.animate(i + 1)

        # if production is False:
        # df = pid.concat(res, ignore_index=True)
        df = pd.DataFrame.from_records(res)
        df = df.T

        df = ANOVAResults().astype(df)
        if len(df) == 0:
            return df

        # append DRUG_NAME/DRUG_TARGET columns
        df = self.drug_decode.drug_annotations(df)

        # TODO: drop rows where ANOVA_FEATURE_PVAL is None
        if output != 'object':
            df = self.add_pvalues_correction(df)
            return df
        else:
            df = self.add_pvalues_correction(df)
            res = ANOVAResults(df, self.settings)
            res.settings = ANOVASettings(**self.settings)
            return res
예제 #25
0
    def _create_summary_pages(self,
                              main_directory,
                              verbose=True,
                              company=None):
        # Read all directories in tissue_packages

        directories = glob.glob(main_directory + os.sep + '*')

        summary = []
        for directory in sorted(directories):
            tcga = directory.split(os.sep)[-1]
            if tcga not in self.tcga:
                continue
            if verbose:
                print(directory, tcga)
            # number of hits
            path = directory + os.sep + 'OUTPUT' + os.sep
            try:
                hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
            except:
                summary.append([tcga] + [None] * 5)
                continue
            total_hits = hits.total.sum()

            drug_involved = hits['Unnamed: 0'].unique()

            results = ANOVAResults(path + 'results.csv')
            if len(results) > 0:
                drug_ids = results.df.DRUG_ID.unique()
            else:
                drug_ids = []

            # where to find the DRUG DECODE file. Should
            # have been copied
            path = directory + os.sep + 'INPUT' + os.sep
            drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
            info = drug_decode.get_info()

            webrelease = drug_decode.df.ix[drug_involved].WEBRELEASE
            drug_inv_public = sum(webrelease == 'Y')
            drug_inv_prop = sum(webrelease != 'Y')

            summary.append([
                tcga, total_hits, drug_inv_prop, info['N_prop'],
                drug_inv_public, info['N_public']
            ])
        df = pd.DataFrame(summary)

        df.columns = [
            'Analysis name', 'Number of hits',
            'Number of involved proprietary compounds', 'out of',
            'Number of involved public', 'out of'
        ]

        try:
            df.sort_values(by="Number of hits", ascending=False, inplace=True)
        except:
            df.sort("Number of hits", ascending=False, inplace=True)

        output_dir = main_directory + os.sep + '..' + os.sep
        output_file = output_dir + os.sep + 'index.html'
        self.html_page = ReportMain(directory=main_directory,
                                    filename='index.html',
                                    template_filename='datapack_summary.html',
                                    mode="summary")

        # Let us use our HTMLTable to add the HTML references
        self.html_table = HTMLTable(df)
        self.html_table.add_href('Analysis name',
                                 newtab=True,
                                 url=None,
                                 suffix='/index.html')
        self.html_table.add_bgcolor('Number of hits')

        self.html_page.jinja['data_table'] = self.html_table.to_html(
            collapse_table=False)
        if company:
            self.html_page.jinja["collaborator"] = company

        self.html_page.write()

        return df
예제 #26
0
    def create_data_packages_for_companies(self, companies=None):
        """Creates a data package for each company found in the DrugDecode file
        """
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################

        # companies must be just one name (one string) or a list of strings
        # By default, takes all companies found in DrugDecode
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        if len(companies) == 0:
            raise ValueError(
                "Could not find any companies in the DrugDecode file")

        # The main directory
        self.mkdir(self.company_directory)

        # Loop over all companies, retrieving information built
        # in analyse() method, selecting for each TCGA all information
        # for that company only (and public drugs)
        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print(
                purple("\n=========== Analysing company %s out of %s (%s)" %
                       (ii + 1, Ncomp, company)))
            self.mkdir(self.company_directory + os.sep + company)

            # Handle each TCGA case separately
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print(brown("  ------- building TCGA %s sub directory" % tcga))

                # Read the results previously computed either
                try:
                    results_df = self.results[tcga].df.copy()
                except:
                    results_path = "%s/%s/OUTPUT/results.csv" % (
                        self.main_directory, tcga)
                    results_df = ANOVAResults(results_path)

                # MAke sure the results are formatted correctly
                results = ANOVAResults(results_df)

                # Get the DrugDecode information for that company only
                drug_decode_company = self.drug_decode.df.query(
                    "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)

                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)

                # Filter the results to keep only public drugs and that
                # company. Make sure this is integers
                results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)

                mask = [
                    True if x in drug_decode_company.df.index else False
                    for x in results.df.DRUG_ID
                ]

                results.df = results.df.ix[mask]

                # We read the IC50 again
                try:
                    self.ic50 = IC50(self.ic50_filename)
                except:
                    self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

                # And create an ANOVA instance. This is not to do the analyse
                # again but to hold various information
                an = ANOVA(self.ic50,
                           gf_filename,
                           drug_decode_company,
                           verbose=False)

                def drug_to_keep(drug):
                    to_keep = drug in drug_decode_company.df.index
                    return to_keep

                an.ic50.df = an.ic50.df.select(drug_to_keep, axis=1)

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = self.company_directory + os.sep + company + os.sep + tcga
                an.settings.analysis_type = tcga

                # Now we create the report
                self.report = ANOVAReport(an,
                                          results,
                                          drug_decode=drug_decode_company,
                                          verbose=self.verbose)
                self.report.company = company
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)
                self.report.create_html_features()
                self.report.create_html_drugs()
                self.report.create_html_associations()
예제 #27
0
    def anova_all(self, animate=True, drugs=None, multicore=None):
        """Run all ANOVA tests for all drugs and all features.

        :param drugs: you may select a subset of drugs
        :param animate: shows the progress bar
        :return: an :class:`~gdsctools.anova_results.ANOVAResults`
            instance with the dataframe
            stored in an attribute called **df**

        Calls :meth:`anova_one_drug` for each drug and concatenate all
        results together. Note that once all data are gathered,
        :meth:`add_pvalues_correction` is called to fill a new column
        with FDR corrections.

        An extra column  named "ASSOC_ID" is also added with
        a unique identifer sorted by ascending FDR.

        .. note:: A thorough comparison with version v17 gives the same FDR
            results (difference ~1e-6); Note however that the qvalue results
            differ by about 0.3% due to different smoothing in R and Python.
        """
        if self.verbose and len(self.individual_anova):
            print("Reusing some results from the buffer. " "To reset the buffer, call reset_buffer() method")
        # drop DRUG where number of IC50 (non-null) is below 5
        # axis=0 is default but we emphasize that sum is over
        # column (i.e. drug
        vv = (self.ic50.df.isnull() == False).sum(axis=0)
        # FIXME: should be in one_drug_one_feature ??
        drug_names = vv.index[vv >= self.settings.minimum_nonna_ic50]

        # if user provided a list of drugs, use them:
        if drugs is not None:
            # todo: check valifity of the drug names
            drug_names = drugs[:]

        pb = Progress(len(drug_names), 1)
        drug_names = list(drug_names)
        # pylab.shuffle(drug_names) # ? why

        if animate is True:
            pb.animate(0)

        if multicore:
            # Note that here, we do not use the buffer
            multicore_analysis(self, drug_names, multicore)
        else:

            for i, drug_name in enumerate(drug_names):
                if drug_name in self.individual_anova.keys():
                    pass
                else:
                    res = self.anova_one_drug(drug_name, animate=False, output="dataframe")
                    self.individual_anova[drug_name] = res
                if animate is True:
                    pb.animate(i + 1)
        print("\n")
        if len(self.individual_anova) == 0:
            return ANOVAResults()

        df = pd.concat(self.individual_anova, ignore_index=True)

        if len(df) == 0:
            return df
        # sort all data by ANOVA p-values
        try:
            df.sort_values("ANOVA_FEATURE_pval", inplace=True)
        except:
            df.sort("ANOVA_FEATURE_pval", inplace=True)

        # all ANOVA have been computed individually for each drug and each
        # feature. Now, we need to compute the multiple testing corrections
        if self.settings.pvalue_correction_level == "global":
            df = self.add_pvalues_correction(df)
        else:
            pass

        # insert a unique identifier as first column
        df.insert(0, "ASSOC_ID", range(1, len(df) + 1))

        self.df = df
        # order the column names as defined in the __init__ method
        df = df[self.column_names]
        df.reset_index(inplace=True, drop=True)

        results = ANOVAResults()
        results.df = df
        results.settings = ANOVASettings(**self.settings)
        return results
예제 #28
0
    def anova_one_drug(self, drug_id, animate=True, output="object"):
        """Computes ANOVA for a given drug across all features

        :param str drug_id: a valid drug identifier.
        :param animate: shows the progress bar
        :return: a dataframe

        Calls :meth:`anova_one_drug_one_feature` for each feature.
        """
        # drop first and second columns that are made of strings
        # works under python2 but not python 3. Assume that the 2 first
        # columns are the sample name and tissue feature
        # Then, we keep only cases with at least 3 features.
        # MSI could be used but is not like in original R code.
        features = self.features.df.copy()
        # need to skip the FACTOR to keep only features
        shift = self.features.shift

        features = features[features.columns[shift:]]
        # FIXME what about features with less than 3 zeros ?
        mask = features.sum(axis=0) >= 3

        # TODO: MSI, tissues, name must always be kept
        #
        selected_features = features[features.columns[mask]]

        # scan all features for a given drug
        assert drug_id in self.ic50.df.columns
        N = len(selected_features.columns)
        pb = Progress(N, 10)
        res = {}
        #
        for i, feature in enumerate(selected_features.columns):
            # production True, means we do not want to create a DataFrame
            # for each call to the anova_one_drug_one_feature function
            # Instead, we require dictionaries
            this = self.anova_one_drug_one_feature(drug_id, feature, production=True)
            if this["ANOVA_FEATURE_pval"] is not None:
                res[feature] = this
            if animate is True:
                pb.animate(i + 1)

        # if production is False:
        # df = pid.concat(res, ignore_index=True)
        df = pd.DataFrame.from_records(res)
        df = df.T

        df = ANOVAResults().astype(df)
        if len(df) == 0:
            return df

        # append DRUG_NAME/DRUG_TARGET columns
        df = self.drug_decode.drug_annotations(df)

        # TODO: drop rows where ANOVA_FEATURE_PVAL is None
        if output != "object":
            df = self.add_pvalues_correction(df)
            return df
        else:
            df = self.add_pvalues_correction(df)
            res = ANOVAResults(df)
            res.settings = ANOVASettings(**self.settings)
            return res
예제 #29
0
    def _create_report(self, onweb=True):
        # A summary table
        diag = self.report.diagnostics()
        table = HTMLTable(diag, 'summary')
        txt = ''
        for index, row in diag.iterrows():
            if len(row.text) == 0 and len(row.value) == 0:
                txt += '----<br/>'
            else:
                txt += row.text + ": " + str(row.value) + "<br/>"
        self.jinja['summary'] = txt

        print('Creating volcano plots')
        # this can be pretty slow. so keep only 1000 most relevant
        # values and 1000 random ones to get an idea of the distribution
        v = VolcanoANOVA(self.report.df, settings=self.settings)
        v.selector(v.df, 1500, 1500, inplace=True)
        v.volcano_plot_all()
        v.savefig_and_js("volcano_all_js")

        self.jinja['volcano'] = """
            <h3></h3>
            <a href="volcano_all_js.html">
                <img alt="volcano plot for all associations"
                    src="volcano_all_js.png">
            </a>
            <br/>
            <p> A javascript version is available
                <a href="volcano_all_js.html">here</a> (
                or click on the image).</p>
        """

        # MANOVA link
        N = len(self.report.get_significant_set())
        self.jinja['manova'] = """
        There were %(N)s significant associations found.
        All significant associations have been gatherered
        in the following link: <br/><a href="manova.html">manova results</a>.
        """ % {
            'N': N
        }

        # feature summary
        df_features = self.report.feature_summary("feature_summary.png")
        filename = 'OUTPUT' + os.sep + 'features_summary.csv'
        df_features.to_csv(self.directory + os.sep + filename, sep=',')

        # drug summary
        #not_tested = [x for x in self.report.gdsc.drugIds if x not in
        #        self.report.df.DRUG_ID.unique()]
        #if len(not_tested) > 0:
        #    not_tested = """%s drugs were not analysed due to
        #    lack of valid data points: """ % len(not_tested) + \
        #            ", ".join(not_tested)
        #else:
        #    not_tested = ""
        not_tested = ""
        self.jinja['drug_not_tested'] = not_tested

        df_drugs = self.report.drug_summary(filename="drug_summary.png")
        get_name = self.report.drug_decode.get_name
        if len(self.report.drug_decode.df) > 0:
            df_drugs.index = [x + "-" + get_name(x) for x in df_drugs.index]
        filename = 'OUTPUT' + os.sep + 'drugs_summary.csv'
        df_drugs.to_csv(self.directory + os.sep + filename, sep=',')

        # --------------------------- Create table with links to all drugs
        groups = self.report.df.groupby('DRUG_ID')
        try:
            df = groups.mean()['ANOVA_FEATURE_FDR'].sort_values()
        except:
            # note double brackets for pythonn3.3
            df = groups.mean()[['ANOVA_FEATURE_FDR']].sort()
        df = df.reset_index()  # get back the Drug id in the dframe columns

        # let us add also the drug name
        df = self.report.drug_decode.drug_annotations(df)

        # let us also add number of associations computed
        counts = [len(groups.groups[k]) for k in df.DRUG_ID]
        df['Number of associations computed'] = counts
        groups = self.report.get_significant_set().groupby('DRUG_ID').groups
        count = []
        for drug in df['DRUG_ID'].values:
            if drug in groups.keys():
                count.append(len(groups[drug]))
            else:
                count.append(0)
        df['hits'] = count

        # add another set of drug_id but sorted in alpha numerical order
        table = HTMLTable(df, 'drugs')
        table.add_href('DRUG_ID')
        table.df.columns = [
            x.replace('ANOVA_FEATURE_FDR', 'mean FEATURE ANOVA FDR')
            for x in table.df.columns
        ]
        table.add_bgcolor('hits',
                          mode='max',
                          cmap=cmap_builder('white', 'orange', 'red'))

        self.jinja['drug_table'] = table.to_html(escape=False,
                                                 header=True,
                                                 index=False)

        # ---------------------- Create full table with links to all features
        df = pd.DataFrame({'FEATURE': self.report.df['FEATURE'].unique()})
        try:
            df.sort_values(by='FEATURE', inplace=True)
        except:
            df.sort('FEATURE', inplace=True)

        groups = self.report.get_significant_set().groupby('FEATURE').groups

        count = []
        for feature in df['FEATURE'].values:
            if feature in groups.keys():
                count.append(len(groups[feature]))
            else:
                count.append(0)
        df['hits'] = count

        table = HTMLTable(df, 'features')
        table.sort('hits', ascending=False)
        table.add_href('FEATURE')
        table.add_bgcolor('hits',
                          mode='max',
                          cmap=cmap_builder('white', 'orange', 'red'))
        self.jinja['feature_table'] = table.to_html(escape=False,
                                                    header=True,
                                                    index=False)

        # -------------------------------------- COSMIC table for completeness
        colnames = self.report.gdsc.features._special_names
        df = self.report.gdsc.features.df[colnames]

        # TODO
        # add other columns if possible e.g., GDSC1, GDSC2, TCGA

        df = df.reset_index()
        table = HTMLTable(df)
        url = "http://cancer.sanger.ac.uk/cell_lines/sample/overview?id="
        table.add_href('COSMIC_ID', url=url, newtab=True)
        self.jinja['cosmic_table'] = table.to_html()

        # -------------------------------------- settings and INPUT files
        input_dir = self.directory + os.sep + 'INPUT'
        filename = 'ANOVA_input.csv'
        filename = os.sep.join([input_dir, filename])
        self.report.gdsc.ic50.to_csv(filename)
        filename = os.sep.join(['INPUT', 'ANOVA_input.csv'])
        self.jinja['ic50_file'] = filename

        # the genomic features, which may be the default version
        # one provided by the user. It may have been changed
        gf_filename = os.sep.join([input_dir, 'genomic_features.csv'])
        self.report.gdsc.features.to_csv(gf_filename)
        html = """Saved <a href="INPUT/genomic_features.csv">Genomic
                  Features</a> file<br/> (possibly the default
                  version)."""
        self.jinja['gf_file'] = html

        # Always save DRUG_DECODE file even if empty
        # It may be be interpreted in other pipeline or for reproducibility
        output_filename = input_dir + os.sep + 'DRUG_DECODE.csv'
        self.report.drug_decode.to_csv(output_filename)
        html = 'Get <a href="INPUT/DRUG_DECODE.csv">Drug DECODE file</a>'
        if len(self.report.drug_decode) == 0:
            html += 'Note that DRUG_DECODE file was not provided (empty?).'
        self.jinja['drug_decode'] = html

        # Save settings as json file
        filename = os.sep.join([input_dir, 'settings.json'])
        self.settings.to_json(filename)
        filename = os.path.basename(filename)
        self.jinja['settings'] = \
                """Get the settings as a <a href="INPUT/%s">
                json file</a>.""" % filename

        # Save all Results dataframe
        filename = os.sep.join(
            [self.settings.directory, 'OUTPUT', 'results.csv'])
        ANOVAResults(self.report.df).to_csv(filename)

        code = """from gdsctools import *
import os

def getfile(filename, where='../INPUT'):
    return os.sep.join([where, filename])

# reback the IC50 and genomic features matrices
gdsc = ANOVA(getfile('%(ic50)s'), getfile('%(gf_filename)s'),
        getfile('DRUG_DECODE.csv'))
gdsc.settings.from_json(getfile('settings.json'))
gdsc.init()

# Analyse the data
results = gdsc.anova_all()

# Create the HTML report
r = ANOVAReport(gdsc, results)
r.create_html_pages(onweb=False)"""
        code = code % {
            'ic50': 'ANOVA_input.csv',
            'gf_filename': 'genomic_features.csv'
        }

        filename = os.sep.join([self.settings.directory, 'code', 'rerun.py'])
        fh = open(filename, 'w')
        fh.write(code)
        fh.close()
예제 #30
0
    def anova_all(self, animate=True, drugs=None):
        """Run all ANOVA tests for all drugs and all features.

        :param drugs: you may select a subset of drugs
        :param animate: shows the progress bar
        :return: an :class:`~gdsctools.anova_results.ANOVAResults`
            instance with the dataframe
            stored in an attribute called **df**

        Loops over all drugs calling :meth:`anova_one_drug` for each
        drug and concatenating all results together. Note that once all
        data are gathered, an extra column containing the FDR corrections
        is added to the dataframe using :meth:`add_pvalues_correction`
        method. An extra column  named "ASSOC_ID" is also added with
        a unique identifer sorted by ascending FDR.

        .. note:: A thorough comparison with version v17 give the same FDR
            results (difference ~1e-6); Note however that the qvalue results
            differ by about 0.3% due to different smoothing in R and Python.
        """
        # drop DRUG where number of IC50 (non-null) is below 5
        # axis=0 is default but we emphasize that sum is over
        # column (i.e. drug
        vv = (self.ic50.df.isnull() == False).sum(axis=0)
        # FIXME: should be in one_drug_one_feature ??
        drug_names = vv.index[vv >= self.settings.minimum_nonna_ic50]

        # if user provided a list of drugs, use them:
        if drugs is not None:
            # todo: check valifity of the drug names
            drug_names = drugs[:]

        pb = Progress(len(drug_names), 1)
        drug_names = list(drug_names)
        pylab.shuffle(drug_names)
        if animate is True:
            pb.animate(0)

        for i, drug_name in enumerate(drug_names):
            if drug_name in self.individual_anova.keys():
                pass
            else:
                res = self.anova_one_drug(drug_name,
                                          animate=False,
                                          output='dataframe')
                self.individual_anova[drug_name] = res
            if animate is True:
                pb.animate(i + 1)
        print("\n")
        if len(self.individual_anova) == 0:
            return ANOVAResults()

        df = pd.concat(self.individual_anova, ignore_index=True)

        if len(df) == 0:
            return df
        # sort all data by ANOVA p-values
        try:
            df.sort_values('ANOVA_FEATURE_pval', inplace=True)
        except:
            df.sort('ANOVA_FEATURE_pval', inplace=True)

        # all ANOVA have been computed individually for each drug and each
        # feature. Now, we need to compute the multiple testing corrections
        if self.settings.pvalue_correction_level == 'global':
            df = self.add_pvalues_correction(df)

        # insert a unique identifier as first column
        df.insert(0, 'ASSOC_ID', range(1, len(df) + 1))

        self.df = df
        # order the column names as defined in the __init__ method
        df = df[self.column_names]
        df.reset_index(inplace=True, drop=True)

        results = ANOVAResults()
        results.df = df
        results.settings = ANOVASettings(**self.settings)
        return results
예제 #31
0
    def __init__(self, ic50, genomic_features=None,
            drug_decode=None, verbose=True, low_memory=True,
            set_media_factor=False):
        """.. rubric:: Constructor

        :param DataFrame IC50: a dataframe with the IC50. Rows should be
            the COSMIC identifiers and columns should be the Drug names
            (or identifiers)
        :param features: another dataframe with rows as in the IC50 matrix
            and columns as features.  The first 3 columns must be named
            specifically to hold tissues, MSI (see format).
        :param drug_decode: a 3 column CSV file with drug's name and targets
            see :mod:`readers` for more information.
        :param verbose: verbosity in "WARNING", "ERROR", "DEBUG", "INFO"

        The attribute :attr:`settings` contains specific settings related
        to the analysis or visulation.
        """
        self.verbose = verbose
        self._init_called = False

        # We first need to read the IC50 using a dedicated reader
        self.ic50 = readers.IC50(ic50)

        # Create a dictionary version of the data
        # to be accessed per drug where NA have already been
        # removed. Each drug is a dictionary with 2 keys:
        # Y for the data and indices for the cosmicID where
        # there is an IC50 measured.
        ic50_parse = self.ic50.df.copy().unstack().dropna()
        self.ic50_dict = dict([(d, {'indices': ic50_parse.ix[d].index,
            'Y':ic50_parse.ix[d].values}) for d in self.ic50.drugIds])

        # Reads features if provided, otherwise use a default data set
        if genomic_features is None:
            # Reads default version provided with the package
            self.features = readers.GenomicFeatures()
        else:
            self.features = readers.GenomicFeatures(genomic_features)

        if self.features.found_media is False and \
                set_media_factor is True:
            if self.verbose:
                print('Populating MEDIA Factor in the Genomic Feature matrix')
            self.features.fill_media_factor()


        #: a CSV with 3 columns used in the report
        self.read_drug_decode(drug_decode)

        # create the multiple testing factory used in anova_all()
        self.multiple_testing = MultipleTesting()

        # We prune the genomic features by settings the cosmic ids of
        # the features to be those of the cosmic ids of the IC50. See
        # readers module. This affectation, prune the features dataframe
        # automatically. This fails if a cosmic identifier is not
        # found in the features' cosmic ids, so let us catch the error
        # before hand to give a
        unknowns = set(self.ic50.cosmicIds).difference(
                set(self.features.cosmicIds))

        if len(unknowns) > 0:
            print("WARNING: " +
                "%s cosmic identifiers in your IC50 " % len(unknowns) +
                "could not be found in the genomic feature matrix. "+
                "They will be dropped. Consider using a user-defined " +
                "genomic features matrix")

        self.ic50.drop_cosmic(list(unknowns))
        self.features.cosmicIds = self.ic50.cosmicIds
        #self.cosmicIds = self.ic50.cosmicIds

        #: an instance of :class:`~gdsctools.settings.ANOVASettings`
        self.settings = ANOVASettings()
        self.settings.low_memory = low_memory

        # alias to all column names to store results
        # cast to list (Python3).
        self.column_names = list(ANOVAResults().mapping.keys())

        # skip assoc_id for now
        self._odof_dict = dict([(name, None)
            for name in self.column_names])

        # a cache to store ANOVA results for each drug
        self.individual_anova = {}

        # must be called if ic50 or features are changed.
        self.init()
예제 #32
0
class ANOVAReport(object):
    """Class used to interpret the results and create final HTML report

    Results is a data structure returned by :meth:`ANOVA.anova_all`.

    ::

        from gdsctools import *

        # Perform the analysis itself to get a set of results (dataframe)
        an = ANOVA(ic50_test)
        results = an.anova_all()

        # now, we can create the report.
        r = ANOVAReport(gdsc=an, results=results)

        # we can tune some settings
        r.settings.pvalue_threshold = 0.001
        r.settings.FDR_threshold = 28
        r.settings.directory = 'testing'
        r.create_html_pages()


    .. rubric:: Significant association

    An association is significant if

      - The field *ANOVA_FEATURE_FDR* must be < FDR_threshold
      - The field *ANOVA_FEATURE_pval* must be < pvalue_threshold
    It is then labelled **sensible** if *FEATURE_delta_MEAN_IC50* is 
    below 0, otherwise it is **resistant**.


    """
    def __init__(self, gdsc, results=None, sep="\t", drug_decode=None, 
            verbose=True):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to report
        :param results: the results returned by :meth:`ANOVA.anova_all`. If
            not provided, the ANOVA is run on the fly.

        """
        self.verbose = verbose
        self.figtools = Savefig(verbose=False)
        self.gdsc = gdsc

        if results is None:
            results = gdsc.anova_all()
        self.df = ANOVAResults(results).df # this does a copy and sanity check


        # Make sure the DRUG are integers
        self.df.DRUG_ID = self.df.DRUG_ID.astype(int)


        self.settings = ANOVASettings()
        for k, v in gdsc.settings.items():
            self.settings[k] = v

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # maybe there was not drug_decode in the gdsc parameter,
        # so a user may have provide a file, in which case, we need
        # to update the content of the dur_decoder.
        if len(gdsc.drug_decode) == 0 and drug_decode is None:
            warnings.warn("No drug name or target will be populated."
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        elif drug_decode is not None:
            # Read a file
            self.drug_decode = DrugDecode(drug_decode)
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)
        #if sum(self.df == np.inf).sum()>0:
        #    print("WARNING: infinite values were found in your results... Set to zero")
        try:
            self.df = self.df.replace({np.inf:0, -np.inf:0})
        except:
            pass

        # create some data
        self._set_sensible_df()

        self.company = None

        # just to create the directory
        # ReportMain(directory=self.settings.directory, verbose=self.verbose)

    def _get_ndrugs(self):
        return len(self.df[self._colname_drug_id].unique())
    n_drugs = property(_get_ndrugs, doc="return number of drugs")

    def _get_ntests(self):
        return len(self.df.index)
    n_tests = property(_get_ntests)

    def _get_ncelllines(self):
        return len(self.gdsc.features.df.index)
    n_celllines = property(_get_ncelllines,
            doc="return number of cell lines")

    def _df_append(self, df, data):
        count = len(df)
        df.ix[count] = data
        return df

    def diagnostics(self):
        """Return summary of the analysis (dataframe)"""
        self._set_sensible_df()

        df = pd.DataFrame({'text': [], 'value': []})

        n_features = len(self.gdsc.features.df.columns)
        n_features -= self.gdsc.features.shift
        n_drugs = len(self.df[self._colname_drug_id].unique())

        N = float(n_drugs * n_features)

        if N == 0:
            ratio = 0
        else:
            ratio = float(self.n_tests)/(N) * 100

        try:
            ratio = easydev.precision(ratio, digit=2)
        except:
            # Fixme: this is a hack for the infinite values but should not
            # happen...
            ratio = 0

        msg = "Type of analysis"
        df = self._df_append(df, [msg, self.settings.analysis_type])

        msg = "Total number of possible drug/feature associations"
        df = self._df_append(df, [msg, int(N)])
        msg = "Total number of ANOVA tests performed"
        df = self._df_append(df, [msg, self.n_tests])
        msg = "Percentage of tests performed"
        df = self._df_append(df, [msg, ratio])

        # trick to have an empty line
        df = self._df_append(df, ["", ""])

        msg = "Total number of tested drugs"
        df = self._df_append(df, [msg, n_drugs])
        msg = "Total number of genomic features used"
        df = self._df_append(df, [msg, n_features])

        msg = "Total number of screened cell lines"
        df = self._df_append(df, [msg, self.n_celllines])

        msg = "MicroSatellite instability included as factor"
        msi = self.settings.include_MSI_factor
        df = self._df_append(df, [msg, msi])

        # trick to have an empty line
        df = self._df_append(df, ["", ""])
        nsens = len(self.sensible_df)
        nres = len(self.resistant_df)
        msg = "Total number of significant associations"
        df = self._df_append(df, [msg, nsens+nres])
        msg = " - sensitive"
        df = self._df_append(df, [msg, nsens])
        msg = " - resistant"
        df = self._df_append(df, [msg, nres])

        msg = "p-value significance threshold"
        df = self._df_append(df, [msg, self.settings.pvalue_threshold])

        msg = "FDR significance threshold"
        df = self._df_append(df, [msg, self.settings.FDR_threshold])

        p1, p2 = self._get_pval_range()
        msg = 'Range of significant p-values'
        value = "[{:.4}, {:.4}]".format(p1, p2)
        df = self._df_append(df, [msg, value])

        f1, f2 = self._get_fdr_range()
        msg = "Range of significant % FDRs"
        value = '[{:.4} {:.4}]'.format(f1, f2)
        df = self._df_append(df, [msg, value])
        return df

    def _get_pval_range(self):
        """Get pvalues range of the significant hits"""
        nsens = len(self.sensible_df)
        nres = len(self.resistant_df)
        N = nsens + nres
        if N == 0:
            return 0., 0.
        name = self.varname_pval
        data = self.df[name].ix[0:N-1]
        m, M = data.min(), data.max()
        return m, M

    def _get_fdr_range(self):
        """Get FDR range of the significant hits"""
        name = self.varname_qval
        data = self.df[name][(self.df[name] < self.settings.FDR_threshold)]
        if len(data) == 0:
            return 0., 0.
        m, M = data.min(), data.max()
        return m, M

    def _set_sensible_df(self):
        # just an alias
        logand = np.logical_and

        # select sensible data set
        mask1 = self.df['ANOVA_FEATURE_FDR'] < self.settings.FDR_threshold
        mask2 = self.df['ANOVA_FEATURE_pval'] < self.settings.pvalue_threshold
        mask3 = self.df['FEATURE_delta_MEAN_IC50'] < 0
        self.sensible_df = self.df[logand(logand(mask1, mask2), mask3)]

        # select resistant data set
        mask3 = self.df['FEATURE_delta_MEAN_IC50'] >= 0
        self.resistant_df = self.df[logand(logand(mask1, mask2), mask3)]

    def get_significant_set(self):
        """Return significant hits (resistant and sensible)"""
        # a property that is long to compute
        # and may change if FDR changes.
        self._set_sensible_df()
        df = pd.concat([self.sensible_df, self.resistant_df])
        try:
            df.sort_values('ASSOC_ID', inplace=True)
        except:
            df.sort('ASSOC_ID', inplace=True)
        return df

    def _get_data(self, df_count_sensible, df_count_resistant):
        # we can drop all columns except one, which is renamed as count
        df1 = df_count_sensible['ASSOC_ID']
        df1.name = 'sens assoc'
        df2 = df_count_resistant['ASSOC_ID']
        df2.name = 'res assoc'

        # Now, we join the two TimeSeries (note that above, we selected only
        # one column so the dataframe was downcast to time series)
        df_count = pd.DataFrame(df1).join(pd.DataFrame(df2), how='outer')
        # and set NA to zero
        df_count.fillna(0, inplace=True)
        # let us add a new column with the total
        df_count['total'] = df_count['sens assoc'] + df_count['res assoc']

        # we want to sort by 'total' column and is equality by the name,
        # which is the index. So let us add the index temporarily as
        # a column, sort, and remove 'name' column afterwards
        df_count['name'] = df_count.index
        try:
            df_count.sort_values(by=['total', 'name'], 
                    ascending=[False, True], inplace=True)
        except:
            df_count.sort(columns=['total', 'name'], 
                    ascending=[False, True], inplace=True)

        df_count.drop('name', axis=1, inplace=True)
        return df_count

    def get_drug_summary_data(self):
        """Return dataframe with drug summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()

        # group by drug
        colname = self._colname_drug_id
        df_count_sensible = self.sensible_df.groupby(colname).count()
        df_count_resistant = self.resistant_df.groupby(colname).count()

        df_count = self._get_data(df_count_sensible, df_count_resistant)
        return df_count

    def drug_summary(self,  top=50, fontsize=15, filename=None):
        """Return dataframe with significant drugs and plot figure

        :param fontsize:
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
        """

        df_count = self.get_drug_summary_data()

        if len(df_count):
            self._plot(df_count, 'drug', top)
            fig = pylab.gcf()
            self.figtools.directory = self.settings.directory
            self.figtools.savefig(filename, size_inches=(12, 14),
                    bbox_inches='tight')
        return df_count

    def get_feature_summary_data(self):
        """Return dataframe with feature summary"""
        # get sensible and resistant sub dataframes
        self._set_sensible_df()
        df_count_sensible = self.sensible_df.groupby('FEATURE').count()
        df_count_resistant = self.resistant_df.groupby('FEATURE').count()
        df_count = self._get_data(df_count_sensible, df_count_resistant)
        return df_count

    def feature_summary(self, filename=None, top=50, fontsize=15):
        """Return dataframe with significant features and plot figure

        :param fontsize:
        :param top: max number of significant associations to show
        :param filename: if provided, save the file in the directory
        """
        df_count = self.get_feature_summary_data()

        if len(df_count) > 0:
            self._plot(df_count, 'feature', top)
            #fig = pylab.gcf()
            self.figtools.directory = self.settings.directory
            self.figtools.savefig(filename, set_inches=(12, 14),
                    bbox_inches='tight')
        return df_count

    def _plot(self, df_count, title_tag, top):
        """Used by drug_summary and feature_summary to plot the
        bar plot"""
        if top > len(df_count):
            top = len(df_count)

        df = df_count.iloc[0:top][[u'sens assoc', u'res assoc']]
        labels = list(df.index)
        # add drug name
        if len(self.drug_decode) > 0:
            for i, label in enumerate(labels):
                if title_tag == 'drug':
                    name = self.drug_decode.get_name(label)
                    if name is not None:
                        labels[i] = "{}-{}".format(labels[i], name)
                else:
                    pass

        labels = [str(x).replace('_', ' ') for x in labels]
        # restrict size to first 30 characters
        labels = [x[0:30] for x in labels]
        ind = range(0, len(labels))
        # reverse does not exist with python3
        try:
            ind.reverse()
        except:
            ind = list(ind)
            ind.reverse()
        data1 = df['sens assoc'].values
        data2 = df['res assoc'].values
        pylab.figure(1)
        pylab.clf()
        p1 = pylab.barh(ind, data1, height=0.8, color='purple',
                        label='sensitivity')
        p2 = pylab.barh(ind, data2, height=0.8, color='orange',
                        left=data1, label='resistance')
        ax = pylab.gca()
        self.labels = labels
        ax.set_yticks([x + 0.5 for x in ind])
        ax.set_yticklabels(labels, fontsize=12)

        xticks = ax.get_xticks()
        ax.set_xticklabels(
                [int(x) if divmod(x,1)[1] == 0 else "" for x in xticks])


        pylab.grid()
        pylab.title(r"Top %s %s(s) most frequently " % (top, title_tag) + \
                    "\nassociated with drug  response",
                    fontsize=self.settings.fontsize/1.2)
        pylab.xlabel(r'Number of significant associations (FDR %s %s %s) '
                     % ("$>$", self.settings.FDR_threshold, "$\%$"),
                     fontsize=18)

        M = max(data1+data2)
        #ax.set_xticks()
        #ax.set_xticklabels(labels, fontsize=fontsize)
        ax.set_xlim([0, M+1])
        pylab.legend(loc='lower right')
        try:pylab.tight_layout()
        except:pass

    def get_significant_hits(self,  show=True):
        """Return a summary of significant hits

        :param show: show a plot with the distribution of significant hits

        .. todo:: to finalise
        """
        fdrs = range(5, 50+1, 5)

        significants = []
        significant_meaningful = []
        strong_hits = []
        full_strong_hits = []

        MC1 = 1
        MC2 = 2
        mask2 = self.df['FEATURE_pos_logIC50_MEAN'] < MC1
        mask3 = self.df['FEATURE_pos_logIC50_MEAN'] < MC2
        mask4 = self.df['FEATURE_neg_logIC50_MEAN'] < MC1
        mask5 = self.df['FEATURE_neg_logIC50_MEAN'] < MC2
        maskMC = mask2 + mask3 + mask4 + mask5

        for fdr in fdrs:
            # significant hits
            res = self.df['ANOVA_FEATURE_FDR'] < fdr
            significants.append(res.sum())

            # meaningful hits
            indices = np.logical_and(self.df['ANOVA_FEATURE_FDR'] < fdr,
                    maskMC)
            significant_meaningful.append(indices.sum())

            # meaningful strong hits
            mask1 = self.df.ix[indices]['FEATURE_pos_Glass_delta'] >= 1
            mask2 = self.df.ix[indices]['FEATURE_neg_Glass_delta'] >= 1
            strong_hits.append(np.logical_or(mask1, mask2).sum())

            # meaningful full strong hits
            mask1 = self.df.ix[indices]['FEATURE_pos_Glass_delta'] >= 1
            mask2 = self.df.ix[indices]['FEATURE_neg_Glass_delta'] >= 1
            full_strong_hits.append(np.logical_and(mask1, mask2).sum())

        data = {'significants': significants,
                'full_strong_hits': full_strong_hits,
               'strong_hits': strong_hits,
               'significant_meaningful': significant_meaningful}

        df = pd.DataFrame(data, columns = ['significants',
            'significant_meaningful', 'strong_hits', 'full_strong_hits'],
            index=fdrs)
        df.columns = ['1) significant', '2) 1 + meaningful',
        '3) 2 + strong', '4) 2+ very strong']

        if show is True:
            pylab.clf()
            ax = pylab.gca()
            df.plot(kind='bar', width=.8,
                    color=['r', 'gray', 'orange', 'black'],
                    rot=0, ax=ax)
            pylab.grid()
        # original is 'aquamarine4','cyan2','cornflowerblue    ','aquamarine'),
        return df

    def __str__(self):
        self.df.info()
        return ""

    def create_html_associations(self):
        """Create an HTML page for each significant association

        The name of the output HTML file is **<association id>.html**
        where association id is stored in :attr:`df`.

        """
        if self.verbose:
            print("Creating individual HTML pages for each significant association")
        df = self.get_significant_set()

        drugs = df['DRUG_ID'].values
        features = df['FEATURE'].values
        assocs = df['ASSOC_ID'].values
        fdrs = df['ANOVA_FEATURE_FDR'].values

        N = len(df)
        pb = Progress(N)

        html = Association(self, drug='dummy', feature='dummy', fdr='dummy',
                company=self.company)

        for i in range(N):
            html.drug = drugs[i]
            html.feature = features[i]
            if str(assocs[i]).startswith("a"):
                html._filename = str(assocs[i]) + '.html'
            else:
                html._filename = "a" + str(assocs[i]) + '.html'
            html.fdr = fdrs[i]
            html.assoc_id = assocs[i]
            #html._init_report() # since we have one shared instance
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i+1)
        if self.settings.animate: print("\n")

    def create_html_features(self):
        """Create an HTML page for each significant feature"""
        df = self.get_significant_set()
        groups = df.groupby('FEATURE')
        if self.verbose:
            print("Creating individual HTML pages for each feature")
        N = len(groups.indices.keys())
        pb = Progress(N)
        for i, feature in enumerate(groups.indices.keys()):
            # get the indices and therefore subgroup
            subdf = groups.get_group(feature)
            html = HTMLOneFeature(self, self.df, subdf, feature)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i+1)
        if self.settings.animate: print("\n")

    def create_html_drugs(self):
        """Create an HTML page for each drug"""
        # group by drugs
        all_drugs = list(self.df['DRUG_ID'].unique())

        df = self.get_significant_set()
        groups = df.groupby('DRUG_ID')
        if self.verbose:
            print("Creating individual HTML pages for each drug")
        N = len(groups.indices.keys())
        N = len(all_drugs)
        pb = Progress(N)
        for i, drug in enumerate(all_drugs):
            # enumerate(groups.indices.keys()):
            # get the indices and therefore subgroup
            if drug in groups.groups.keys():
                subdf = groups.get_group(drug)
            else:
                subdf = {}

            html = HTMLOneDrug(self, self.df, subdf, drug)
            html.create_report(onweb=False)
            if self.settings.animate:
                pb.animate(i+1)
        if self.settings.animate: print("\n")

    def create_html_main(self, onweb=False):
        """Create HTML main document (summary)"""
        self._set_sensible_df()

        if self.verbose:
            print("Creating main HTML page in directory %s" %
                (self.settings.directory))
        ReportMain(directory=self.settings.directory, verbose=self.verbose)

        buffer_ = self.settings.savefig
        self.settings.savefig = True
        html = HTMLPageMain(self, 'index.html')
        html._init_report() # created the directory
        html.create_report(onweb=onweb)
        self.settings.savefig = buffer_

    def create_html_manova(self, onweb=True):
        """Create summary table with all significant hits

        :param onweb: open browser with the created HTML page.

        """
        df = self.get_significant_set()
        page = HTMLPageMANOVA(self, df, self.company)
        page.create_report(onweb)

    def create_html_pages(self, onweb=True):
        """Create all HTML pages"""
        self.create_html_main(onweb=onweb)
        self.create_html_manova(onweb=False)
        self.create_html_drugs()
        self.create_html_features()
        self.create_html_associations()

    def onweb(self):
        from easydev import onweb
        onweb(self.settings.directory + os.sep + 'index.html')