示例#1
0
class GDSC(GDSCBase):
    """Wrapper of the :class:`~gdcstools.anova.ANOVA` class and reports to
    analyse all TCGA Tissues and companies automatically while creating summary
    HTML pages.

    First, one need to provide an unique IC50 file. Second, the DrugDecode
    file (see :class:`~gdsctools.readers.DrugDecode`) must be provided
    with the DRUG identifiers and their corresponding names. Third,
    a set of genomic feature files must be provided for each :term:`TCGA`
    tissue.


    You then create a GDSC instance::

        from gdsctools import GDSC
        gg = GDSC('IC50_v18.csv', 'DRUG_DECODE.txt',
            genomic_feature_pattern='GF*csv')

    At that stage you may want to change the settings, e.g::

        gg.settings.FDR_threshold = 20

    Then run the analysis::

        gg.analysis()

    This will launch an ANOVA analysis for each TCGA tissue + PANCAN case
    if provided. This will also create a data package for each tissue.
    The data packages are stored in ./tissue_packages directory.

    Since all private and public drugs are stored together, the next step is 
    to create data packages for each company::

        gg.create_data_packages_for_companies()

    you may select a specific one if you wish::

        gg.create_data_packages_for_companies(['AZ'])

    Finally, create some summary pages::

        gg.create_summary_pages()

    You entry point is an HTML file called **index.html**
    """
    def __init__(self,
                 ic50,
                 drug_decode,
                 genomic_feature_pattern="GF_*csv",
                 main_directory="tissue_packages",
                 verbose=True):
        """.. rubric:: Constructor

        :param ic50: an :class:`~gdsctools.readers.IC50` file.
        :param drug_decode: an :class:`~gdsctools.readers.DrugDecode` file.
        :param genomic_feature_pattern: a glob to a set of
            :class:`~gdsctools.readers.GenomicFeature` files.

        """
        super(GDSC, self).__init__(genomic_feature_pattern, verbose=verbose)
        assert isinstance(ic50, str)
        self.ic50_filename = ic50
        self.dd_filename = drug_decode
        self.main_directory = main_directory

        self.settings = ANOVASettings()
        self.settings.animate = False
        self.drug_decode = DrugDecode(drug_decode)

        print("Those settings will be used (check FDR_threshold)")
        print(self.settings)

        # figure out the cancer types:
        self.results = {}

        self.company_directory = "company_packages"

        # quick test on 15 features
        self.test = False

    def analyse(self, multicore=None):
        """Launch ANOVA analysis and creating data package for each tissue.

        :param bool onweb: By default, reports are created
            but HTML pages not shown. Set to True if you wish to open
            the HTML pages.
        :param multicore: number of cpu to use (1 by default)

        """
        self.mkdir(self.main_directory)
        # First analyse all TCGA cases + PANCAN once for all and
        # store all the results in a dictionary.
        self._analyse_all(multicore=multicore)

    def _analyse_all(self, multicore=None):
        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print(purple('======================== Analysing %s data' % tcga))

            self.mkdir(self.main_directory + os.sep + tcga)
            # Computes the ANOVA
            try:
                self.ic50 = IC50(self.ic50_filename)
            except:
                print("Clustering IC50 (v18 released data ?)")
                self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)
            an = ANOVA(self.ic50, gf_filename, self.drug_decode, verbose=False)

            if self.test is True:
                an.features.df = an.features.df[an.features.df.columns[0:15]]

            self.an = an
            an.settings = ANOVASettings(**self.settings)
            an.settings.analysis_type = tcga
            an.init()  # This reset the directory

            results = an.anova_all(multicore=multicore)
            an.settings.directory = self.main_directory + os.sep + tcga
            # Store the results
            self.results[tcga] = results

            print('Analysing %s data and creating images' % tcga)
            self.report = ANOVAReport(an)
            self.report.settings.savefig = True

            self.report.create_html_pages(onweb=False)

    def create_data_packages_for_companies(self, companies=None):
        """Creates a data package for each company found in the DrugDecode file
        """
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################

        # companies must be just one name (one string) or a list of strings
        # By default, takes all companies found in DrugDecode
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        if len(companies) == 0:
            raise ValueError(
                "Could not find any companies in the DrugDecode file")

        # The main directory
        self.mkdir(self.company_directory)

        # Loop over all companies, retrieving information built
        # in analyse() method, selecting for each TCGA all information
        # for that company only (and public drugs)
        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print(
                purple("\n=========== Analysing company %s out of %s (%s)" %
                       (ii + 1, Ncomp, company)))
            self.mkdir(self.company_directory + os.sep + company)

            # Handle each TCGA case separately
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print(brown("  ------- building TCGA %s sub directory" % tcga))

                # Read the results previously computed either
                try:
                    results_df = self.results[tcga].df.copy()
                except:
                    results_path = "%s/%s/OUTPUT/results.csv" % (
                        self.main_directory, tcga)
                    results_df = ANOVAResults(results_path)

                # MAke sure the results are formatted correctly
                results = ANOVAResults(results_df)

                # Get the DrugDecode information for that company only
                drug_decode_company = self.drug_decode.df.query(
                    "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)

                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)

                # Filter the results to keep only public drugs and that
                # company. Make sure this is integers
                results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)

                mask = [
                    True if x in drug_decode_company.df.index else False
                    for x in results.df.DRUG_ID
                ]

                results.df = results.df.ix[mask]

                # We read the IC50 again
                try:
                    self.ic50 = IC50(self.ic50_filename)
                except:
                    self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

                # And create an ANOVA instance. This is not to do the analyse
                # again but to hold various information
                an = ANOVA(self.ic50,
                           gf_filename,
                           drug_decode_company,
                           verbose=False)

                def drug_to_keep(drug):
                    to_keep = drug in drug_decode_company.df.index
                    return to_keep

                an.ic50.df = an.ic50.df.select(drug_to_keep, axis=1)

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = self.company_directory + os.sep + company + os.sep + tcga
                an.settings.analysis_type = tcga

                # Now we create the report
                self.report = ANOVAReport(an,
                                          results,
                                          drug_decode=drug_decode_company,
                                          verbose=self.verbose)
                self.report.company = company
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)
                self.report.create_html_features()
                self.report.create_html_drugs()
                self.report.create_html_associations()

    def _get_tcga(self):
        return [x.split("_")[1].split(".")[0] for x in self.gf_filenames]

    tcga = property(_get_tcga)

    def _get_companies(self):
        return [x for x in self.drug_decode.companies if x != 'Commercial']

    companies = property(_get_companies)

    def create_summary_pages(self):
        """Create summary pages

        Once the main analyis is done (:meth:`analyse`), and the company
        packages have been created (:meth:`create_data_packages_for_companies`),
        you can run this method that will creade a summary HTML page
        (index.html) for the tissue, and a similar summary HTML page for the
        tissues of each company. Finally, an HTML summary page for the 
        companies is also created.

        The final tree direcorty looks like::


            |-- index.html
            |-- company_packages
            |   |-- index.html
            |   |-- Company1
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |   |-- Company2
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |-- tissue_packages
            |   |-- index.html
            |   |-- Tissue1
            |   |-- Tissue2


        """
        # First for the main directory (tissue_packages):
        print(purple("Creating summary index.html for the tissues"))
        self._create_summary_pages(self.main_directory, verbose=False)

        # Then for each companies:
        print(purple("Creating summary index.html for each company"))
        pb = Progress(len(self.companies))
        for i, company in enumerate(self.companies):
            try:
                self._create_summary_pages(self.company_directory + os.sep +
                                           company,
                                           verbose=False,
                                           company=company)
            except Exception as err:
                print(
                    red("Issue with %s. Continue with other companies" %
                        company))
                print(err)
            pb.animate(i + 1)

        # Finally, an index towards each company
        self._create_main_index()

    def _create_main_index(self):
        # We could also add a column with number of association ?
        companies = self.companies[:]
        df = pd.DataFrame({"Company": companies})
        html_page = ReportMain(directory=".",
                               filename='index.html',
                               template_filename='main_summary.html',
                               mode="summary")
        html_table = HTMLTable(df)
        html_table.add_href('Company',
                            newtab=True,
                            url="company_packages/",
                            suffix="/index.html")
        html_page.jinja['data_table'] = html_table.to_html(
            collapse_table=False)
        html_page.jinja['analysis_domain'] = "All companies / All "
        html_page.jinja['tissue_directory'] = self.main_directory
        html_page.write()

    def _create_summary_pages(self,
                              main_directory,
                              verbose=True,
                              company=None):
        # Read all directories in tissue_packages

        directories = glob.glob(main_directory + os.sep + '*')

        summary = []
        for directory in sorted(directories):
            tcga = directory.split(os.sep)[-1]
            if tcga not in self.tcga:
                continue
            if verbose:
                print(directory, tcga)
            # number of hits
            path = directory + os.sep + 'OUTPUT' + os.sep
            try:
                hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
            except:
                summary.append([tcga] + [None] * 5)
                continue
            total_hits = hits.total.sum()

            drug_involved = hits['Unnamed: 0'].unique()

            results = ANOVAResults(path + 'results.csv')
            if len(results) > 0:
                drug_ids = results.df.DRUG_ID.unique()
            else:
                drug_ids = []

            # where to find the DRUG DECODE file. Should
            # have been copied
            path = directory + os.sep + 'INPUT' + os.sep
            drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
            info = drug_decode.get_info()

            webrelease = drug_decode.df.ix[drug_involved].WEBRELEASE
            drug_inv_public = sum(webrelease == 'Y')
            drug_inv_prop = sum(webrelease != 'Y')

            summary.append([
                tcga, total_hits, drug_inv_prop, info['N_prop'],
                drug_inv_public, info['N_public']
            ])
        df = pd.DataFrame(summary)

        df.columns = [
            'Analysis name', 'Number of hits',
            'Number of involved proprietary compounds', 'out of',
            'Number of involved public', 'out of'
        ]

        try:
            df.sort_values(by="Number of hits", ascending=False, inplace=True)
        except:
            df.sort("Number of hits", ascending=False, inplace=True)

        output_dir = main_directory + os.sep + '..' + os.sep
        output_file = output_dir + os.sep + 'index.html'
        self.html_page = ReportMain(directory=main_directory,
                                    filename='index.html',
                                    template_filename='datapack_summary.html',
                                    mode="summary")

        # Let us use our HTMLTable to add the HTML references
        self.html_table = HTMLTable(df)
        self.html_table.add_href('Analysis name',
                                 newtab=True,
                                 url=None,
                                 suffix='/index.html')
        self.html_table.add_bgcolor('Number of hits')

        self.html_page.jinja['data_table'] = self.html_table.to_html(
            collapse_table=False)
        if company:
            self.html_page.jinja["collaborator"] = company

        self.html_page.write()

        return df
示例#2
0
class GDSC(GDSCBase):
    """Alias to ANOVA class with default settings to loop over all TCGA Tissues
    and comnpanies.

    Reads

    1. Nf Genomic feature files for different TCGA types.
    2. a unique DRUG DECODER file
    3. a unique IC50 file

    and perform the Nf analysis saving results in appropriate files.

    Then split the data for each companies.

    It also converts tissue names into TCGA names.

    .. todo:: How to get the GF files

    First, create all main analysis that include all drugs::

        gg = GDSC('IC50_v18.csv', 'DRUG_DECODE.txt',
            genomic_feature_pattern='GF*csv')
        # identifies all genomic features GF* that contains specific TCGA GF
        gg.run() # This will take hours depending on the number of drugs.


        # On v18, On an i7 core using 1 CPU this tqkes about 1 hour.30 minutes
        # THe PANCAN data set is the largest and takes about 1 hour itself

    You should now have a directory called **ALL** with about 20 directories for
    each TCGA GF file. Keep that in a safe place or you will have to restart
    the analysis

    Second, split those data just created for each specific proprietary 
    compounds. For instance::

        gg.create_data_packages_for_companies(['AZ'])
    
    or for all in one go::

        gg.create_data_packages_for_companies()


    Third, create some summary pages::

        from gdsctools.gdsc import GDSCDirectorySummary()
        gs = GDSCDirectorySummary()
        gs.create_summary_pages('ALL')
        for company in gg.companies:
            gs.create_summary_pages(company)

    The last step is fast but the whole process of analyse and image 
    creation is very long. 



    """
    def __init__(self, ic50, drug_decode,
            genomic_feature_pattern="GF_*csv",
            mode='standard'):
        super(GDSC, self).__init__(genomic_feature_pattern, verbose=True)
        self.debug = False
        self.ic50_filename = ic50
        self.dd_filename = drug_decode

        if mode == 'v18':
            self.ic50 = IC50Cluster(ic50)
        else:
            self.ic50 = IC50(ic50)

        self.drug_decode = DrugDecode(drug_decode)
        self.settings = ANOVASettings()
        self.settings.low_memory = True

        if mode == 'v18':
            self.settings.FDR_threshold = 35

        print("Those settings will be used (note that low_memory is set "
              "to True and check the value of FDR_threshold (set to 35 in v18)")
        print(self.settings)

        # figure out the cancer types:
        self.results = {}

    def run(self):
        self.mkdir('ALL')
        # First analyse all case of TCGA + PANCAN once for all and
        # store all results in a dictionary.
        self._analyse_all()

    def _analyse_all(self):
        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print('================================ Analysing %s data' % tcga)

            self.mkdir('ALL' + os.sep + tcga)

            # Computes the ANOVA
            an = ANOVA(self.ic50_filename, gf_filename, self.drug_decode)
            self.an = an
            an.settings = ANOVASettings(**self.settings)
            an.init() # reset the analysis_type automatically
            results = an.anova_all()

            # Store the results
            self.results[tcga] = results

            print('Analysing %s data and creating images' % tcga)
            self.report = ANOVAReport(an, self.results[tcga])
            self.report.settings.savefig = True
            self.report.settings.directory = 'ALL/' + tcga
            self.report.settings.analysis_type = tcga
            self.report.create_html_pages()

    def create_data_packages_for_companies(self, companies=None):
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print("\n\n========= Analysing company %s out of %s (%s)" %
                    (ii+1, Ncomp, company))
            self.mkdir(company)
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print("---------------- for TCGA %s" % tcga)

                # Read the results previously computed
                try:
                    results_df = self.results[tcga].df.copy()
                except:
                    results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                    print("Downloading results from %s" % results_path)
                    results_df = ANOVAResults(results_path)

                results = ANOVAResults(results_df)

                # Get a DrugDecode for that company
                drug_decode_company = self.drug_decode.df.query(
                        "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)

                # filter results using the new drug decode
                drug_ids_in_results = get_drug_id(results.df.DRUG_ID)

                mask = [True if x in drug_decode_company.df.index else False
                        for x in drug_ids_in_results]

                results.df = results.df.ix[mask]

                # Just to create an instance with the subset of drug_decode
                # and correct settings. This is also used to store
                # the entire input data set. So, we must remove all drugs
                # not relevant for the analysis of this company
                an = ANOVA(self.ic50_filename, gf_filename, drug_decode_company)

                def drug_to_keep(drug):
                    to_keep = get_drug_id(drug) in drug_decode_company.df.index
                    return to_keep
                an.ic50.df = an.ic50.df.select(drug_to_keep, axis=1) 

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = company + os.sep + tcga
                an.settings.analysis_type = tcga
                self.report = ANOVAReport(an, results)
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)

                if self.debug is False:
                    self.report.create_html_features()
                    self.report.create_html_associations()

                    # For now, we just copy all DRUG images from 
                    # the analysis made in ALL 
                    from easydev import shellcmd, Progress
                    print("\nCopying drug files")
                    drug_ids = results.df.DRUG_ID.unique()
                    pb = Progress(len(drug_ids))
                    for i, drug_id in enumerate(drug_ids):
                        # copy the HTML
                        filename = "%s.html" % drug_id
                        source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                        dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep )
                        cmd = "cp %s%s %s" % (source, filename, dest )
                        shellcmd(cmd, verbose=False)
                        #copy the images
                        filename = "volcano_%s.*" % drug_id
                        source = "ALL%s%s%simages%s" % (os.sep, tcga,
                                os.sep, os.sep)
                        dest = "%s%s%s%simages%s" % (company, os.sep,
                                tcga, os.sep , os.sep)
                        cmd = "cp %s%s %s" % (source, filename, dest )
                        shellcmd(cmd, verbose=False)
                        pb.animate(i+1)

    def _get_tcga(self):
        return [x.split("_")[1].split(".")[0] for x in self.gf_filenames]
    tcga = property(_get_tcga)

    def _get_companies(self):
        return [x for x in self.drug_decode.companies if x != 'Commercial']
    companies = property(_get_companies)

    def create_summary_pages(self, main_directory='ALL'):
        # Read in ALL all directories

        # create directories and copy relevant files
        self.mkdir(main_directory + os.sep + 'images')
        self.mkdir(main_directory + os.sep + 'css')
        self.mkdir(main_directory + os.sep + 'js')
        from gdsctools import gdsctools_data
        for filename in ['gdsc.css', 'github-gist.css']:
            target = os.sep.join([main_directory, 'css', filename ])
            if os.path.isfile(target) is False:
                filename = gdsctools_data(filename)
                shutil.copy(filename, target)
        for filename in ['highlight.pack.js']:
            target = os.sep.join([main_directory, 'js', filename ])
            if os.path.isfile(target) is False:
                filename = gdsctools_data(filename)
                shutil.copy(filename, target)
        for filename in ['EBI_logo.png', 'sanger-logo.png']:
            target = os.sep.join([main_directory, 'images', filename ])
            if os.path.isfile(target) is False:
                dire = 'data' + os.sep + 'images'
                filename = gdsctools_data("images" + os.sep +filename)
                shutil.copy(filename, target)
        directories = glob.glob('ALL' + os.sep + '*')
        directories = [x for x in directories if os.path.isdir(x)]

        summary = []
        for directory in sorted(directories):

            tcga = directory.split(os.sep)[1]
            if tcga in ['css', 'images']:
                continue

            # number of hits
            path = directory + os.sep + 'OUTPUT' + os.sep
            try:
                hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
            except:
                summary.append([tcga] + [None] * 5)
                continue
            total_hits = hits.total.sum()

            drug_involved = get_drug_id(hits['Unnamed: 0'].unique())

            results = ANOVAResults(path + 'results.csv')
            if len(results)>0:
                drug_ids = get_drug_id(results.df.DRUG_ID.unique())
            else:
                drug_ids = []

            path = directory + os.sep + 'INPUT' + os.sep
            drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
            info = drug_decode.get_info()

            webrelease = drug_decode.df.ix[drug_involved].WEBRELEASE
            drug_inv_public = sum(webrelease == 'Y')
            drug_inv_prop = sum(webrelease != 'Y')
            

            summary.append([tcga, total_hits,
                drug_inv_prop, info['N_prop'], 
                drug_inv_public, info['N_public']])
        df = pd.DataFrame(summary)
        df.columns = ['Analysis name', 'Number of hits', 
            'Number of involved proprietary compounds', 'out of',
            'Number of involved public', 'out of']


        # FIXME include css and images of logo
        # FIXME save in the proper directory
        output_dir = main_directory + os.sep + '..' + os.sep
        output_file = output_dir + os.sep + 'index.html'
        self.html_page = ReportMAIN(directory='ALL', filename='index.html',
                template_filename='datapack_summary.html' )

        # Let us use our HTMLTable to add the HTML references
        from gdsctools.report import HTMLTable
        self.html_table = HTMLTable(df)
        self.html_table.add_href('Analysis name', newtab=True, url=None,
                suffix='/index.html')
        #html_table.add_bgcolor('Number of hits')

        self.html_page.jinja['data_table'] = self.html_table.to_html()
        self.html_page.write()

        return df

    def load_results(self):
        """Find the files results.csv in all TCGA directories"""
        for tcga in self.tcga:
            print(tcga)
            self.results[tcga] = ANOVAResults('ALL' + os.sep + tcga + os.sep +
                    'OUTPUT' + os.sep + 'results.csv')
示例#3
0
class GDSC(GDSCBase):
    """Wrapper of the :class:`~gdcstools.anova.ANOVA` class and reports to
    analyse all TCGA Tissues and companies automatically while creating summary
    HTML pages.

    First, one need to provide an unique IC50 file. Second, the DrugDecode
    file (see :class:`~gdsctools.readers.DrugDecode`) must be provided
    with the DRUG identifiers and their corresponding names. Third,
    a set of genomic feature files must be provided for each :term:`TCGA`
    tissue.


    You then create a GDSC instance::

        from gdsctools import GDSC
        gg = GDSC('IC50_v18.csv', 'DRUG_DECODE.txt',
            genomic_feature_pattern='GF*csv')

    At that stage you may want to change the settings, e.g::

        gg.settings.FDR_threshold = 20

    Then run the analysis::

        gg.analysis()

    This will launch an ANOVA analysis for each TCGA tissue + PANCAN case
    if provided. This will also create a data package for each tissue.
    The data packages are stored in ./tissue_packages directory.

    Since all private and public drugs are stored together, the next step is 
    to create data packages for each company::

        gg.create_data_packages_for_companies()

    you may select a specific one if you wish::

        gg.create_data_packages_for_companies(['AZ'])

    Finally, create some summary pages::

        gg.create_summary_pages()

    You entry point is an HTML file called **index.html**
    """
    def __init__(self, ic50, drug_decode,
            genomic_feature_pattern="GF_*csv",
            main_directory="tissue_packages", verbose=True):
        """.. rubric:: Constructor

        :param ic50: an :class:`~gdsctools.readers.IC50` file.
        :param drug_decode: an :class:`~gdsctools.readers.DrugDecode` file.
        :param genomic_feature_pattern: a glob to a set of
            :class:`~gdsctools.readers.GenomicFeature` files.

        """
        super(GDSC, self).__init__(genomic_feature_pattern, verbose=verbose)
        assert isinstance(ic50, str)
        self.ic50_filename = ic50
        self.dd_filename = drug_decode
        self.main_directory = main_directory

        self.settings = ANOVASettings()
        self.settings.animate = False
        self.drug_decode = DrugDecode(drug_decode)

        print("Those settings will be used (check FDR_threshold)")
        print(self.settings)

        # figure out the cancer types:
        self.results = {}

        self.company_directory = "company_packages"

        # quick test on 15 features
        self.test = False

    def analyse(self, multicore=None):
        """Launch ANOVA analysis and creating data package for each tissue.

        :param bool onweb: By default, reports are created
            but HTML pages not shown. Set to True if you wish to open
            the HTML pages.
        :param multicore: number of cpu to use (1 by default)

        """
        self.mkdir(self.main_directory)
        # First analyse all TCGA cases + PANCAN once for all and
        # store all the results in a dictionary.
        self._analyse_all(multicore=multicore)

    def _analyse_all(self, multicore=None):
        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print(purple('======================== Analysing %s data' % tcga))

            self.mkdir(self.main_directory + os.sep + tcga)
            # Computes the ANOVA
            try:
                self.ic50 = IC50(self.ic50_filename)
            except:
                print("Clustering IC50 (v18 released data ?)")
                self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)
            an = ANOVA(self.ic50, gf_filename, self.drug_decode,
                verbose=False)

            if self.test is True:
                an.features.df = an.features.df[an.features.df.columns[0:15]]

            self.an = an
            an.settings = ANOVASettings(**self.settings)
            an.settings.analysis_type = tcga
            an.init() # This reset the directory

            results = an.anova_all(multicore=multicore)
            an.settings.directory = self.main_directory + os.sep + tcga
            # Store the results
            self.results[tcga] = results

            print('Analysing %s data and creating images' % tcga)
            self.report = ANOVAReport(an)
            self.report.settings.savefig = True

            self.report.create_html_pages(onweb=False)

    def create_data_packages_for_companies(self, companies=None):
        """Creates a data package for each company found in the DrugDecode file
        """
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################

        # companies must be just one name (one string) or a list of strings
        # By default, takes all companies found in DrugDecode
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        if len(companies) == 0:
            raise ValueError("Could not find any companies in the DrugDecode file")

        # The main directory
        self.mkdir(self.company_directory)

        # Loop over all companies, retrieving information built
        # in analyse() method, selecting for each TCGA all information
        # for that company only (and public drugs)
        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print(purple("\n=========== Analysing company %s out of %s (%s)" %
                    (ii+1, Ncomp, company)))
            self.mkdir(self.company_directory + os.sep + company)

            # Handle each TCGA case separately
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print(brown("  ------- building TCGA %s sub directory" % tcga))

                # Read the results previously computed either
                try:
                    results_df = self.results[tcga].df.copy()
                except:
                    results_path = "%s/%s/OUTPUT/results.csv" % (self.main_directory, tcga)
                    results_df = ANOVAResults(results_path)


                # MAke sure the results are formatted correctly
                results = ANOVAResults(results_df)

                # Get the DrugDecode information for that company only
                drug_decode_company = self.drug_decode.df.query(
                        "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)

                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)

                # Filter the results to keep only public drugs and that
                # company. Make sure this is integers
                results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)

                mask = [True if x in drug_decode_company.df.index else False
                        for x in results.df.DRUG_ID]

                results.df = results.df.ix[mask]

                # We read the IC50 again
                try:
                    self.ic50 = IC50(self.ic50_filename)
                except:
                    self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

                # And create an ANOVA instance. This is not to do the analyse
                # again but to hold various information
                an = ANOVA(self.ic50, gf_filename, drug_decode_company,
                    verbose=False)

                def drug_to_keep(drug):
                    to_keep = drug in drug_decode_company.df.index
                    return to_keep
                an.ic50.df = an.ic50.df.select(drug_to_keep, axis=1)

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = self.company_directory + os.sep + company + os.sep + tcga
                an.settings.analysis_type = tcga

                # Now we create the report
                self.report = ANOVAReport(an, results,
                        drug_decode=drug_decode_company,
                        verbose=self.verbose)
                self.report.company = company
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)
                self.report.create_html_features()
                self.report.create_html_drugs()
                self.report.create_html_associations()

    def _get_tcga(self):
        return [x.split("_")[1].split(".")[0] for x in self.gf_filenames]
    tcga = property(_get_tcga)

    def _get_companies(self):
        return [x for x in self.drug_decode.companies if x != 'Commercial']
    companies = property(_get_companies)

    def create_summary_pages(self):
        """Create summary pages

        Once the main analyis is done (:meth:`analyse`), and the company
        packages have been created (:meth:`create_data_packages_for_companies`),
        you can run this method that will creade a summary HTML page
        (index.html) for the tissue, and a similar summary HTML page for the
        tissues of each company. Finally, an HTML summary page for the 
        companies is also created.

        The final tree direcorty looks like::


            |-- index.html
            |-- company_packages
            |   |-- index.html
            |   |-- Company1
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |   |-- Company2
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |-- tissue_packages
            |   |-- index.html
            |   |-- Tissue1
            |   |-- Tissue2


        """
        # First for the main directory (tissue_packages):
        print(purple("Creating summary index.html for the tissues"))
        self._create_summary_pages(self.main_directory, verbose=False)

        # Then for each companies:
        print(purple("Creating summary index.html for each company"))
        pb = Progress(len(self.companies))
        for i, company in enumerate(self.companies):
            try:
                self._create_summary_pages(self.company_directory + os.sep +
                    company, verbose=False, company=company)
            except Exception as err:
                print(red("Issue with %s. Continue with other companies" % company))
                print(err)
            pb.animate(i+1)

        # Finally, an index towards each company
        self._create_main_index()

    def _create_main_index(self):
        # We could also add a column with number of association ?
        companies = self.companies[:]
        df = pd.DataFrame({"Company": companies})
        html_page = ReportMain(directory=".",
                filename='index.html',
                template_filename='main_summary.html',
                mode="summary")
        html_table = HTMLTable(df)
        html_table.add_href('Company', newtab=True, url="company_packages/",
                suffix="/index.html")
        html_page.jinja['data_table'] =  html_table.to_html(collapse_table=False)
        html_page.jinja['analysis_domain'] =  "All companies / All "
        html_page.jinja['tissue_directory'] = self.main_directory
        html_page.write()

    def _create_summary_pages(self, main_directory, verbose=True,
            company=None):
        # Read all directories in tissue_packages

        directories = glob.glob(main_directory + os.sep + '*')

        summary = []
        for directory in sorted(directories):
            tcga = directory.split(os.sep)[-1]
            if tcga not in self.tcga:
                continue
            if verbose:
                print(directory, tcga)
            # number of hits
            path = directory + os.sep + 'OUTPUT' + os.sep
            try:
                hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
            except:
                summary.append([tcga] + [None] * 5)
                continue
            total_hits = hits.total.sum()

            drug_involved = hits['Unnamed: 0'].unique()

            results = ANOVAResults(path + 'results.csv')
            if len(results) > 0:
                drug_ids = results.df.DRUG_ID.unique()
            else:
                drug_ids = []

            # where to find the DRUG DECODE file. Should
            # have been copied
            path = directory + os.sep + 'INPUT' + os.sep
            drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
            info = drug_decode.get_info()

            webrelease = drug_decode.df.ix[drug_involved].WEBRELEASE
            drug_inv_public = sum(webrelease == 'Y')
            drug_inv_prop = sum(webrelease != 'Y')

            summary.append([tcga, total_hits,
                drug_inv_prop, info['N_prop'],
                drug_inv_public, info['N_public']])
        df = pd.DataFrame(summary)

        df.columns = ['Analysis name', 'Number of hits',
            'Number of involved proprietary compounds', 'out of',
            'Number of involved public', 'out of']

        try:
            df.sort_values(by="Number of hits", ascending=False, inplace=True)
        except:
            df.sort("Number of hits", ascending=False, inplace=True)

        output_dir = main_directory + os.sep + '..' + os.sep
        output_file = output_dir + os.sep + 'index.html'
        self.html_page = ReportMain(directory=main_directory,
                filename='index.html',
                template_filename='datapack_summary.html',
                mode="summary")

        # Let us use our HTMLTable to add the HTML references
        self.html_table = HTMLTable(df)
        self.html_table.add_href('Analysis name', newtab=True, url=None,
                suffix='/index.html')
        self.html_table.add_bgcolor('Number of hits')

        self.html_page.jinja['data_table'] =  self.html_table.to_html(
                collapse_table=False)
        if company:
            self.html_page.jinja["collaborator"] = company

        self.html_page.write()

        return df
示例#4
0
class GDSC(GDSCBase):
    """Alias to ANOVA class with default settings to loop over all TCGA Tissues
    and comnpanies.

    Reads

    1. Nf Genomic feature files for different TCGA types.
    2. a unique DRUG DECODER file
    3. a unique IC50 file

    and perform the Nf analysis saving results in appropriate files.

    Then split the data for each companies.

    It also converts tissue names into TCGA names.

    .. todo:: How to get the GF files

    First, create all main analysis that include all drugs::

        gg = GDSC('IC50_v18.csv', 'DRUG_DECODE.txt',
            genomic_feature_pattern='GF*csv')
        # identifies all genomic features GF* that contains specific TCGA GF
        gg.run() # This will take hours depending on the number of drugs.


        # On v18, On an i7 core using 1 CPU this tqkes about 1 hour.30 minutes
        # THe PANCAN data set is the largest and takes about 1 hour itself

    You should now have a directory called **ALL** with about 20 directories for
    each TCGA GF file. Keep that in a safe place or you will have to restart
    the analysis

    Second, split those data just created for each specific proprietary 
    compounds. For instance::

        gg.create_data_packages_for_companies(['AZ'])
    
    or for all in one go::

        gg.create_data_packages_for_companies()


    Third, create some summary pages::

        from gdsctools.gdsc import GDSCDirectorySummary()
        gs = GDSCDirectorySummary()
        gs.create_summary_pages('ALL')
        for company in gg.companies:
            gs.create_summary_pages(company)

    The last step is fast but the whole process of analyse and image 
    creation is very long. 



    """
    def __init__(self,
                 ic50,
                 drug_decode,
                 genomic_feature_pattern="GF_*csv",
                 mode='standard'):
        super(GDSC, self).__init__(genomic_feature_pattern, verbose=True)
        self.debug = False
        self.ic50_filename = ic50
        self.dd_filename = drug_decode

        if mode == 'v18':
            self.ic50 = IC50Cluster(ic50)
        else:
            self.ic50 = IC50(ic50)

        self.drug_decode = DrugDecode(drug_decode)
        self.settings = ANOVASettings()
        self.settings.low_memory = True

        if mode == 'v18':
            self.settings.FDR_threshold = 35

        print(
            "Those settings will be used (note that low_memory is set "
            "to True and check the value of FDR_threshold (set to 35 in v18)")
        print(self.settings)

        # figure out the cancer types:
        self.results = {}

    def run(self):
        self.mkdir('ALL')
        # First analyse all case of TCGA + PANCAN once for all and
        # store all results in a dictionary.
        self._analyse_all()

    def _analyse_all(self):
        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print('================================ Analysing %s data' % tcga)

            self.mkdir('ALL' + os.sep + tcga)

            # Computes the ANOVA
            an = ANOVA(self.ic50_filename, gf_filename, self.drug_decode)
            self.an = an
            an.settings = ANOVASettings(**self.settings)
            an.init()  # reset the analysis_type automatically
            results = an.anova_all()

            # Store the results
            self.results[tcga] = results

            print('Analysing %s data and creating images' % tcga)
            self.report = ANOVAReport(an, self.results[tcga])
            self.report.settings.savefig = True
            self.report.settings.directory = 'ALL/' + tcga
            self.report.settings.analysis_type = tcga
            self.report.create_html_pages()

    def create_data_packages_for_companies(self, companies=None):
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print("\n\n========= Analysing company %s out of %s (%s)" %
                  (ii + 1, Ncomp, company))
            self.mkdir(company)
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print("---------------- for TCGA %s" % tcga)

                # Read the results previously computed
                try:
                    results_df = self.results[tcga].df.copy()
                except:
                    results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                    print("Downloading results from %s" % results_path)
                    results_df = ANOVAResults(results_path)

                results = ANOVAResults(results_df)

                # Get a DrugDecode for that company
                drug_decode_company = self.drug_decode.df.query(
                    "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)

                # filter results using the new drug decode
                drug_ids_in_results = get_drug_id(results.df.DRUG_ID)

                mask = [
                    True if x in drug_decode_company.df.index else False
                    for x in drug_ids_in_results
                ]

                results.df = results.df.ix[mask]

                # Just to create an instance with the subset of drug_decode
                # and correct settings. This is also used to store
                # the entire input data set. So, we must remove all drugs
                # not relevant for the analysis of this company
                an = ANOVA(self.ic50_filename, gf_filename,
                           drug_decode_company)

                def drug_to_keep(drug):
                    to_keep = get_drug_id(drug) in drug_decode_company.df.index
                    return to_keep

                an.ic50.df = an.ic50.df.select(drug_to_keep, axis=1)

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = company + os.sep + tcga
                an.settings.analysis_type = tcga
                self.report = ANOVAReport(an, results)
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)

                if self.debug is False:
                    self.report.create_html_features()
                    self.report.create_html_associations()

                    # For now, we just copy all DRUG images from
                    # the analysis made in ALL
                    from easydev import shellcmd, Progress
                    print("\nCopying drug files")
                    drug_ids = results.df.DRUG_ID.unique()
                    pb = Progress(len(drug_ids))
                    for i, drug_id in enumerate(drug_ids):
                        # copy the HTML
                        filename = "%s.html" % drug_id
                        source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                        dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep)
                        cmd = "cp %s%s %s" % (source, filename, dest)
                        shellcmd(cmd, verbose=False)
                        #copy the images
                        filename = "volcano_%s.*" % drug_id
                        source = "ALL%s%s%simages%s" % (os.sep, tcga, os.sep,
                                                        os.sep)
                        dest = "%s%s%s%simages%s" % (company, os.sep, tcga,
                                                     os.sep, os.sep)
                        cmd = "cp %s%s %s" % (source, filename, dest)
                        shellcmd(cmd, verbose=False)
                        pb.animate(i + 1)

    def _get_tcga(self):
        return [x.split("_")[1].split(".")[0] for x in self.gf_filenames]

    tcga = property(_get_tcga)

    def _get_companies(self):
        return [x for x in self.drug_decode.companies if x != 'Commercial']

    companies = property(_get_companies)

    def create_summary_pages(self, main_directory='ALL'):
        # Read in ALL all directories

        # create directories and copy relevant files
        self.mkdir(main_directory + os.sep + 'images')
        self.mkdir(main_directory + os.sep + 'css')
        self.mkdir(main_directory + os.sep + 'js')
        from gdsctools import gdsctools_data
        for filename in ['gdsc.css', 'github-gist.css']:
            target = os.sep.join([main_directory, 'css', filename])
            if os.path.isfile(target) is False:
                filename = gdsctools_data(filename)
                shutil.copy(filename, target)
        for filename in ['highlight.pack.js']:
            target = os.sep.join([main_directory, 'js', filename])
            if os.path.isfile(target) is False:
                filename = gdsctools_data(filename)
                shutil.copy(filename, target)
        for filename in ['EBI_logo.png', 'sanger-logo.png']:
            target = os.sep.join([main_directory, 'images', filename])
            if os.path.isfile(target) is False:
                dire = 'data' + os.sep + 'images'
                filename = gdsctools_data("images" + os.sep + filename)
                shutil.copy(filename, target)
        directories = glob.glob('ALL' + os.sep + '*')
        directories = [x for x in directories if os.path.isdir(x)]

        summary = []
        for directory in sorted(directories):

            tcga = directory.split(os.sep)[1]
            if tcga in ['css', 'images']:
                continue

            # number of hits
            path = directory + os.sep + 'OUTPUT' + os.sep
            try:
                hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
            except:
                summary.append([tcga] + [None] * 5)
                continue
            total_hits = hits.total.sum()

            drug_involved = get_drug_id(hits['Unnamed: 0'].unique())

            results = ANOVAResults(path + 'results.csv')
            if len(results) > 0:
                drug_ids = get_drug_id(results.df.DRUG_ID.unique())
            else:
                drug_ids = []

            path = directory + os.sep + 'INPUT' + os.sep
            drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
            info = drug_decode.get_info()

            webrelease = drug_decode.df.ix[drug_involved].WEBRELEASE
            drug_inv_public = sum(webrelease == 'Y')
            drug_inv_prop = sum(webrelease != 'Y')

            summary.append([
                tcga, total_hits, drug_inv_prop, info['N_prop'],
                drug_inv_public, info['N_public']
            ])
        df = pd.DataFrame(summary)
        df.columns = [
            'Analysis name', 'Number of hits',
            'Number of involved proprietary compounds', 'out of',
            'Number of involved public', 'out of'
        ]

        # FIXME include css and images of logo
        # FIXME save in the proper directory
        output_dir = main_directory + os.sep + '..' + os.sep
        output_file = output_dir + os.sep + 'index.html'
        self.html_page = ReportMAIN(directory='ALL',
                                    filename='index.html',
                                    template_filename='datapack_summary.html')

        # Let us use our HTMLTable to add the HTML references
        from gdsctools.report import HTMLTable
        self.html_table = HTMLTable(df)
        self.html_table.add_href('Analysis name',
                                 newtab=True,
                                 url=None,
                                 suffix='/index.html')
        #html_table.add_bgcolor('Number of hits')

        self.html_page.jinja['data_table'] = self.html_table.to_html()
        self.html_page.write()

        return df

    def load_results(self):
        """Find the files results.csv in all TCGA directories"""
        for tcga in self.tcga:
            print(tcga)
            self.results[tcga] = ANOVAResults('ALL' + os.sep + tcga + os.sep +
                                              'OUTPUT' + os.sep +
                                              'results.csv')