class GDSC(GDSCBase):
    """Wrapper of the :class:`~gdsctools.anova.ANOVA` class and reports to
    analyse all TCGA tissues and companies automatically while creating
    summary HTML pages.

    First, one needs to provide a unique IC50 file. Second, the DrugDecode
    file (see :class:`~gdsctools.readers.DrugDecode`) must be provided with
    the DRUG identifiers and their corresponding names. Third, a set of
    genomic feature files must be provided for each :term:`TCGA` tissue.

    You then create a GDSC instance::

        from gdsctools import GDSC
        gg = GDSC('IC50_v18.csv', 'DRUG_DECODE.txt',
            genomic_feature_pattern='GF*csv')

    At that stage you may want to change the settings, e.g::

        gg.settings.FDR_threshold = 20

    Then run the analysis::

        gg.analyse()

    This will launch an ANOVA analysis for each TCGA tissue + PANCAN case
    if provided. This will also create a data package for each tissue.
    The data packages are stored in the ./tissue_packages directory.

    Since all private and public drugs are stored together, the next
    step is to create data packages for each company::

        gg.create_data_packages_for_companies()

    You may select a specific one if you wish::

        gg.create_data_packages_for_companies(['AZ'])

    Finally, create some summary pages::

        gg.create_summary_pages()

    Your entry point is an HTML file called **index.html**.
    """

    def __init__(self, ic50, drug_decode, genomic_feature_pattern="GF_*csv",
                 main_directory="tissue_packages", verbose=True):
        """.. rubric:: Constructor

        :param ic50: filename of an :class:`~gdsctools.readers.IC50` file
            (must be a string).
        :param drug_decode: an :class:`~gdsctools.readers.DrugDecode` file.
        :param genomic_feature_pattern: a glob to a set of
            :class:`~gdsctools.readers.GenomicFeature` files.
        :param main_directory: directory where the tissue data packages
            are written.
        :param verbose: forwarded to the base class.
        """
        super(GDSC, self).__init__(genomic_feature_pattern, verbose=verbose)
        # The IC50 file is re-read for every tissue, so a filename (not an
        # already loaded object) is required.
        assert isinstance(ic50, str)
        self.ic50_filename = ic50
        self.dd_filename = drug_decode
        self.main_directory = main_directory

        self.settings = ANOVASettings()
        self.settings.animate = False
        self.drug_decode = DrugDecode(drug_decode)

        print("Those settings will be used (check FDR_threshold)")
        print(self.settings)

        # results of each analysis, keyed by TCGA name
        self.results = {}
        self.company_directory = "company_packages"

        # quick test on 15 features
        self.test = False

    @staticmethod
    def _tcga_from_filename(gf_filename):
        """Return the TCGA label found in a genomic feature filename.

        The label is the second underscore-separated field of the
        filename, truncated at its first dot.
        """
        return gf_filename.split("_")[1].split(".")[0]

    def _load_ic50(self, announce=False):
        """Read the IC50 file, falling back on :class:`IC50Cluster`.

        :param bool announce: print a message when the fallback is used.
        :return: the object built from :attr:`ic50_filename`.
        """
        try:
            return IC50(self.ic50_filename)
        except Exception:
            if announce:
                print("Clustering IC50 (v18 released data ?)")
            return IC50Cluster(self.ic50_filename, verbose=False)

    def analyse(self, multicore=None):
        """Launch the ANOVA analysis and create a data package per tissue.

        Reports are created but the HTML pages are not opened.

        :param multicore: forwarded to ``ANOVA.anova_all``.
        """
        self.mkdir(self.main_directory)
        # First analyse all TCGA cases + PANCAN once for all and
        # store all the results in a dictionary.
        self._analyse_all(multicore=multicore)

    def _analyse_all(self, multicore=None):
        """Run the ANOVA and build the HTML report for each GF file."""
        for gf_filename in sorted(self.gf_filenames):
            tcga = self._tcga_from_filename(gf_filename)
            print(purple('======================== Analysing %s data' % tcga))

            self.mkdir(self.main_directory + os.sep + tcga)

            # Computes the ANOVA
            self.ic50 = self._load_ic50(announce=True)
            an = ANOVA(self.ic50, gf_filename, self.drug_decode,
                       verbose=False)

            if self.test is True:
                an.features.df = an.features.df[an.features.df.columns[0:15]]

            self.an = an
            an.settings = ANOVASettings(**self.settings)
            an.settings.analysis_type = tcga
            an.init()  # This resets the directory

            results = an.anova_all(multicore=multicore)
            # Must be set after init(), which resets the directory.
            an.settings.directory = self.main_directory + os.sep + tcga

            # Store the results
            self.results[tcga] = results

            print('Analysing %s data and creating images' % tcga)
            self.report = ANOVAReport(an)
            self.report.settings.savefig = True
            self.report.create_html_pages(onweb=False)

    def create_data_packages_for_companies(self, companies=None):
        """Creates a data package for each company found in the DrugDecode
        file.

        :param companies: one company name (a string) or a list of names.
            By default, all companies found in the DrugDecode file.
        :raises ValueError: if no company is available.

        .. warning:: DRUG_DECODE and IC50 inputs must be filtered to keep
            only WEBRELEASE=Y and owner.
        """
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        if len(companies) == 0:
            raise ValueError(
                "Could not find any companies in the DrugDecode file")

        # The main directory
        self.mkdir(self.company_directory)

        # Loop over all companies, retrieving information built
        # in analyse() method, selecting for each TCGA all information
        # for that company only (and public drugs)
        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print(purple("\n=========== Analysing company %s out of %s (%s)" %
                         (ii + 1, Ncomp, company)))
            self.mkdir(self.company_directory + os.sep + company)

            # Handle each TCGA case separately
            for gf_filename in sorted(self.gf_filenames):
                tcga = self._tcga_from_filename(gf_filename)
                print(brown("  ------- building TCGA %s sub directory" % tcga))

                # Read the results previously computed, either from memory
                # or from the files saved by analyse()
                try:
                    results_df = self.results[tcga].df.copy()
                except (KeyError, AttributeError):
                    results_path = "%s/%s/OUTPUT/results.csv" % (
                        self.main_directory, tcga)
                    results_df = ANOVAResults(results_path)

                # Make sure the results are formatted correctly
                results = ANOVAResults(results_df)

                # Get the DrugDecode information for that company only
                drug_decode_company = self.drug_decode.df.query(
                    "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)

                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)
                allowed = drug_decode_company.df.index

                # Filter the results to keep only public drugs and that
                # company. Make sure this is integers
                results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)
                # Boolean indexing instead of the removed ``.ix`` indexer.
                results.df = results.df[results.df["DRUG_ID"].isin(allowed)]

                # We read the IC50 again since it is filtered below
                self.ic50 = self._load_ic50()

                # And create an ANOVA instance. This is not to do the analyse
                # again but to hold various information
                an = ANOVA(self.ic50, gf_filename, drug_decode_company,
                           verbose=False)

                # Keep only the IC50 columns of the selected drugs
                # (``DataFrame.select`` no longer exists in recent pandas).
                keep = [drug in allowed for drug in an.ic50.df.columns]
                an.ic50.df = an.ic50.df.loc[:, keep]

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = (self.company_directory + os.sep +
                                         company + os.sep + tcga)
                an.settings.analysis_type = tcga

                # Now we create the report
                self.report = ANOVAReport(an, results,
                                          drug_decode=drug_decode_company,
                                          verbose=self.verbose)
                self.report.company = company
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)
                self.report.create_html_features()
                self.report.create_html_drugs()
                self.report.create_html_associations()

    def _get_tcga(self):
        return [self._tcga_from_filename(x) for x in self.gf_filenames]
    tcga = property(_get_tcga,
                    doc="TCGA names extracted from the GF filenames")

    def _get_companies(self):
        return [x for x in self.drug_decode.companies if x != 'Commercial']
    companies = property(_get_companies,
                         doc="Companies of the DrugDecode file "
                             "('Commercial' excluded)")

    def create_summary_pages(self):
        """Create summary pages

        Once the main analysis is done (:meth:`analyse`), and the company
        packages have been created
        (:meth:`create_data_packages_for_companies`), you can run this
        method that will create a summary HTML page (index.html) for the
        tissues, and a similar summary HTML page for the tissues of each
        company. Finally, an HTML summary page for the companies is also
        created.

        The final directory tree looks like::

            |-- index.html
            |-- company_packages
            |   |-- index.html
            |   |-- Company1
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |   |-- Company2
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |-- tissue_packages
            |   |-- index.html
            |   |-- Tissue1
            |   |-- Tissue2

        """
        # First for the main directory (tissue_packages):
        print(purple("Creating summary index.html for the tissues"))
        self._create_summary_pages(self.main_directory, verbose=False)

        # Then for each company:
        print(purple("Creating summary index.html for each company"))
        pb = Progress(len(self.companies))
        for i, company in enumerate(self.companies):
            # Best effort: a failure for one company must not prevent the
            # others from being processed.
            try:
                self._create_summary_pages(
                    self.company_directory + os.sep + company,
                    verbose=False, company=company)
            except Exception as err:
                print(red("Issue with %s. Continue with other companies" %
                          company))
                print(err)
            pb.animate(i + 1)

        # Finally, an index towards each company
        self._create_main_index()

    def _create_main_index(self):
        """Write the top-level index.html that links to each company."""
        # We could also add a column with number of association ?
        companies = self.companies[:]
        df = pd.DataFrame({"Company": companies})
        html_page = ReportMain(directory=".", filename='index.html',
                               template_filename='main_summary.html',
                               mode="summary")

        html_table = HTMLTable(df)
        html_table.add_href('Company', newtab=True,
                            url="company_packages/", suffix="/index.html")

        html_page.jinja['data_table'] = html_table.to_html(
            collapse_table=False)
        html_page.jinja['analysis_domain'] = "All companies / All "
        html_page.jinja['tissue_directory'] = self.main_directory
        html_page.write()

    def _create_summary_pages(self, main_directory, verbose=True,
                              company=None):
        """Write ``index.html`` summarising the tissues of a directory.

        :param main_directory: directory that contains one sub-directory
            per TCGA tissue.
        :param bool verbose: print each directory being processed.
        :param company: if provided, stored as ``collaborator`` in the
            template variables.
        :return: the summary dataframe (one row per tissue).
        """
        columns = ['Analysis name', 'Number of hits',
                   'Number of involved proprietary compounds', 'out of',
                   'Number of involved public', 'out of']

        # Read all directories in main_directory
        directories = glob.glob(main_directory + os.sep + '*')

        summary = []
        for directory in sorted(directories):
            tcga = directory.split(os.sep)[-1]
            if tcga not in self.tcga:
                continue
            if verbose:
                print(directory, tcga)

            # number of hits
            path = directory + os.sep + 'OUTPUT' + os.sep
            try:
                hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
            except Exception:
                # No (readable) summary for that tissue: keep an empty row
                summary.append([tcga] + [None] * 5)
                continue
            total_hits = hits.total.sum()
            drug_involved = hits['Unnamed: 0'].unique()

            # Not used below; still loaded so that a missing or unreadable
            # results.csv is reported here as before.
            ANOVAResults(path + 'results.csv')

            # where to find the DRUG DECODE file. Should
            # have been copied
            path = directory + os.sep + 'INPUT' + os.sep
            drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
            info = drug_decode.get_info()

            # Label-based lookup (replaces the removed ``.ix`` indexer);
            # unknown drugs yield NaN and are hence counted as non public.
            webrelease = drug_decode.df.reindex(drug_involved).WEBRELEASE
            drug_inv_public = sum(webrelease == 'Y')
            drug_inv_prop = sum(webrelease != 'Y')

            summary.append([tcga, total_hits, drug_inv_prop, info['N_prop'],
                            drug_inv_public, info['N_public']])

        # Passing the columns to the constructor also works when no tissue
        # was found (empty summary).
        df = pd.DataFrame(summary, columns=columns)

        try:
            df.sort_values(by="Number of hits", ascending=False,
                           inplace=True)
        except AttributeError:
            # old pandas versions without sort_values
            df.sort("Number of hits", ascending=False, inplace=True)

        self.html_page = ReportMain(directory=main_directory,
                                    filename='index.html',
                                    template_filename='datapack_summary.html',
                                    mode="summary")

        # Let us use our HTMLTable to add the HTML references
        self.html_table = HTMLTable(df)
        self.html_table.add_href('Analysis name', newtab=True, url=None,
                                 suffix='/index.html')
        self.html_table.add_bgcolor('Number of hits')

        self.html_page.jinja['data_table'] = self.html_table.to_html(
            collapse_table=False)
        if company:
            self.html_page.jinja["collaborator"] = company
        self.html_page.write()

        return df
class GDSC(GDSCBase):
    """Alias to ANOVA class with default settings to loop over all TCGA
    tissues and companies.

    Reads

    1. Nf genomic feature files for different TCGA types.
    2. a unique DRUG DECODER file
    3. a unique IC50 file

    and performs the Nf analyses, saving results in appropriate files.
    Then splits the data for each company.

    .. todo:: How to get the GF files

    First, create all main analyses that include all drugs::

        gg = GDSC('IC50_v18.csv', 'DRUG_DECODE.txt',
            genomic_feature_pattern='GF*csv')
        # identifies all genomic features GF* that contains specific TCGA GF
        gg.run()
        # This will take hours depending on the number of drugs.
        # On v18, on an i7 core using 1 CPU this takes about 1 hour 30
        # minutes. The PANCAN data set is the largest and takes about
        # 1 hour itself

    You should now have a directory called **ALL** with one directory for
    each TCGA GF file. Keep that in a safe place or you will have to
    restart the analysis.

    Second, split those data just created for each specific proprietary
    compounds. For instance::

        gg.create_data_packages_for_companies(['AZ'])

    or for all in one go::

        gg.create_data_packages_for_companies()

    Third, create some summary pages::

        gg.create_summary_pages('ALL')
        for company in gg.companies:
            gg.create_summary_pages(company)

    The last step is fast but the whole process of analysis and image
    creation is very long.
    """

    def __init__(self, ic50, drug_decode, genomic_feature_pattern="GF_*csv",
                 mode='standard'):
        """.. rubric:: Constructor

        :param ic50: filename of the IC50 file.
        :param drug_decode: filename of the DrugDecode file.
        :param genomic_feature_pattern: a glob to a set of genomic feature
            files.
        :param mode: if set to 'v18', the IC50 file is read with
            :class:`IC50Cluster` and the FDR threshold is set to 35.
        """
        super(GDSC, self).__init__(genomic_feature_pattern, verbose=True)
        self.debug = False
        self.ic50_filename = ic50
        self.dd_filename = drug_decode

        if mode == 'v18':
            self.ic50 = IC50Cluster(ic50)
        else:
            self.ic50 = IC50(ic50)

        self.drug_decode = DrugDecode(drug_decode)

        self.settings = ANOVASettings()
        self.settings.low_memory = True
        if mode == 'v18':
            self.settings.FDR_threshold = 35

        print("Those settings will be used (note that low_memory is set "
              "to True and check the value of FDR_threshold (set to 35 in v18)")
        print(self.settings)

        # results of each analysis, keyed by TCGA name
        self.results = {}

    def run(self):
        """Analyse all genomic feature files; outputs go to ./ALL."""
        self.mkdir('ALL')
        # First analyse all case of TCGA + PANCAN once for all and
        # store all results in a dictionary.
        self._analyse_all()

    def _analyse_all(self):
        """Run the ANOVA and build the HTML report for each GF file."""
        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print('================================ Analysing %s data' % tcga)

            self.mkdir('ALL' + os.sep + tcga)

            # Computes the ANOVA
            # NOTE(review): the IC50 filename (not self.ic50) is given to
            # ANOVA here -- confirm this is intended in 'v18' mode.
            an = ANOVA(self.ic50_filename, gf_filename, self.drug_decode)
            self.an = an
            an.settings = ANOVASettings(**self.settings)
            an.init()  # reset the analysis_type automatically

            results = an.anova_all()

            # Store the results
            self.results[tcga] = results

            print('Analysing %s data and creating images' % tcga)
            self.report = ANOVAReport(an, self.results[tcga])
            self.report.settings.savefig = True
            self.report.settings.directory = 'ALL/' + tcga
            self.report.settings.analysis_type = tcga
            self.report.create_html_pages()

    def create_data_packages_for_companies(self, companies=None):
        """Create a data package (one directory) for each company.

        :param companies: one company name (a string) or a list of names.
            By default, all companies found in the DrugDecode file.

        .. warning:: DRUG_DECODE and IC50 inputs must be filtered to keep
            only WEBRELEASE=Y and owner.
        """
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print("\n\n========= Analysing company %s out of %s (%s)" %
                  (ii+1, Ncomp, company))
            self.mkdir(company)
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print("---------------- for TCGA %s" % tcga)

                # Read the results previously computed
                try:
                    results_df = self.results[tcga].df.copy()
                except (KeyError, AttributeError):
                    results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                    print("Downloading results from %s" % results_path)
                    results_df = ANOVAResults(results_path)

                results = ANOVAResults(results_df)

                # Get a DrugDecode for that company
                drug_decode_company = self.drug_decode.df.query(
                    "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)
                allowed = drug_decode_company.df.index

                # filter results using the new drug decode
                # (boolean indexing instead of the removed ``.ix`` indexer)
                drug_ids_in_results = get_drug_id(results.df.DRUG_ID)
                mask = [x in allowed for x in drug_ids_in_results]
                results.df = results.df[mask]

                # Just to create an instance with the subset of drug_decode
                # and correct settings. This is also used to store
                # the entire input data set. So, we must remove all drugs
                # not relevant for the analysis of this company
                an = ANOVA(self.ic50_filename, gf_filename,
                           drug_decode_company)

                # ``DataFrame.select`` no longer exists in recent pandas
                keep = [get_drug_id(drug) in allowed
                        for drug in an.ic50.df.columns]
                an.ic50.df = an.ic50.df.loc[:, keep]

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = company + os.sep + tcga
                an.settings.analysis_type = tcga

                self.report = ANOVAReport(an, results)
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)
                # NOTE(review): the copy step below is assumed to belong to
                # the non-debug branch -- confirm against the history.
                if self.debug is False:
                    self.report.create_html_features()
                    self.report.create_html_associations()

                    # For now, we just copy all DRUG images from
                    # the analysis made in ALL
                    from easydev import shellcmd, Progress
                    print("\nCopying drug files")
                    drug_ids = results.df.DRUG_ID.unique()
                    pb = Progress(len(drug_ids))
                    for i, drug_id in enumerate(drug_ids):
                        # copy the HTML
                        filename = "%s.html" % drug_id
                        source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                        dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep)
                        cmd = "cp %s%s %s" % (source, filename, dest)
                        shellcmd(cmd, verbose=False)
                        # copy the images
                        filename = "volcano_%s.*" % drug_id
                        source = "ALL%s%s%simages%s" % (os.sep, tcga,
                                                        os.sep, os.sep)
                        dest = "%s%s%s%simages%s" % (company, os.sep,
                                                     tcga, os.sep, os.sep)
                        cmd = "cp %s%s %s" % (source, filename, dest)
                        shellcmd(cmd, verbose=False)
                        pb.animate(i+1)

    def _get_tcga(self):
        return [x.split("_")[1].split(".")[0] for x in self.gf_filenames]
    tcga = property(_get_tcga,
                    doc="TCGA names extracted from the GF filenames")

    def _get_companies(self):
        return [x for x in self.drug_decode.companies if x != 'Commercial']
    companies = property(_get_companies,
                         doc="Companies of the DrugDecode file "
                             "('Commercial' excluded)")

    def create_summary_pages(self, main_directory='ALL'):
        """Write ``index.html`` summarising the analyses of a directory.

        The css, js and images resources required by the page are copied
        into *main_directory* if not already present.

        :param main_directory: directory that contains one sub-directory
            per analysis ('ALL' or a company directory).
        :return: the summary dataframe (one row per analysis).
        """
        # create directories and copy relevant files
        self.mkdir(main_directory + os.sep + 'images')
        self.mkdir(main_directory + os.sep + 'css')
        self.mkdir(main_directory + os.sep + 'js')

        from gdsctools import gdsctools_data
        # (sub-directory, location prefix inside the shared data, filenames)
        resources = [
            ('css', '', ['gdsc.css', 'github-gist.css']),
            ('js', '', ['highlight.pack.js']),
            ('images', 'images' + os.sep,
             ['EBI_logo.png', 'sanger-logo.png']),
        ]
        for subdir, prefix, filenames in resources:
            for filename in filenames:
                target = os.sep.join([main_directory, subdir, filename])
                if os.path.isfile(target) is False:
                    shutil.copy(gdsctools_data(prefix + filename), target)

        # Scan the requested directory (it used to be hard-coded to 'ALL')
        directories = glob.glob(main_directory + os.sep + '*')
        directories = [x for x in directories if os.path.isdir(x)]

        columns = ['Analysis name', 'Number of hits',
                   'Number of involved proprietary compounds', 'out of',
                   'Number of involved public', 'out of']

        summary = []
        for directory in sorted(directories):
            tcga = os.path.basename(directory)
            # resource directories created above are not analyses
            if tcga in ['css', 'images', 'js']:
                continue

            # number of hits
            path = directory + os.sep + 'OUTPUT' + os.sep
            try:
                hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
            except Exception:
                # No (readable) summary for that analysis: keep an empty row
                summary.append([tcga] + [None] * 5)
                continue
            total_hits = hits.total.sum()

            drug_involved = get_drug_id(hits['Unnamed: 0'].unique())

            # Not used below; still loaded so that a missing or unreadable
            # results.csv is reported here as before.
            ANOVAResults(path + 'results.csv')

            path = directory + os.sep + 'INPUT' + os.sep
            drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
            info = drug_decode.get_info()

            # Label-based lookup (replaces the removed ``.ix`` indexer);
            # unknown drugs yield NaN and are hence counted as non public.
            webrelease = drug_decode.df.reindex(drug_involved).WEBRELEASE
            drug_inv_public = sum(webrelease == 'Y')
            drug_inv_prop = sum(webrelease != 'Y')

            summary.append([tcga, total_hits, drug_inv_prop, info['N_prop'],
                            drug_inv_public, info['N_public']])

        # Passing the columns to the constructor also works when no
        # analysis was found (empty summary).
        df = pd.DataFrame(summary, columns=columns)

        # FIXME include css and images of logo
        self.html_page = ReportMAIN(directory=main_directory,
                                    filename='index.html',
                                    template_filename='datapack_summary.html')

        # Let us use our HTMLTable to add the HTML references
        from gdsctools.report import HTMLTable
        self.html_table = HTMLTable(df)
        self.html_table.add_href('Analysis name', newtab=True, url=None,
                                 suffix='/index.html')
        self.html_page.jinja['data_table'] = self.html_table.to_html()
        self.html_page.write()
        return df

    def load_results(self):
        """Find the files results.csv in all TCGA directories"""
        for tcga in self.tcga:
            print(tcga)
            self.results[tcga] = ANOVAResults(
                'ALL' + os.sep + tcga + os.sep + 'OUTPUT' + os.sep +
                'results.csv')
class GDSC(GDSCBase):
    """Wrapper of the :class:`~gdsctools.anova.ANOVA` class and reports to
    analyse all TCGA tissues and companies automatically while creating
    summary HTML pages.

    First, one needs to provide a unique IC50 file. Second, the DrugDecode
    file (see :class:`~gdsctools.readers.DrugDecode`) must be provided with
    the DRUG identifiers and their corresponding names. Third, a set of
    genomic feature files must be provided for each :term:`TCGA` tissue.

    You then create a GDSC instance::

        from gdsctools import GDSC
        gg = GDSC('IC50_v18.csv', 'DRUG_DECODE.txt',
            genomic_feature_pattern='GF*csv')

    At that stage you may want to change the settings, e.g::

        gg.settings.FDR_threshold = 20

    Then run the analysis::

        gg.analyse()

    This will launch an ANOVA analysis for each TCGA tissue + PANCAN case
    if provided. This will also create a data package for each tissue.
    The data packages are stored in the ./tissue_packages directory.

    Since all private and public drugs are stored together, the next
    step is to create data packages for each company::

        gg.create_data_packages_for_companies()

    You may select a specific one if you wish::

        gg.create_data_packages_for_companies(['AZ'])

    Finally, create some summary pages::

        gg.create_summary_pages()

    Your entry point is an HTML file called **index.html**.
    """

    def __init__(self, ic50, drug_decode, genomic_feature_pattern="GF_*csv",
                 main_directory="tissue_packages", verbose=True):
        """.. rubric:: Constructor

        :param ic50: filename of an :class:`~gdsctools.readers.IC50` file
            (must be a string).
        :param drug_decode: an :class:`~gdsctools.readers.DrugDecode` file.
        :param genomic_feature_pattern: a glob to a set of
            :class:`~gdsctools.readers.GenomicFeature` files.
        :param main_directory: directory where the tissue data packages
            are written.
        :param verbose: forwarded to the base class.
        """
        super(GDSC, self).__init__(genomic_feature_pattern, verbose=verbose)
        # The IC50 file is re-read for every tissue, so a filename (not an
        # already loaded object) is required.
        assert isinstance(ic50, str)
        self.ic50_filename = ic50
        self.dd_filename = drug_decode
        self.main_directory = main_directory

        self.settings = ANOVASettings()
        self.settings.animate = False
        self.drug_decode = DrugDecode(drug_decode)

        print("Those settings will be used (check FDR_threshold)")
        print(self.settings)

        # results of each analysis, keyed by TCGA name
        self.results = {}
        self.company_directory = "company_packages"

        # quick test on 15 features
        self.test = False

    @staticmethod
    def _tcga_from_filename(gf_filename):
        """Return the TCGA label found in a genomic feature filename.

        The label is the second underscore-separated field of the
        filename, truncated at its first dot.
        """
        return gf_filename.split("_")[1].split(".")[0]

    def _load_ic50(self, announce=False):
        """Read the IC50 file, falling back on :class:`IC50Cluster`.

        :param bool announce: print a message when the fallback is used.
        :return: the object built from :attr:`ic50_filename`.
        """
        try:
            return IC50(self.ic50_filename)
        except Exception:
            if announce:
                print("Clustering IC50 (v18 released data ?)")
            return IC50Cluster(self.ic50_filename, verbose=False)

    def analyse(self, multicore=None):
        """Launch the ANOVA analysis and create a data package per tissue.

        Reports are created but the HTML pages are not opened.

        :param multicore: forwarded to ``ANOVA.anova_all``.
        """
        self.mkdir(self.main_directory)
        # First analyse all TCGA cases + PANCAN once for all and
        # store all the results in a dictionary.
        self._analyse_all(multicore=multicore)

    def _analyse_all(self, multicore=None):
        """Run the ANOVA and build the HTML report for each GF file."""
        for gf_filename in sorted(self.gf_filenames):
            tcga = self._tcga_from_filename(gf_filename)
            print(purple('======================== Analysing %s data' % tcga))

            self.mkdir(self.main_directory + os.sep + tcga)

            # Computes the ANOVA
            self.ic50 = self._load_ic50(announce=True)
            an = ANOVA(self.ic50, gf_filename, self.drug_decode,
                       verbose=False)

            if self.test is True:
                an.features.df = an.features.df[an.features.df.columns[0:15]]

            self.an = an
            an.settings = ANOVASettings(**self.settings)
            an.settings.analysis_type = tcga
            an.init()  # This resets the directory

            results = an.anova_all(multicore=multicore)
            # Must be set after init(), which resets the directory.
            an.settings.directory = self.main_directory + os.sep + tcga

            # Store the results
            self.results[tcga] = results

            print('Analysing %s data and creating images' % tcga)
            self.report = ANOVAReport(an)
            self.report.settings.savefig = True
            self.report.create_html_pages(onweb=False)

    def create_data_packages_for_companies(self, companies=None):
        """Creates a data package for each company found in the DrugDecode
        file.

        :param companies: one company name (a string) or a list of names.
            By default, all companies found in the DrugDecode file.
        :raises ValueError: if no company is available.

        .. warning:: DRUG_DECODE and IC50 inputs must be filtered to keep
            only WEBRELEASE=Y and owner.
        """
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        if len(companies) == 0:
            raise ValueError(
                "Could not find any companies in the DrugDecode file")

        # The main directory
        self.mkdir(self.company_directory)

        # Loop over all companies, retrieving information built
        # in analyse() method, selecting for each TCGA all information
        # for that company only (and public drugs)
        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print(purple("\n=========== Analysing company %s out of %s (%s)" %
                         (ii + 1, Ncomp, company)))
            self.mkdir(self.company_directory + os.sep + company)

            # Handle each TCGA case separately
            for gf_filename in sorted(self.gf_filenames):
                tcga = self._tcga_from_filename(gf_filename)
                print(brown("  ------- building TCGA %s sub directory" % tcga))

                # Read the results previously computed, either from memory
                # or from the files saved by analyse()
                try:
                    results_df = self.results[tcga].df.copy()
                except (KeyError, AttributeError):
                    results_path = "%s/%s/OUTPUT/results.csv" % (
                        self.main_directory, tcga)
                    results_df = ANOVAResults(results_path)

                # Make sure the results are formatted correctly
                results = ANOVAResults(results_df)

                # Get the DrugDecode information for that company only
                drug_decode_company = self.drug_decode.df.query(
                    "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)

                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)
                allowed = drug_decode_company.df.index

                # Filter the results to keep only public drugs and that
                # company. Make sure this is integers
                results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)
                # Boolean indexing instead of the removed ``.ix`` indexer.
                results.df = results.df[results.df["DRUG_ID"].isin(allowed)]

                # We read the IC50 again since it is filtered below
                self.ic50 = self._load_ic50()

                # And create an ANOVA instance. This is not to do the analyse
                # again but to hold various information
                an = ANOVA(self.ic50, gf_filename, drug_decode_company,
                           verbose=False)

                # Keep only the IC50 columns of the selected drugs
                # (``DataFrame.select`` no longer exists in recent pandas).
                keep = [drug in allowed for drug in an.ic50.df.columns]
                an.ic50.df = an.ic50.df.loc[:, keep]

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = (self.company_directory + os.sep +
                                         company + os.sep + tcga)
                an.settings.analysis_type = tcga

                # Now we create the report
                self.report = ANOVAReport(an, results,
                                          drug_decode=drug_decode_company,
                                          verbose=self.verbose)
                self.report.company = company
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)
                self.report.create_html_features()
                self.report.create_html_drugs()
                self.report.create_html_associations()

    def _get_tcga(self):
        return [self._tcga_from_filename(x) for x in self.gf_filenames]
    tcga = property(_get_tcga,
                    doc="TCGA names extracted from the GF filenames")

    def _get_companies(self):
        return [x for x in self.drug_decode.companies if x != 'Commercial']
    companies = property(_get_companies,
                         doc="Companies of the DrugDecode file "
                             "('Commercial' excluded)")

    def create_summary_pages(self):
        """Create summary pages

        Once the main analysis is done (:meth:`analyse`), and the company
        packages have been created
        (:meth:`create_data_packages_for_companies`), you can run this
        method that will create a summary HTML page (index.html) for the
        tissues, and a similar summary HTML page for the tissues of each
        company. Finally, an HTML summary page for the companies is also
        created.

        The final directory tree looks like::

            |-- index.html
            |-- company_packages
            |   |-- index.html
            |   |-- Company1
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |   |-- Company2
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |-- tissue_packages
            |   |-- index.html
            |   |-- Tissue1
            |   |-- Tissue2

        """
        # First for the main directory (tissue_packages):
        print(purple("Creating summary index.html for the tissues"))
        self._create_summary_pages(self.main_directory, verbose=False)

        # Then for each company:
        print(purple("Creating summary index.html for each company"))
        pb = Progress(len(self.companies))
        for i, company in enumerate(self.companies):
            # Best effort: a failure for one company must not prevent the
            # others from being processed.
            try:
                self._create_summary_pages(
                    self.company_directory + os.sep + company,
                    verbose=False, company=company)
            except Exception as err:
                print(red("Issue with %s. Continue with other companies" %
                          company))
                print(err)
            pb.animate(i + 1)

        # Finally, an index towards each company
        self._create_main_index()

    def _create_main_index(self):
        """Write the top-level index.html that links to each company."""
        # We could also add a column with number of association ?
        companies = self.companies[:]
        df = pd.DataFrame({"Company": companies})
        html_page = ReportMain(directory=".", filename='index.html',
                               template_filename='main_summary.html',
                               mode="summary")

        html_table = HTMLTable(df)
        html_table.add_href('Company', newtab=True,
                            url="company_packages/", suffix="/index.html")

        html_page.jinja['data_table'] = html_table.to_html(
            collapse_table=False)
        html_page.jinja['analysis_domain'] = "All companies / All "
        html_page.jinja['tissue_directory'] = self.main_directory
        html_page.write()

    def _create_summary_pages(self, main_directory, verbose=True,
                              company=None):
        """Write ``index.html`` summarising the tissues of a directory.

        :param main_directory: directory that contains one sub-directory
            per TCGA tissue.
        :param bool verbose: print each directory being processed.
        :param company: if provided, stored as ``collaborator`` in the
            template variables.
        :return: the summary dataframe (one row per tissue).
        """
        columns = ['Analysis name', 'Number of hits',
                   'Number of involved proprietary compounds', 'out of',
                   'Number of involved public', 'out of']

        # Read all directories in main_directory
        directories = glob.glob(main_directory + os.sep + '*')

        summary = []
        for directory in sorted(directories):
            tcga = directory.split(os.sep)[-1]
            if tcga not in self.tcga:
                continue
            if verbose:
                print(directory, tcga)

            # number of hits
            path = directory + os.sep + 'OUTPUT' + os.sep
            try:
                hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
            except Exception:
                # No (readable) summary for that tissue: keep an empty row
                summary.append([tcga] + [None] * 5)
                continue
            total_hits = hits.total.sum()
            drug_involved = hits['Unnamed: 0'].unique()

            # Not used below; still loaded so that a missing or unreadable
            # results.csv is reported here as before.
            ANOVAResults(path + 'results.csv')

            # where to find the DRUG DECODE file. Should
            # have been copied
            path = directory + os.sep + 'INPUT' + os.sep
            drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
            info = drug_decode.get_info()

            # Label-based lookup (replaces the removed ``.ix`` indexer);
            # unknown drugs yield NaN and are hence counted as non public.
            webrelease = drug_decode.df.reindex(drug_involved).WEBRELEASE
            drug_inv_public = sum(webrelease == 'Y')
            drug_inv_prop = sum(webrelease != 'Y')

            summary.append([tcga, total_hits, drug_inv_prop, info['N_prop'],
                            drug_inv_public, info['N_public']])

        # Passing the columns to the constructor also works when no tissue
        # was found (empty summary).
        df = pd.DataFrame(summary, columns=columns)

        try:
            df.sort_values(by="Number of hits", ascending=False,
                           inplace=True)
        except AttributeError:
            # old pandas versions without sort_values
            df.sort("Number of hits", ascending=False, inplace=True)

        self.html_page = ReportMain(directory=main_directory,
                                    filename='index.html',
                                    template_filename='datapack_summary.html',
                                    mode="summary")

        # Let us use our HTMLTable to add the HTML references
        self.html_table = HTMLTable(df)
        self.html_table.add_href('Analysis name', newtab=True, url=None,
                                 suffix='/index.html')
        self.html_table.add_bgcolor('Number of hits')

        self.html_page.jinja['data_table'] = self.html_table.to_html(
            collapse_table=False)
        if company:
            self.html_page.jinja["collaborator"] = company
        self.html_page.write()

        return df
class GDSC(GDSCBase):
    """Alias to ANOVA class with default settings to loop over all TCGA
    tissues and companies.

    Reads

    1. Nf genomic feature files for different TCGA types,
    2. a unique DRUG DECODE file,
    3. a unique IC50 file,

    and performs the Nf analyses, saving the results in one directory per
    TCGA type. The data can then be split for each company. The TCGA name of
    an analysis is taken from the genomic feature filename.

    .. todo:: How to get the GF files

    First, create all main analyses that include all drugs::

        gg = GDSC('IC50_v18.csv', 'DRUG_DECODE.txt',
            genomic_feature_pattern='GF*csv')
        # identifies all genomic features GF* that contain specific TCGA GF
        gg.run()
        # This will take hours depending on the number of drugs.
        # On v18, on an i7 core using 1 CPU this takes about 1 hour 30 minutes.
        # The PANCAN data set is the largest and takes about 1 hour itself.

    You should now have a directory called **ALL** with one sub-directory
    for each TCGA GF file. Keep that in a safe place or you will have to
    restart the analysis.

    Second, split the data just created for each specific proprietary
    compound owner. For instance::

        gg.create_data_packages_for_companies(['AZ'])

    or for all in one go::

        gg.create_data_packages_for_companies()

    Third, create some summary pages::

        gg.create_summary_pages('ALL')
        for company in gg.companies:
            gg.create_summary_pages(company)

    The last step is fast but the whole process of analysis and image
    creation is very long.
    """

    def __init__(self, ic50, drug_decode, genomic_feature_pattern="GF_*csv",
                 mode='standard'):
        """.. rubric:: Constructor

        :param ic50: IC50 input, given to :class:`IC50` (or
            :class:`IC50Cluster` when *mode* is ``'v18'``).
        :param drug_decode: DRUG DECODE input, given to :class:`DrugDecode`.
        :param genomic_feature_pattern: forwarded to the base class
            constructor.
        :param mode: ``'v18'`` reads the IC50 with :class:`IC50Cluster` and
            sets ``FDR_threshold`` to 35; any other value uses :class:`IC50`.
        """
        super(GDSC, self).__init__(genomic_feature_pattern, verbose=True)
        self.debug = False
        self.ic50_filename = ic50
        self.dd_filename = drug_decode

        if mode == 'v18':
            self.ic50 = IC50Cluster(ic50)
        else:
            self.ic50 = IC50(ic50)

        self.drug_decode = DrugDecode(drug_decode)

        self.settings = ANOVASettings()
        self.settings.low_memory = True
        if mode == 'v18':
            self.settings.FDR_threshold = 35

        print(
            "Those settings will be used (note that low_memory is set "
            "to True and check the value of FDR_threshold (set to 35 in v18)")
        print(self.settings)

        # results of each analysis, keyed by TCGA name
        self.results = {}

    @staticmethod
    def _tcga_from_filename(gf_filename):
        """Return the TCGA name embedded in a genomic feature filename.

        This is the second underscore-separated field of *gf_filename*, cut
        at its first dot.
        """
        return gf_filename.split("_")[1].split('.')[0]

    def run(self):
        """Analyse every genomic feature file; results go under ``ALL``."""
        self.mkdir('ALL')
        # First analyse all cases of TCGA + PANCAN once for all and
        # store all results in a dictionary.
        self._analyse_all()

    def _analyse_all(self):
        """Run the ANOVA and build the HTML report for each GF file.

        Results are stored in ``self.results`` under the TCGA name.
        """
        for gf_filename in sorted(self.gf_filenames):
            tcga = self._tcga_from_filename(gf_filename)
            print('================================ Analysing %s data' % tcga)

            directory = os.path.join('ALL', tcga)
            self.mkdir(directory)

            # Computes the ANOVA
            an = ANOVA(self.ic50_filename, gf_filename, self.drug_decode)
            self.an = an
            an.settings = ANOVASettings(**self.settings)
            an.init()  # reset the analysis_type automatically
            results = an.anova_all()

            # Store the results
            self.results[tcga] = results

            print('Analysing %s data and creating images' % tcga)
            self.report = ANOVAReport(an, self.results[tcga])
            self.report.settings.savefig = True
            self.report.settings.directory = directory
            self.report.settings.analysis_type = tcga
            self.report.create_html_pages()

    def create_data_packages_for_companies(self, companies=None):
        """Create one data package per company from the results in ``ALL``.

        For each company and each genomic feature file, the results and the
        IC50 columns are restricted to the drugs that are either public
        (``WEBRELEASE == 'Y'``) or owned by that company
        (``OWNED_BY == company``), the HTML pages are regenerated in
        ``<company>/<tcga>`` and, unless ``self.debug`` is set, the per-drug
        HTML pages and volcano images are copied from ``ALL/<tcga>``.

        .. warning:: DRUG_DECODE and IC50 inputs must be filtered to keep
            only WEBRELEASE=Y and owner.

        :param companies: a company name, a list of names, or None for all
            of ``self.companies``.
        """
        # Local import, as in the rest of this class for optional helpers.
        from easydev import Progress

        if isinstance(companies, str):
            companies = [companies]
        if companies is None:
            companies = self.companies

        n_companies = len(companies)
        for ii, company in enumerate(companies):
            print("\n\n========= Analysing company %s out of %s (%s)" %
                  (ii + 1, n_companies, company))
            self.mkdir(company)

            for gf_filename in sorted(self.gf_filenames):
                tcga = self._tcga_from_filename(gf_filename)
                print("---------------- for TCGA %s" % tcga)

                # Read the results previously computed, from memory when
                # available and from disk otherwise.
                try:
                    cached = self.results[tcga]
                except KeyError:
                    results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                    print("Downloading results from %s" % results_path)
                    results = ANOVAResults(results_path)
                else:
                    # Work on a copy so the stored results stay complete.
                    results = ANOVAResults(cached.df.copy())

                # Get a DrugDecode for that company
                drug_decode_company = self.drug_decode.df.query(
                    "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)
                allowed_ids = drug_decode_company.df.index

                # filter results using the new drug decode
                drug_ids_in_results = get_drug_id(results.df.DRUG_ID)
                mask = [x in allowed_ids for x in drug_ids_in_results]
                results.df = results.df.loc[mask]

                # Just to create an instance with the subset of drug_decode
                # and correct settings. This is also used to store
                # the entire input data set. So, we must remove all drugs
                # not relevant for the analysis of this company
                an = ANOVA(self.ic50_filename, gf_filename,
                           drug_decode_company)
                # Boolean column selection (``DataFrame.select`` no longer
                # exists in pandas >= 1.0).
                keep = [get_drug_id(drug) in allowed_ids
                        for drug in an.ic50.df.columns]
                an.ic50.df = an.ic50.df.loc[:, keep]

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = company + os.sep + tcga
                an.settings.analysis_type = tcga

                self.report = ANOVAReport(an, results)
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)

                if self.debug is False:
                    self.report.create_html_features()
                    self.report.create_html_associations()

                    # For now, we just copy all DRUG pages and images from
                    # the analysis made in ALL
                    print("\nCopying drug files")
                    source_dir = os.path.join('ALL', tcga)
                    dest_dir = os.path.join(company, tcga)
                    drug_ids = results.df.DRUG_ID.unique()
                    pb = Progress(len(drug_ids))
                    for i, drug_id in enumerate(drug_ids):
                        # copy the HTML; the full target name is given so
                        # that a missing directory is an error rather than
                        # a file created under the directory's name
                        html = "%s.html" % drug_id
                        shutil.copy(os.path.join(source_dir, html),
                                    os.path.join(dest_dir, html))
                        # copy the images
                        pattern = os.path.join(source_dir, 'images',
                                               "volcano_%s.*" % drug_id)
                        for image in glob.glob(pattern):
                            shutil.copy(
                                image,
                                os.path.join(dest_dir, 'images',
                                             os.path.basename(image)))
                        pb.animate(i + 1)

    def _get_tcga(self):
        return [self._tcga_from_filename(x) for x in self.gf_filenames]
    tcga = property(_get_tcga,
                    doc="TCGA names parsed from the genomic feature files")

    def _get_companies(self):
        return [x for x in self.drug_decode.companies if x != 'Commercial']
    companies = property(_get_companies,
                         doc="Companies of the drug decode but 'Commercial'")

    def create_summary_pages(self, main_directory='ALL'):
        """Write an ``index.html`` summarising the analyses in a directory.

        The css, js and image files needed by the page are copied into
        *main_directory* if not already there. Every other sub-directory is
        taken as one analysis and contributes one row to the summary; a
        sub-directory whose ``OUTPUT/drugs_summary.csv`` cannot be read gets
        a row filled with ``None``.

        :param str main_directory: directory to summarise (default ``ALL``).
        :return: the summary :class:`pandas.DataFrame`. ``self.html_page``
            and ``self.html_table`` are set as a side effect.
        """
        from gdsctools import gdsctools_data
        from gdsctools.report import HTMLTable

        # (sub-directory, prefix inside the packaged data, filenames)
        assets = [
            ('css', '', ['gdsc.css', 'github-gist.css']),
            ('js', '', ['highlight.pack.js']),
            ('images', 'images' + os.sep,
             ['EBI_logo.png', 'sanger-logo.png']),
        ]

        # create directories and copy relevant files
        for subdir in ('images', 'css', 'js'):
            self.mkdir(main_directory + os.sep + subdir)
        for subdir, prefix, filenames in assets:
            for filename in filenames:
                target = os.sep.join([main_directory, subdir, filename])
                if os.path.isfile(target) is False:
                    shutil.copy(gdsctools_data(prefix + filename), target)

        # Every remaining sub-directory is an analysis. All the asset
        # directories created above are skipped, 'js' included.
        asset_dirs = set(subdir for subdir, _, _ in assets)
        directories = [x for x in glob.glob(main_directory + os.sep + '*')
                       if os.path.isdir(x)]

        summary = []
        for directory in sorted(directories):
            tcga = os.path.basename(directory)
            if tcga in asset_dirs:
                continue

            # number of hits
            path = directory + os.sep + 'OUTPUT' + os.sep
            try:
                hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
            except (IOError, OSError, ValueError):
                # Missing, unreadable or unparsable summary: keep a
                # placeholder row for this directory.
                summary.append([tcga] + [None] * 5)
                continue

            total_hits = hits.total.sum()
            drug_involved = get_drug_id(hits['Unnamed: 0'].unique())

            path = directory + os.sep + 'INPUT' + os.sep
            drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
            info = drug_decode.get_info()

            # Label-based lookup (``.ix`` no longer exists in pandas >= 1.0).
            # NOTE(review): on recent pandas ``.loc`` raises KeyError if a
            # drug of drugs_summary.csv is absent from DRUG_DECODE.csv.
            webrelease = drug_decode.df.loc[drug_involved].WEBRELEASE
            drug_inv_public = sum(webrelease == 'Y')
            drug_inv_prop = sum(webrelease != 'Y')

            summary.append([
                tcga, total_hits, drug_inv_prop, info['N_prop'],
                drug_inv_public, info['N_public']
            ])

        # Columns are given to the constructor so that an empty summary
        # still yields a (header-only) table instead of raising.
        df = pd.DataFrame(summary, columns=[
            'Analysis name', 'Number of hits',
            'Number of involved proprietary compounds', 'out of',
            'Number of involved public', 'out of'
        ])

        # NOTE(review): other code in this file instantiates ``ReportMain``
        # (with ``mode="summary"``); confirm ``ReportMAIN`` is still the
        # intended name here.
        self.html_page = ReportMAIN(directory=main_directory,
                                    filename='index.html',
                                    template_filename='datapack_summary.html')

        # Let us use our HTMLTable to add the HTML references
        self.html_table = HTMLTable(df)
        self.html_table.add_href('Analysis name', newtab=True, url=None,
                                 suffix='/index.html')
        self.html_page.jinja['data_table'] = self.html_table.to_html()
        self.html_page.write()
        return df

    def load_results(self):
        """Find the files results.csv in all TCGA directories"""
        for tcga in self.tcga:
            print(tcga)
            self.results[tcga] = ANOVAResults(
                os.path.join('ALL', tcga, 'OUTPUT', 'results.csv'))