def __init__(self, ic50, drug_decode, genomic_feature_pattern="GF_*csv",
             main_directory="tissue_packages", verbose=True):
    """.. rubric:: Constructor

    :param ic50: name of an :class:`~gdsctools.readers.IC50` file. It must
        be a string; it is only stored here (as :attr:`ic50_filename`).
    :param drug_decode: an :class:`~gdsctools.readers.DrugDecode` file. It
        is kept as :attr:`dd_filename` and also loaded into
        :attr:`drug_decode`.
    :param genomic_feature_pattern: a glob to a set of
        :class:`~gdsctools.readers.GenomicFeature` files (forwarded to the
        parent constructor).
    :param main_directory: stored as :attr:`main_directory`.
    :param verbose: forwarded to the parent constructor.
    """
    super(GDSC, self).__init__(genomic_feature_pattern, verbose=verbose)
    assert isinstance(ic50, str)

    # Inputs are remembered by name
    self.ic50_filename = ic50
    self.dd_filename = drug_decode
    self.main_directory = main_directory

    # Default ANOVA settings, with the animation switched off
    anova_settings = ANOVASettings()
    anova_settings.animate = False
    self.settings = anova_settings

    self.drug_decode = DrugDecode(drug_decode)

    print("Those settings will be used (check FDR_threshold)")
    print(self.settings)

    # results are stored in this dictionary, empty until an analysis is run
    self.results = {}
    self.company_directory = "company_packages"

    # when True, the analysis is restricted to a quick test on 15 features
    self.test = False
def _analyse_all(self, multicore=None):
    """Run the ANOVA and build the HTML report for each genomic feature file

    For every file in :attr:`gf_filenames` (sorted), the TCGA label is
    extracted from the filename (text between the first underscore and the
    following dot), a sub-directory of :attr:`main_directory` is created,
    the IC50 file is read again, the ANOVA is computed and the results are
    stored in ``self.results[tcga]`` before the HTML pages are created.

    :param multicore: forwarded to ``ANOVA.anova_all``.
    """
    for gf_filename in sorted(self.gf_filenames):
        tcga = gf_filename.split("_")[1].split('.')[0]
        print(purple('======================== Analysing %s data' % tcga))

        self.mkdir(self.main_directory + os.sep + tcga)

        # Computes the ANOVA. The IC50 file is first read with the standard
        # reader; if that fails, fall back on the clustered reader. Only
        # regular errors are caught so that Ctrl-C / SystemExit are not
        # mistaken for a format problem.
        try:
            self.ic50 = IC50(self.ic50_filename)
        except Exception:
            print("Clustering IC50 (v18 released data ?)")
            self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

        an = ANOVA(self.ic50, gf_filename, self.drug_decode,
                   verbose=False)

        if self.test is True:
            # quick test: keep only the first 15 columns of the features
            an.features.df = an.features.df[an.features.df.columns[0:15]]

        self.an = an
        an.settings = ANOVASettings(**self.settings)
        an.settings.analysis_type = tcga
        # Per the original comment, init() resets the directory, which is
        # why the output directory is only set further below.
        an.init()

        results = an.anova_all(multicore=multicore)
        an.settings.directory = self.main_directory + os.sep + tcga

        # Store the results
        self.results[tcga] = results

        print('Analysing %s data and creating images' % tcga)
        self.report = ANOVAReport(an)
        self.report.settings.savefig = True
        self.report.create_html_pages(onweb=False)
def __init__(self, ic50, drug_decode, genomic_feature_pattern="GF_*csv",
             mode='standard'):
    """.. rubric:: Constructor

    :param ic50: IC50 input; kept as :attr:`ic50_filename` and read
        immediately into :attr:`ic50`.
    :param drug_decode: DrugDecode input; kept as :attr:`dd_filename` and
        read immediately into :attr:`drug_decode`.
    :param genomic_feature_pattern: glob forwarded to the parent
        constructor.
    :param mode: with ``'v18'`` the IC50 is read with ``IC50Cluster`` and
        the FDR threshold is set to 35; any other value uses ``IC50`` and
        the default threshold.
    """
    super(GDSC, self).__init__(genomic_feature_pattern, verbose=True)
    self.debug = False

    self.ic50_filename = ic50
    self.dd_filename = drug_decode

    is_v18 = (mode == 'v18')

    # pick the reader according to the mode, then read the IC50
    ic50_reader = IC50Cluster if is_v18 else IC50
    self.ic50 = ic50_reader(ic50)

    self.drug_decode = DrugDecode(drug_decode)

    self.settings = ANOVASettings()
    self.settings.low_memory = True
    if is_v18:
        self.settings.FDR_threshold = 35

    print(
        "Those settings will be used (note that low_memory is set "
        "to True and check the value of FDR_threshold (set to 35 in v18)")
    print(self.settings)

    # results are stored in this dictionary, empty until an analysis is run
    self.results = {}
def __init__(self, gdsc, results=None, sep="\t", drug_decode=None,
             verbose=True):
    """.. rubric:: Constructor

    :param gdsc: the instance with which you created the results to report
    :param results: the results returned by :meth:`ANOVA.anova_all`. If
        not provided, the ANOVA is run on the fly.
    :param sep: not used by this constructor.
    :param drug_decode: optional DrugDecode input. When given, it takes
        precedence over the drug decoder stored in *gdsc*.
    :param verbose: stored as :attr:`verbose`.
    """
    self.verbose = verbose
    self.figtools = Savefig(verbose=False)
    self.gdsc = gdsc

    if results is None:
        results = gdsc.anova_all()
    self.df = ANOVAResults(results).df  # this does a copy and sanity check

    # Make sure the DRUG identifiers are integers
    self.df.DRUG_ID = self.df.DRUG_ID.astype(int)

    # Start from default settings and overwrite them with those of gdsc
    self.settings = ANOVASettings()
    for k, v in gdsc.settings.items():
        self.settings[k] = v

    self._colname_drug_id = 'DRUG_ID'
    self.varname_pval = 'ANOVA_FEATURE_pval'
    self.varname_qval = 'ANOVA_FEATURE_FDR'

    # Maybe there was no drug_decode in the gdsc parameter, so a user may
    # have provided a file, in which case we need to update the content
    # of the drug decoder.
    if len(gdsc.drug_decode) == 0 and drug_decode is None:
        # The two sentences are separated by a space (they used to be
        # glued together as "populated.You").
        warnings.warn("No drug name or target will be populated. "
                      "You may want to provide a DRUG_DECODE file.")
        self.drug_decode = DrugDecode()
    elif drug_decode is not None:
        # Read a file
        self.drug_decode = DrugDecode(drug_decode)
    else:
        # Copy from gdsc instance
        self.drug_decode = DrugDecode(gdsc.drug_decode)
    self.df = self.drug_decode.drug_annotations(self.df)

    # Infinite values are set to zero. This stays a best-effort step, but
    # a failure is now reported instead of being silently ignored, and
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    try:
        self.df = self.df.replace({np.inf: 0, -np.inf: 0})
    except Exception as err:
        warnings.warn(
            "Could not replace infinite values in the results: %s" % err)

    # create some data
    self._set_sensible_df()

    self.company = None
def __init__(self, data, sep="\t", settings=None):
    """.. rubric:: Constructor

    :param data: an :class:`~gdsctools.anova.ANOVAResults` instance or a
        dataframe with the proper columns names (see below)
    :param sep: not used by this constructor.
    :param settings: an instance of
        :class:`~gdsctools.settings.ANOVASettings`. A default instance is
        created when None.

    Expected column names to be found if a filename is provided::

        ANOVA_FEATURE_pval
        ANOVA_FEATURE_FDR
        FEATURE_delta_MEAN_IC50
        FEATURE_IC50_effect_size
        N_FEATURE_pos
        FEATURE
        DRUG_ID

    If the plotting is too slow, you can use the :meth:`selector` to prune
    the results (most of the data are noise and overlap on the middle
    bottom area of the plot with little information).
    """
    # NOTE(review): the previous docstring listed N_FEATURE_pos twice; the
    # second entry was presumably meant to be N_FEATURE_neg -- to confirm.

    # We work on a copy since we may change the data. An ANOVAResults
    # exposes the dataframe as a "df" attribute; anything without that
    # attribute is assumed to be a dataframe itself.
    try:
        self.df = data.df.copy()
    except AttributeError:
        self.df = data.copy()

    # this is redundant could reuse the input ??
    if settings is None:
        from gdsctools.settings import ANOVASettings
        self.settings = ANOVASettings()
    else:
        self.settings = AttrDict(**settings)

    self.figtools = Savefig()
    self.figtools.directory = self.settings.directory

    self.drugs = set(self.df[self._colname_drugid])
    self.features = set(self.df[self._colname_feature])

    # intensive calls made once for all
    self.groups_by_drugs = self.df.groupby(self._colname_drugid).groups
    self.groups_by_features = self.df.groupby(
        self._colname_feature).groups
def __init__(self, gdsc, results, sep="\t", drug_decode=None):
    """.. rubric:: Constructor

    :param gdsc: the instance with which you created the results to report
    :param results: the results returned by :meth:`ANOVA.anova_all`
    :param sep: not used by this constructor.
    :param drug_decode: optional DrugDecode input. When given, it takes
        precedence over the drug decoder stored in *gdsc*.
    """
    self.figtools = Savefig()
    self.gdsc = gdsc
    self.df = ANOVAResults(results).df  # this does a copy and sanity check

    # Start from default settings and overwrite them with those of gdsc
    self.settings = ANOVASettings()
    for k, v in gdsc.settings.items():
        self.settings[k] = v

    self._colname_drug_id = 'DRUG_ID'
    self.varname_pval = 'ANOVA_FEATURE_pval'
    self.varname_qval = 'ANOVA_FEATURE_FDR'

    # Maybe there was no drug_decode in the gdsc parameter, so a user may
    # have provided a file, in which case we need to update the content
    # of the drug decoder.
    if len(gdsc.drug_decode) == 0 and drug_decode is None:
        # The two sentences are separated by a space (they used to be
        # glued together as "populated.You").
        warnings.warn("No drug name or target will be populated. "
                      "You may want to provide a DRUG_DECODE file.")
        self.drug_decode = DrugDecode()
    elif drug_decode is not None:
        # Read a file
        self.drug_decode = DrugDecode(drug_decode)
    else:
        # Copy from gdsc instance
        self.drug_decode = DrugDecode(gdsc.drug_decode)
    self.df = self.drug_decode.drug_annotations(self.df)

    # create some data
    self._set_sensible_df()

    # just to create the directory
    ReportMAIN(directory=self.settings.directory)
def _analyse_all(self):
    """Run the ANOVA and create the HTML pages for each genomic feature file

    The genomic feature files are processed in sorted order. The TCGA
    label is the part of the filename found between the first underscore
    and the following dot. Results are saved in ``self.results[tcga]`` and
    the report is written in the ``ALL/<tcga>`` directory.
    """
    for gf_filename in sorted(self.gf_filenames):
        after_underscore = gf_filename.split("_")[1]
        tcga = after_underscore.split('.')[0]
        print('================================ Analysing %s data' % tcga)

        self.mkdir('ALL' + os.sep + tcga)

        # ANOVA computation using a copy of the current settings
        self.an = an = ANOVA(self.ic50_filename, gf_filename,
                             self.drug_decode)
        an.settings = ANOVASettings(**self.settings)
        an.init()  # reset the analysis_type automatically

        # Keep the results of this TCGA case
        results = an.anova_all()
        self.results[tcga] = results

        print('Analysing %s data and creating images' % tcga)
        report = self.report = ANOVAReport(an, results)
        report.settings.savefig = True
        report.settings.directory = 'ALL/' + tcga
        report.settings.analysis_type = tcga
        report.create_html_pages()
def __init__(self, ic50, genomic_features=None, drug_decode=None,
             verbose=True, low_memory=True, set_media_factor=False):
    """.. rubric:: Constructor

    :param ic50: the IC50 input, read with :class:`readers.IC50`. Rows
        should be the COSMIC identifiers and columns should be the Drug
        names (or identifiers)
    :param genomic_features: the genomic features, read with
        :class:`readers.GenomicFeatures`, with rows as in the IC50 matrix
        and columns as features. The first 3 columns must be named
        specifically to hold tissues, MSI (see format). If None, the
        default data set of the reader is used.
    :param drug_decode: a 3 column CSV file with drug's name and targets
        see :mod:`readers` for more information.
    :param verbose: if True, informative messages are printed.
    :param low_memory: stored in ``settings.low_memory``.
    :param set_media_factor: if True and the features have no MEDIA
        factor, ``fill_media_factor`` is called on the features.

    The attribute :attr:`settings` contains specific settings related to
    the analysis or visualisation.
    """
    self.verbose = verbose
    self._init_called = False

    # We first need to read the IC50 using a dedicated reader
    self.ic50 = readers.IC50(ic50)

    # Create a dictionary version of the data to be accessed per drug
    # where NA have already been removed. Each drug is a dictionary with
    # 2 keys: Y for the data and indices for the cosmicID where there is
    # an IC50 measured. The label-based .loc indexer replaces .ix, which
    # no longer exists in pandas >= 1.0.
    ic50_parse = self.ic50.df.copy().unstack().dropna()
    self.ic50_dict = {
        d: {'indices': ic50_parse.loc[d].index,
            'Y': ic50_parse.loc[d].values}
        for d in self.ic50.drugIds}

    # Reads features if provided, otherwise use a default data set
    if genomic_features is None:
        # Reads default version provided with the package
        self.features = readers.GenomicFeatures()
    else:
        self.features = readers.GenomicFeatures(genomic_features)

    if self.features.found_media is False and \
            set_media_factor is True:
        if self.verbose:
            print('Populating MEDIA Factor in the Genomic Feature matrix')
        self.features.fill_media_factor()

    #: a CSV with 3 columns used in the report
    self.read_drug_decode(drug_decode)

    # create the multiple testing factory used in anova_all()
    self.multiple_testing = MultipleTesting()

    # We prune the genomic features by setting the cosmic ids of the
    # features to be those of the cosmic ids of the IC50. See readers
    # module. This assignment prunes the features dataframe
    # automatically. It fails if a cosmic identifier is not found in the
    # features' cosmic ids, so we check beforehand, warn the user and
    # drop the unknown identifiers from the IC50.
    unknowns = set(self.ic50.cosmicIds).difference(
        set(self.features.cosmicIds))

    if len(unknowns) > 0:
        print("WARNING: " +
              "%s cosmic identifiers in your IC50 " % len(unknowns) +
              "could not be found in the genomic feature matrix. " +
              "They will be dropped. Consider using a user-defined " +
              "genomic features matrix")
        self.ic50.drop_cosmic(list(unknowns))

    self.features.cosmicIds = self.ic50.cosmicIds

    #: an instance of :class:`~gdsctools.settings.ANOVASettings`
    self.settings = ANOVASettings()
    self.settings.low_memory = low_memory

    # alias to all column names to store results
    # cast to list (Python3).
    self.column_names = list(ANOVAResults().mapping.keys())

    # skip assoc_id for now
    self._odof_dict = dict([(name, None) for name in self.column_names])

    # a cache to store ANOVA results for each drug
    self.individual_anova = {}

    # must be called if ic50 or features are changed.
    self.init()
class BaseModels(object):
    """A Base class for ANOVA / ElasticNet models

    It reads the IC50, genomic features and drug decoder inputs, prunes
    the inputs so that they share the same COSMIC identifiers and
    pre-computes (see :meth:`init`) several dictionaries used to speed up
    the analysis.
    """

    def __init__(self, ic50, genomic_features=None, drug_decode=None,
                 verbose=True, low_memory=True, set_media_factor=False):
        """.. rubric:: Constructor

        :param ic50: the IC50 input, read with :class:`readers.IC50`. Rows
            should be the COSMIC identifiers and columns should be the
            Drug names (or identifiers)
        :param genomic_features: the genomic features, read with
            :class:`readers.GenomicFeatures`, with rows as in the IC50
            matrix and columns as features. The first 3 columns must be
            named specifically to hold tissues, MSI (see format). If
            None, the default data set of the reader is used.
        :param drug_decode: a 3 column CSV file with drug's name and
            targets see :mod:`readers` for more information.
        :param verbose: if True, informative messages are printed.
        :param low_memory: stored in ``settings.low_memory``; when True,
            :meth:`init` does not cache the features for each drug.
        :param set_media_factor: if True and the features have no MEDIA
            factor, ``fill_media_factor`` is called on the features.

        The attribute :attr:`settings` contains specific settings related
        to the analysis or visualisation.
        """
        self.verbose = verbose
        self._init_called = False

        # We first need to read the IC50 using a dedicated reader
        self.ic50 = readers.IC50(ic50)

        # Create a dictionary version of the data to be accessed per drug
        # where NA have already been removed. Each drug is a dictionary
        # with 2 keys: Y for the data and indices for the cosmicID where
        # there is an IC50 measured. (Rebuilt by init() at the end of the
        # constructor.)
        ic50_parse = self.ic50.df.copy().unstack().dropna()
        self.ic50_dict = {
            d: {'indices': ic50_parse.loc[d].index,
                'Y': ic50_parse.loc[d].values}
            for d in self.ic50.drugIds}

        # Reads features if provided, otherwise use a default data set
        if genomic_features is None:
            # Reads default version provided with the package
            self.features = readers.GenomicFeatures()
        else:
            self.features = readers.GenomicFeatures(genomic_features)

        if self.features.found_media is False and \
                set_media_factor is True:
            if self.verbose:
                print('Populating MEDIA Factor in the Genomic Feature matrix')
            self.features.fill_media_factor()

        #: a CSV with 3 columns used in the report
        self.read_drug_decode(drug_decode)

        # create the multiple testing factory used in anova_all()
        self.multiple_testing = MultipleTesting()

        # We prune the genomic features by setting the cosmic ids of the
        # features to be those of the cosmic ids of the IC50. See readers
        # module. This assignment prunes the features dataframe
        # automatically. It fails if a cosmic identifier is not found in
        # the features' cosmic ids, so we check beforehand, warn the user
        # and drop the unknown identifiers from the IC50.
        unknowns = set(self.ic50.cosmicIds).difference(
            set(self.features.cosmicIds))

        if len(unknowns) > 0:
            print("WARNING: " +
                  "%s cosmic identifiers in your IC50 " % len(unknowns) +
                  "could not be found in the genomic feature matrix. " +
                  "They will be dropped. Consider using a user-defined " +
                  "genomic features matrix")
            self.ic50.drop_cosmic(list(unknowns))

        self.features.cosmicIds = self.ic50.cosmicIds

        #: an instance of :class:`~gdsctools.settings.ANOVASettings`
        self.settings = ANOVASettings()
        self.settings.low_memory = low_memory

        # alias to all column names to store results
        # cast to list (Python3).
        self.column_names = list(ANOVAResults().mapping.keys())

        # skip assoc_id for now
        self._odof_dict = dict([(name, None) for name in self.column_names])

        # a cache to store ANOVA results for each drug
        self.individual_anova = {}

        # must be called if ic50 or features are changed.
        self.init()

    def _autoset_msi_factor(self):
        """Decide whether the MSI factor is included in the analysis

        Without an MSI column in the features, the MSI factor is excluded
        and the analysis type is set to 'feature_only'.
        """
        if self.features.found_msi:
            # if the number of pos. (or neg.) factors is not large enough
            # then the MSI factor is not used
            msi_name = self.features.colnames.msi
            self.msi_factor = self.features.df[msi_name]
            total = len(self.msi_factor)
            positives = self.msi_factor.sum()
            negatives = total - positives

            # we must have at least 2 positives or 2 negative
            # This is therefore a < comparison here below. See in
            # _get_one_drug_one_feature_data that we use >= which
            # is consistent.
            if positives < self.settings.MSI_factor_threshold:
                self.settings.include_MSI_factor = False
            if negatives < self.settings.MSI_factor_threshold:
                self.settings.include_MSI_factor = False
        else:
            self.settings.include_MSI_factor = False
            self.settings.analysis_type = 'feature_only'

    def _autoset_tissue_factor(self):
        """Set the analysis type (one tissue or PANCAN) from the features"""
        # select tissue based on the features
        tissue_name = self.features.colnames.tissue
        self.tissue_factor = self.features.df[tissue_name]
        if len(self.tissue_factor.unique()) == 1:
            # there is only one tissue; it names both the analysis type
            # and the output directory
            tissue = self.tissue_factor.unique()[0]
            self.settings.analysis_type = tissue
            self.settings.directory = tissue
        else:
            # this is a PANCAN analysis
            self.settings.analysis_type = 'PANCAN'

    def _autoset_media_factor(self):
        """Include the media factor only for PANCAN analysis with MEDIA data"""
        if self.settings.analysis_type != 'PANCAN':
            self.settings.include_media_factor = False
        elif self.features.found_media is True:
            self.settings.include_media_factor = True
            colname = self.features.colnames.media
            self.media_factor = self.features.df[colname]
        else:
            self.settings.include_media_factor = False

    def set_cancer_type(self, ctype=None):
        """Select only a set of tissues.

        Input IC50 may be PANCAN (several cancer tissues).
        This function can be used to select a subset of tissues.
        This function changes the :attr:`ic50` dataframe and possibly
        the feature as well if some are not relevant anymore (sum of the
        column is zero for instance).

        :param ctype: a tissue name or a list of tissue names. Nothing is
            done if it is None or 'PANCAN'.
        """
        if ctype is None:
            return

        if ctype == 'PANCAN':
            # Nothing to do, keep everything
            return

        if isinstance(ctype, str):
            ctype = [str(ctype)]

        for this in ctype:
            # report the offending tissue only; formatting the whole
            # container fails with a TypeError for multi-item tuples
            assert this in self.features.tissues, "%s not found" % this

        # keep only features that correspond to the tissue
        self.features.keep_tissue_in(ctype)

        # label-based selection (.ix does not exist in pandas >= 1.0)
        self.ic50.df = self.ic50.df.loc[self.features.df.index]
        self.init()

    def read_settings(self, settings):
        """Read settings and update cancer type if set"""
        self.settings.from_json(settings)
        self.set_cancer_type(self.settings.analysis_type)

    def init(self):
        """Pre-compute the data structures used to speed up the analysis

        Must be called whenever the IC50 or the features are changed. It
        rebuilds :attr:`ic50_dict`, sets the tissue, MSI and media
        factors, fills the per-drug dictionaries, builds the dummy
        variables used by the OLS computation and resets the cache of
        individual ANOVA results.
        """
        # Some preprocessing to speed up data access
        ic50_parse = self.ic50.df.copy().unstack().dropna()

        # for each drug, we store the IC50s (Y) and corresponding indices
        # of cosmic identifiers. The label-based .loc indexer replaces
        # .ix, which no longer exists in pandas >= 1.0.
        self.ic50_dict = {
            d: {'indices': ic50_parse.loc[d].index,
                'Y': ic50_parse.loc[d].values}
            for d in self.ic50.drugIds}

        # save the tissues
        self._autoset_tissue_factor()

        # and MSI (Microsatellite instability) status of the samples.
        self._autoset_msi_factor()

        # and (growth) media factor
        self._autoset_media_factor()

        # dictionaries to speed up code.
        self.features_dict = {}
        self.msi_dict = {}
        self.tissue_dict = {}
        self.media_dict = {}

        # fill the dictionaries for each drug once for all
        for drug_name in self.ic50.drugIds:
            indices = self.ic50_dict[drug_name]['indices']

            # if we were to store all drugs /features, this takes
            # 1Gb of memory for 265 drugs and 680 features. This is
            # therefore not scalable, especially for multiprocessing.
            if self.settings.low_memory is True:
                pass
            else:
                self.features_dict[drug_name] = self.features.df.loc[indices]

            # MSI, media and tissue are not large data files and can be
            # stored
            if self.features.found_msi:
                self.msi_dict[drug_name] = self.msi_factor.loc[indices]

            if self.features.found_media:
                self.media_dict[drug_name] = self.media_factor.loc[indices]

            self.tissue_dict[drug_name] = self.tissue_factor.loc[indices]

        # some preprocessing for the OLS computation.
        # We create the dummies for the tissue factor once for all
        # Note that to agree with R convention, we have to resort the
        # column to agree with R convention that is a<B==b<c instead of
        # where A<B<C<a<b<c (in R)
        # NOTE(review): the sorted names are assigned to the existing
        # columns (a rename, not a reordering of the data) -- confirm
        # that this is the intended behaviour.
        self._tissue_dummies = pd.get_dummies(self.tissue_factor)
        columns = self._tissue_dummies.columns
        columns = sorted(columns, key=lambda s: s.lower())
        columns = ['C(tissue)[T.' + x + ']' for x in columns]
        self._tissue_dummies.columns = columns

        if self.settings.include_media_factor:
            self._media_dummies = pd.get_dummies(self.media_factor)
            columns = self._media_dummies.columns
            columns = ['C(media)[T.' + x + ']' for x in columns]
            self._media_dummies.columns = columns
            for col in columns:
                self._tissue_dummies[col] = self._media_dummies[col]

        N = len(self._tissue_dummies)
        self._tissue_dummies['C(msi)[T.1]'] = [1] * N
        self._tissue_dummies['feature'] = [1] * N
        self._tissue_dummies.insert(0, 'Intercept', [1] * N)

        # drop first feature in the tissues that seems to be used as a
        # reference in the regression
        tissues = [x for x in self._tissue_dummies.columns if 'tissue' in x]
        self._tissue_dummies.drop(tissues[0], axis=1, inplace=True)

        if self.settings.include_media_factor:
            media = [x for x in self._tissue_dummies.columns if 'media' in x]
            self._tissue_dummies.drop(media[0], axis=1, inplace=True)

        # reset the buffer.
        self.individual_anova = {}

        # the summary of the factors is printed on the first call only
        if self.verbose and self._init_called is False:
            for this in ['tissue', 'media', 'msi', 'feature']:
                if this in self._get_analysis_mode():
                    print(this.upper() + " FACTOR : included")
                else:
                    print(this.upper() + " FACTOR : NOT included")
        self._init_called = True

    def _get_cosmics(self):
        return self.ic50.cosmicIds

    def _set_cosmics(self, cosmics):
        self.ic50.cosmicIds = cosmics
        self.features.cosmicIds = cosmics
        self.init()
        self.individual_anova = {}
    cosmicIds = property(
        _get_cosmics, _set_cosmics,
        doc="get/set the cosmic identifiers in the IC50 and feature matrices")

    def _get_drug_names(self):
        return self.ic50.drugIds

    def _set_drug_names(self, drugs):
        self.ic50.drugIds = drugs
        self.init()
        # not need to init this again ?
        self.individual_anova = {}
    drugIds = property(_get_drug_names, _set_drug_names,
                       doc="Get/Set drug identifers")

    def _get_feature_names(self):
        # the first "shift" entries are skipped (not regular features)
        shift = self.features.shift
        return self.features.features[shift:]

    def _set_features_names(self, features):
        self.features.features = features
        self.init()
        self.individual_anova = {}
    feature_names = property(_get_feature_names, _set_features_names,
                             doc="Get/Set feature names")

    def _get_analysis_mode(self):
        """Return the list of factors included in the analysis

        'feature' is always included; 'tissue', 'msi' and 'media' depend
        on the settings.
        """
        modes = []
        if self.settings.analysis_type == 'PANCAN':
            modes.append('tissue')

        if self.settings.include_MSI_factor is True:
            modes.append('msi')

        if self.settings.include_media_factor is True:
            modes.append('media')

        modes.append('feature')
        return modes

    def diagnostics(self):
        """Return dictionary with information about the analysis

        :return: a dictionary with the number of drugs (n_drug), the
            number of drug/feature combinations (n_combos), the number of
            feasible tests (feasible_tests) and the percentage of
            feasible tests (percentage_feasible_tests; 0 if there is no
            combination at all).
        """
        n_drugs = len(self.ic50.drugIds)
        feature_names = self.features.features[self.features.shift:]
        n_features = len(self.features.features) - self.features.shift
        n_combos = n_drugs * n_features
        feasible = 0
        pb = Progress(n_drugs, 1)
        counter = 0
        for drug in self.ic50.drugIds:
            for feature in feature_names:
                dd = self._get_one_drug_one_feature_data(
                    drug, feature, diagnostic_only=True)
                if dd.status is True:
                    feasible += 1
            # NOTE(review): the progress bar is sized on the number of
            # drugs, so it is updated once per drug -- confirm.
            counter += 1
            pb.animate(counter)

        # avoid a ZeroDivisionError when there is nothing to test
        if n_combos:
            percentage = float(feasible) / n_combos * 100
        else:
            percentage = 0.0

        results = {
            'n_drug': n_drugs,
            'n_combos': n_combos,
            'feasible_tests': feasible,
            'percentage_feasible_tests': percentage}
        return results

    def read_drug_decode(self, filename=None):
        """Read file with the DRUG information

        .. seealso:: :class:`gdsctools.readers.DrugDecode`
        """
        # Read the DRUG decoder file into a DrugDecode/Reader instance
        self.drug_decode = readers.DrugDecode(filename)

    def __str__(self):
        txt = self.ic50.__str__()
        txt += "\n" + self.features.__str__()
        return txt

    def __repr__(self):
        txt = self.__str__()
        return txt
def anova_all(self, animate=True, drugs=None):
    """Run all ANOVA tests for all drugs and all features.

    :param drugs: you may select a subset of drugs
    :param animate: shows the progress bar
    :return: an :class:`~gdsctools.anova_results.ANOVAResults`
        instance with the dataframe
        stored in an attribute called **df**

    Loops over all drugs calling :meth:`anova_one_drug` for each
    drug and concatenating all results together. Note that once all
    data are gathered, an extra column containing the FDR corrections
    is added to the dataframe using :meth:`add_pvalues_correction`
    method. An extra column named "ASSOC_ID" is also added with
    a unique identifer sorted by ascending FDR.

    .. note:: A thorough comparison with version v17 give the same FDR
        results (difference ~1e-6); Note however that the qvalue results
        differ by about 0.3% due to different smoothing in R and Python.
    """
    # drop DRUG where number of IC50 (non-null) is below the minimum
    # set in the settings. axis=0 is default but we emphasize that the
    # sum is over rows, giving one count per column (i.e. per drug)
    vv = self.ic50.df.notnull().sum(axis=0)
    # FIXME: should be in one_drug_one_feature ??
    drug_names = vv.index[vv >= self.settings.minimum_nonna_ic50]

    # if user provided a list of drugs, use them:
    if drugs is not None:
        # todo: check validity of the drug names
        drug_names = drugs[:]

    pb = Progress(len(drug_names), 1)
    drug_names = list(drug_names)
    pylab.shuffle(drug_names)
    if animate is True:
        pb.animate(0)

    for i, drug_name in enumerate(drug_names):
        # results already computed are taken from the cache
        if drug_name not in self.individual_anova:
            res = self.anova_one_drug(drug_name, animate=False,
                                      output='dataframe')
            self.individual_anova[drug_name] = res
        if animate is True:
            pb.animate(i + 1)
    print("\n")

    if len(self.individual_anova) == 0:
        return ANOVAResults()

    # NOTE(review): all cached drugs are concatenated, including drugs
    # cached by a previous call that are not part of the current
    # selection -- confirm this is intended.
    df = pd.concat(self.individual_anova, ignore_index=True)

    if len(df) == 0:
        # NOTE(review): a plain dataframe (not an ANOVAResults) is
        # returned here; kept as is for backward compatibility.
        return df

    # sort all data by ANOVA p-values. Older pandas versions have no
    # sort_values method; only that case triggers the fallback so that
    # genuine errors (e.g. a missing column) are not hidden.
    try:
        df.sort_values('ANOVA_FEATURE_pval', inplace=True)
    except AttributeError:
        df.sort('ANOVA_FEATURE_pval', inplace=True)

    # all ANOVA have been computed individually for each drug and each
    # feature. Now, we need to compute the multiple testing corrections
    if self.settings.pvalue_correction_level == 'global':
        df = self.add_pvalues_correction(df)

    # insert a unique identifier as first column
    df.insert(0, 'ASSOC_ID', range(1, len(df) + 1))

    self.df = df
    # order the column names as defined in the __init__ method
    df = df[self.column_names]
    df.reset_index(inplace=True, drop=True)

    results = ANOVAResults()
    results.df = df
    results.settings = ANOVASettings(**self.settings)
    return results
def anova_one_drug(self, drug_id, animate=True, output='object'):
    """Computes ANOVA for a given drug across all features

    :param str drug_id: a valid drug identifier (a column of the IC50
        dataframe).
    :param animate: shows the progress bar
    :param output: with 'object' (default), an ANOVAResults instance is
        returned; with any other value, a dataframe is returned.
    :return: an ANOVAResults instance or a dataframe (see *output*). If
        no association could be computed, the empty dataframe is returned
        whatever the value of *output*.

    Calls :meth:`anova_one_drug_one_feature` for each feature.
    """
    # The first columns (as many as features.shift) hold the factors,
    # not the features, so they are skipped.
    shift = self.features.shift
    all_features = self.features.df.copy()
    all_features = all_features[all_features.columns[shift:]]

    # Only the features whose column sum is at least 3 are analysed
    # FIXME what about features with less than 3 zeros ?
    enough_positives = all_features.sum(axis=0) >= 3
    selected_features = all_features[all_features.columns[enough_positives]]

    assert drug_id in self.ic50.df.columns

    feature_names = selected_features.columns
    pb = Progress(len(feature_names), 10)

    # With production=True, anova_one_drug_one_feature returns a
    # dictionary rather than a dataframe; associations without a p-value
    # are ignored.
    res = {}
    for count, feature in enumerate(feature_names, start=1):
        this = self.anova_one_drug_one_feature(drug_id, feature,
                                               production=True)
        if this['ANOVA_FEATURE_pval'] is not None:
            res[feature] = this
        if animate is True:
            pb.animate(count)

    # One row per feature
    records = pd.DataFrame.from_records(res).T
    df = ANOVAResults().astype(records)
    if len(df) == 0:
        return df

    # append DRUG_NAME/DRUG_TARGET columns
    df = self.drug_decode.drug_annotations(df)

    # the p-values correction is applied whatever the output type
    df = self.add_pvalues_correction(df)
    if output != 'object':
        return df

    res = ANOVAResults(df, self.settings)
    res.settings = ANOVASettings(**self.settings)
    return res
def create_data_packages_for_companies(self, companies=None):
    """Creates a data package for each company found in the DrugDecode file

    .. warning:: DRUG_DECODE and IC50 inputs must be filtered to keep
        only WEBRELEASE=Y and owner.

    :param companies: one company name (a string) or a list of names. By
        default, takes the companies found in :attr:`companies`.
    :raises ValueError: if there is no company to process.
    """
    # companies must be just one name (one string) or a list of strings
    if isinstance(companies, str):
        companies = [companies]

    if companies is None:
        companies = self.companies

    if len(companies) == 0:
        raise ValueError(
            "Could not find any companies in the DrugDecode file")

    # The main directory
    self.mkdir(self.company_directory)

    # Loop over all companies, retrieving information built
    # in analyse() method, selecting for each TCGA all information
    # for that company only (and public drugs)
    Ncomp = len(companies)
    for ii, company in enumerate(companies):
        print(
            purple("\n=========== Analysing company %s out of %s (%s)" %
                   (ii + 1, Ncomp, company)))
        self.mkdir(self.company_directory + os.sep + company)

        # Handle each TCGA case separately
        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print(brown(" ------- building TCGA %s sub directory" % tcga))

            # Use the results previously computed if available in memory,
            # otherwise read them from the file saved in the main
            # directory
            try:
                results_df = self.results[tcga].df.copy()
            except (KeyError, AttributeError):
                results_path = "%s/%s/OUTPUT/results.csv" % (
                    self.main_directory, tcga)
                results_df = ANOVAResults(results_path)

            # Make sure the results are formatted correctly
            results = ANOVAResults(results_df)

            # Get the DrugDecode information for that company only
            drug_decode_company = self.drug_decode.df.query(
                "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)

            # Transform into a proper DrugDecode class for safety
            drug_decode_company = DrugDecode(drug_decode_company)
            company_drug_ids = drug_decode_company.df.index

            # Filter the results to keep only public drugs and that
            # company. Make sure the identifiers are integers. A boolean
            # mask with .loc replaces .ix (removed in pandas 1.0).
            results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)
            mask = [x in company_drug_ids for x in results.df.DRUG_ID]
            results.df = results.df.loc[mask]

            # We read the IC50 again, falling back on the clustered
            # reader if the standard reader fails
            try:
                self.ic50 = IC50(self.ic50_filename)
            except Exception:
                self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

            # And create an ANOVA instance. This is not to do the analysis
            # again but to hold various information
            an = ANOVA(self.ic50, gf_filename, drug_decode_company,
                       verbose=False)

            # Keep only the IC50 columns (drugs) of that company.
            # DataFrame.select was removed in pandas 1.0, hence the
            # boolean column mask.
            keep_columns = [drug in company_drug_ids
                            for drug in an.ic50.df.columns]
            an.ic50.df = an.ic50.df.loc[:, keep_columns]

            an.settings = ANOVASettings(**self.settings)
            an.init()
            # set after init(), which (going by the sibling analysis
            # code) may reset these settings -- to be confirmed
            an.settings.directory = self.company_directory + os.sep + \
                company + os.sep + tcga
            an.settings.analysis_type = tcga

            # Now we create the report
            self.report = ANOVAReport(an, results,
                                      drug_decode=drug_decode_company,
                                      verbose=self.verbose)
            self.report.company = company
            self.report.settings.analysis_type = tcga
            self.report.create_html_main(False)
            self.report.create_html_manova(False)
            self.report.create_html_features()
            self.report.create_html_drugs()
            self.report.create_html_associations()
def create_data_packages_for_companies(self, companies=None):
    """Creates a data package for each company

    .. warning:: DRUG_DECODE and IC50 inputs must be filtered to keep
        only WEBRELEASE=Y and owner.

    For each company and each genomic feature file (TCGA case), the
    results are restricted to the public drugs and the drugs of that
    company, the HTML reports are created in ``<company>/<tcga>`` and the
    drug pages and volcano images are copied from the ``ALL/<tcga>``
    directory.

    :param companies: one company name (a string) or a list of names. By
        default, takes the companies found in :attr:`companies`.
    """
    if isinstance(companies, str):
        companies = [companies]

    if companies is None:
        companies = self.companies

    Ncomp = len(companies)
    for ii, company in enumerate(companies):
        print("\n\n========= Analysing company %s out of %s (%s)" %
              (ii + 1, Ncomp, company))
        self.mkdir(company)

        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print("---------------- for TCGA %s" % tcga)

            # Use the results previously computed if available in memory,
            # otherwise read them from the file saved in ALL/
            try:
                results_df = self.results[tcga].df.copy()
            except (KeyError, AttributeError):
                results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                print("Downloading results from %s" % results_path)
                results_df = ANOVAResults(results_path)

            results = ANOVAResults(results_df)

            # Get a DrugDecode for that company
            drug_decode_company = self.drug_decode.df.query(
                "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
            # Transform into a proper DrugDecode class for safety
            drug_decode_company = DrugDecode(drug_decode_company)
            company_drug_ids = drug_decode_company.df.index

            # filter results using the new drug decode. A boolean mask
            # with .loc replaces .ix (removed in pandas 1.0).
            drug_ids_in_results = get_drug_id(results.df.DRUG_ID)
            mask = [x in company_drug_ids for x in drug_ids_in_results]
            results.df = results.df.loc[mask]

            # Just to create an instance with the subset of drug_decode
            # and correct settings. This is also used to store
            # the entire input data set. So, we must remove all drugs
            # not relevant for the analysis of this company
            an = ANOVA(self.ic50_filename, gf_filename,
                       drug_decode_company)

            # DataFrame.select was removed in pandas 1.0, hence the
            # boolean column mask.
            keep_columns = [get_drug_id(drug) in company_drug_ids
                            for drug in an.ic50.df.columns]
            an.ic50.df = an.ic50.df.loc[:, keep_columns]

            an.settings = ANOVASettings(**self.settings)
            an.init()
            an.settings.directory = company + os.sep + tcga
            an.settings.analysis_type = tcga

            self.report = ANOVAReport(an, results)
            self.report.settings.analysis_type = tcga
            self.report.create_html_main(False)
            self.report.create_html_manova(False)
            # NOTE(review): only these two calls are assumed to be
            # skipped in debug mode; confirm that the copy of the drug
            # files below is not meant to be skipped as well.
            if self.debug is False:
                self.report.create_html_features()
                self.report.create_html_associations()

            # For now, we just copy all DRUG images from
            # the analysis made in ALL
            from easydev import shellcmd, Progress
            print("\nCopying drug files")
            drug_ids = results.df.DRUG_ID.unique()
            pb = Progress(len(drug_ids))
            for i, drug_id in enumerate(drug_ids):
                # copy the HTML
                filename = "%s.html" % drug_id
                source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep)
                cmd = "cp %s%s %s" % (source, filename, dest)
                shellcmd(cmd, verbose=False)
                # copy the images
                filename = "volcano_%s.*" % drug_id
                source = "ALL%s%s%simages%s" % (os.sep, tcga, os.sep,
                                                os.sep)
                dest = "%s%s%s%simages%s" % (company, os.sep, tcga,
                                             os.sep, os.sep)
                cmd = "cp %s%s %s" % (source, filename, dest)
                shellcmd(cmd, verbose=False)
                pb.animate(i + 1)