Python ANOVASettings示例，gdsctools.settings.ANOVASettings Python示例

示例#1

0

显示文件

    def __init__(self,
                 ic50,
                 drug_decode,
                 genomic_feature_pattern="GF_*csv",
                 main_directory="tissue_packages",
                 verbose=True):
        """.. rubric:: Constructor

        :param ic50: an :class:`~gdsctools.readers.IC50` file.
        :param drug_decode: an :class:`~gdsctools.readers.DrugDecode` file.
        :param genomic_feature_pattern: a glob to a set of
            :class:`~gdsctools.readers.GenomicFeature` files.

        """
        super(GDSC, self).__init__(genomic_feature_pattern, verbose=verbose)
        assert isinstance(ic50, str)
        self.ic50_filename = ic50
        self.dd_filename = drug_decode
        self.main_directory = main_directory

        self.settings = ANOVASettings()
        self.settings.animate = False
        self.drug_decode = DrugDecode(drug_decode)

        print("Those settings will be used (check FDR_threshold)")
        print(self.settings)

        # figure out the cancer types:
        self.results = {}

        self.company_directory = "company_packages"

        # quick test on 15 features
        self.test = False

示例#2

0

显示文件

    def _analyse_all(self, multicore=None):
        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print(purple('======================== Analysing %s data' % tcga))

            self.mkdir(self.main_directory + os.sep + tcga)
            # Computes the ANOVA
            try:
                self.ic50 = IC50(self.ic50_filename)
            except:
                print("Clustering IC50 (v18 released data ?)")
                self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)
            an = ANOVA(self.ic50, gf_filename, self.drug_decode, verbose=False)

            if self.test is True:
                an.features.df = an.features.df[an.features.df.columns[0:15]]

            self.an = an
            an.settings = ANOVASettings(**self.settings)
            an.settings.analysis_type = tcga
            an.init()  # This reset the directory

            results = an.anova_all(multicore=multicore)
            an.settings.directory = self.main_directory + os.sep + tcga
            # Store the results
            self.results[tcga] = results

            print('Analysing %s data and creating images' % tcga)
            self.report = ANOVAReport(an)
            self.report.settings.savefig = True

            self.report.create_html_pages(onweb=False)

示例#3

0

显示文件

文件： gdsc.py 项目： howard-lightfoot/gdsctools

    def __init__(self,
                 ic50,
                 drug_decode,
                 genomic_feature_pattern="GF_*csv",
                 mode='standard'):
        super(GDSC, self).__init__(genomic_feature_pattern, verbose=True)
        self.debug = False
        self.ic50_filename = ic50
        self.dd_filename = drug_decode

        if mode == 'v18':
            self.ic50 = IC50Cluster(ic50)
        else:
            self.ic50 = IC50(ic50)

        self.drug_decode = DrugDecode(drug_decode)
        self.settings = ANOVASettings()
        self.settings.low_memory = True

        if mode == 'v18':
            self.settings.FDR_threshold = 35

        print(
            "Those settings will be used (note that low_memory is set "
            "to True and check the value of FDR_threshold (set to 35 in v18)")
        print(self.settings)

        # figure out the cancer types:
        self.results = {}

示例#4

0

显示文件

文件： anova_report.py 项目： Mazda35/gdsctools

    def __init__(self,
                 gdsc,
                 results=None,
                 sep="\t",
                 drug_decode=None,
                 verbose=True):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to report
        :param results: the results returned by :meth:`ANOVA.anova_all`. If
            not provided, the ANOVA is run on the fly.

        """
        self.verbose = verbose
        self.figtools = Savefig(verbose=False)
        self.gdsc = gdsc

        if results is None:
            results = gdsc.anova_all()
        self.df = ANOVAResults(results).df  # this does a copy and sanity check

        # Make sure the DRUG are integers
        self.df.DRUG_ID = self.df.DRUG_ID.astype(int)

        self.settings = ANOVASettings()
        for k, v in gdsc.settings.items():
            self.settings[k] = v

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # maybe there was not drug_decode in the gdsc parameter,
        # so a user may have provide a file, in which case, we need
        # to update the content of the dur_decoder.
        if len(gdsc.drug_decode) == 0 and drug_decode is None:
            warnings.warn("No drug name or target will be populated."
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        elif drug_decode is not None:
            # Read a file
            self.drug_decode = DrugDecode(drug_decode)
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)
        #if sum(self.df == np.inf).sum()>0:
        #    print("WARNING: infinite values were found in your results... Set to zero")
        try:
            self.df = self.df.replace({np.inf: 0, -np.inf: 0})
        except:
            pass

        # create some data
        self._set_sensible_df()

        self.company = None

示例#5

0

显示文件

文件： volcano.py 项目： howard-lightfoot/gdsctools

    def __init__(self, data, sep="\t", settings=None):
        """.. rubric:: Constructor

        :param data: an :class:`~gdsctools.anova.ANOVAResults` instance
            or a dataframe with the proper columns names (see below)
        :param settings: an instance of
            :class:`~gdsctools.settings.ANOVASettings`

        Expected column names to be found if a filename is provided::

            ANOVA_FEATURE_pval
            ANOVA_FEATURE_FDR
            FEATURE_delta_MEAN_IC50
            FEATURE_IC50_effect_size
            N_FEATURE_pos
            N_FEATURE_pos
            FEATURE
            DRUG_ID

        If the plotting is too slow, you can use the :meth:`selector` to prune
        the results (most of the data are noise and overlap on the middle
        bottom  area of the plot with little information.

        """
        # a copy since we do may change the data
        try:
            # an ANOVAResults contains a df attribute
            self.df = data.df.copy()
        except:
            # probably a dataframe
            self.df = data.copy()

        # this is redundant could reuse the input ??
        if settings is None:
            from gdsctools.settings import ANOVASettings
            self.settings = ANOVASettings()
        else:
            self.settings = AttrDict(**settings)

        self.figtools = Savefig()
        self.figtools.directory = self.settings.directory

        self.drugs = set(self.df[self._colname_drugid])
        self.features = set(self.df[self._colname_feature])

        # intensive calls made once for all
        self.groups_by_drugs = self.df.groupby(self._colname_drugid).groups
        self.groups_by_features = self.df.groupby(self._colname_feature).groups

示例#6

0

显示文件

文件： anova_report.py 项目： howard-lightfoot/gdsctools

    def __init__(self, gdsc, results, sep="\t", drug_decode=None):
        """.. rubric:: Constructor

        :param gdsc: the instance with which you created the results to report
        :param results: the results returned by :meth:`ANOVA.anova_all`

        """
        self.figtools = Savefig()
        self.gdsc = gdsc
        self.df = ANOVAResults(results).df  # this does a copy and sanity check

        self.settings = ANOVASettings()
        for k, v in gdsc.settings.items():
            self.settings[k] = v

        self._colname_drug_id = 'DRUG_ID'
        self.varname_pval = 'ANOVA_FEATURE_pval'
        self.varname_qval = 'ANOVA_FEATURE_FDR'

        # maybe there was not drug_decode in the gdsc parameter,
        # so a user may have provide a file, in which case, we need
        # to update the content of the dur_decoder.
        if len(gdsc.drug_decode) == 0 and drug_decode is None:
            warnings.warn("No drug name or target will be populated."
                          "You may want to provide a DRUG_DECODE file.")
            self.drug_decode = DrugDecode()
        elif drug_decode is not None:
            # Read a file
            self.drug_decode = DrugDecode(drug_decode)
        else:
            # Copy from gdsc instance
            self.drug_decode = DrugDecode(gdsc.drug_decode)
        self.df = self.drug_decode.drug_annotations(self.df)

        # create some data
        self._set_sensible_df()

        # just to create the directory
        ReportMAIN(directory=self.settings.directory)

示例#7

0

显示文件

文件： gdsc.py 项目： howard-lightfoot/gdsctools

    def _analyse_all(self):
        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print('================================ Analysing %s data' % tcga)

            self.mkdir('ALL' + os.sep + tcga)

            # Computes the ANOVA
            an = ANOVA(self.ic50_filename, gf_filename, self.drug_decode)
            self.an = an
            an.settings = ANOVASettings(**self.settings)
            an.init()  # reset the analysis_type automatically
            results = an.anova_all()

            # Store the results
            self.results[tcga] = results

            print('Analysing %s data and creating images' % tcga)
            self.report = ANOVAReport(an, self.results[tcga])
            self.report.settings.savefig = True
            self.report.settings.directory = 'ALL/' + tcga
            self.report.settings.analysis_type = tcga
            self.report.create_html_pages()

示例#8

0

显示文件

文件： models.py 项目： howard-lightfoot/gdsctools

    def __init__(self, ic50, genomic_features=None,
            drug_decode=None, verbose=True, low_memory=True,
            set_media_factor=False):
        """.. rubric:: Constructor

        :param DataFrame IC50: a dataframe with the IC50. Rows should be
            the COSMIC identifiers and columns should be the Drug names
            (or identifiers)
        :param features: another dataframe with rows as in the IC50 matrix
            and columns as features.  The first 3 columns must be named
            specifically to hold tissues, MSI (see format).
        :param drug_decode: a 3 column CSV file with drug's name and targets
            see :mod:`readers` for more information.
        :param verbose: verbosity in "WARNING", "ERROR", "DEBUG", "INFO"

        The attribute :attr:`settings` contains specific settings related
        to the analysis or visulation.
        """
        self.verbose = verbose
        self._init_called = False

        # We first need to read the IC50 using a dedicated reader
        self.ic50 = readers.IC50(ic50)

        # Create a dictionary version of the data
        # to be accessed per drug where NA have already been
        # removed. Each drug is a dictionary with 2 keys:
        # Y for the data and indices for the cosmicID where
        # there is an IC50 measured.
        ic50_parse = self.ic50.df.copy().unstack().dropna()
        self.ic50_dict = dict([(d, {'indices': ic50_parse.ix[d].index,
            'Y':ic50_parse.ix[d].values}) for d in self.ic50.drugIds])

        # Reads features if provided, otherwise use a default data set
        if genomic_features is None:
            # Reads default version provided with the package
            self.features = readers.GenomicFeatures()
        else:
            self.features = readers.GenomicFeatures(genomic_features)

        if self.features.found_media is False and \
                set_media_factor is True:
            if self.verbose:
                print('Populating MEDIA Factor in the Genomic Feature matrix')
            self.features.fill_media_factor()


        #: a CSV with 3 columns used in the report
        self.read_drug_decode(drug_decode)

        # create the multiple testing factory used in anova_all()
        self.multiple_testing = MultipleTesting()

        # We prune the genomic features by settings the cosmic ids of
        # the features to be those of the cosmic ids of the IC50. See
        # readers module. This affectation, prune the features dataframe
        # automatically. This fails if a cosmic identifier is not
        # found in the features' cosmic ids, so let us catch the error
        # before hand to give a
        unknowns = set(self.ic50.cosmicIds).difference(
                set(self.features.cosmicIds))

        if len(unknowns) > 0:
            print("WARNING: " +
                "%s cosmic identifiers in your IC50 " % len(unknowns) +
                "could not be found in the genomic feature matrix. "+
                "They will be dropped. Consider using a user-defined " +
                "genomic features matrix")

        self.ic50.drop_cosmic(list(unknowns))
        self.features.cosmicIds = self.ic50.cosmicIds
        #self.cosmicIds = self.ic50.cosmicIds

        #: an instance of :class:`~gdsctools.settings.ANOVASettings`
        self.settings = ANOVASettings()
        self.settings.low_memory = low_memory

        # alias to all column names to store results
        # cast to list (Python3).
        self.column_names = list(ANOVAResults().mapping.keys())

        # skip assoc_id for now
        self._odof_dict = dict([(name, None)
            for name in self.column_names])

        # a cache to store ANOVA results for each drug
        self.individual_anova = {}

        # must be called if ic50 or features are changed.
        self.init()

示例#9

0

显示文件

文件： models.py 项目： howard-lightfoot/gdsctools

class BaseModels(object): 
    """A Base class for ANOVA / ElaticNet models


    """
    def __init__(self, ic50, genomic_features=None,
            drug_decode=None, verbose=True, low_memory=True,
            set_media_factor=False):
        """.. rubric:: Constructor

        :param DataFrame IC50: a dataframe with the IC50. Rows should be
            the COSMIC identifiers and columns should be the Drug names
            (or identifiers)
        :param features: another dataframe with rows as in the IC50 matrix
            and columns as features.  The first 3 columns must be named
            specifically to hold tissues, MSI (see format).
        :param drug_decode: a 3 column CSV file with drug's name and targets
            see :mod:`readers` for more information.
        :param verbose: verbosity in "WARNING", "ERROR", "DEBUG", "INFO"

        The attribute :attr:`settings` contains specific settings related
        to the analysis or visulation.
        """
        self.verbose = verbose
        self._init_called = False

        # We first need to read the IC50 using a dedicated reader
        self.ic50 = readers.IC50(ic50)

        # Create a dictionary version of the data
        # to be accessed per drug where NA have already been
        # removed. Each drug is a dictionary with 2 keys:
        # Y for the data and indices for the cosmicID where
        # there is an IC50 measured.
        ic50_parse = self.ic50.df.copy().unstack().dropna()
        self.ic50_dict = dict([(d, {'indices': ic50_parse.ix[d].index,
            'Y':ic50_parse.ix[d].values}) for d in self.ic50.drugIds])

        # Reads features if provided, otherwise use a default data set
        if genomic_features is None:
            # Reads default version provided with the package
            self.features = readers.GenomicFeatures()
        else:
            self.features = readers.GenomicFeatures(genomic_features)

        if self.features.found_media is False and \
                set_media_factor is True:
            if self.verbose:
                print('Populating MEDIA Factor in the Genomic Feature matrix')
            self.features.fill_media_factor()


        #: a CSV with 3 columns used in the report
        self.read_drug_decode(drug_decode)

        # create the multiple testing factory used in anova_all()
        self.multiple_testing = MultipleTesting()

        # We prune the genomic features by settings the cosmic ids of
        # the features to be those of the cosmic ids of the IC50. See
        # readers module. This affectation, prune the features dataframe
        # automatically. This fails if a cosmic identifier is not
        # found in the features' cosmic ids, so let us catch the error
        # before hand to give a
        unknowns = set(self.ic50.cosmicIds).difference(
                set(self.features.cosmicIds))

        if len(unknowns) > 0:
            print("WARNING: " +
                "%s cosmic identifiers in your IC50 " % len(unknowns) +
                "could not be found in the genomic feature matrix. "+
                "They will be dropped. Consider using a user-defined " +
                "genomic features matrix")

        self.ic50.drop_cosmic(list(unknowns))
        self.features.cosmicIds = self.ic50.cosmicIds
        #self.cosmicIds = self.ic50.cosmicIds

        #: an instance of :class:`~gdsctools.settings.ANOVASettings`
        self.settings = ANOVASettings()
        self.settings.low_memory = low_memory

        # alias to all column names to store results
        # cast to list (Python3).
        self.column_names = list(ANOVAResults().mapping.keys())

        # skip assoc_id for now
        self._odof_dict = dict([(name, None)
            for name in self.column_names])

        # a cache to store ANOVA results for each drug
        self.individual_anova = {}

        # must be called if ic50 or features are changed.
        self.init()

    def _autoset_msi_factor(self):
        if self.features.found_msi:
            # if the number of pos. (or neg.) factors is not large enough then
            # the MSI factor is not used
            msi_name = self.features.colnames.msi
            self.msi_factor = self.features.df[msi_name]
            total = len(self.msi_factor)
            positives = self.msi_factor.sum()
            negatives = total - positives

            # we must have at least 2 positives or 2 negative
            # This is therefore a < comparison here below. See in
            # _get_one_drug_one_feature_data that we use >= which
            # is consistent.
            if positives < self.settings.MSI_factor_threshold:
                self.settings.include_MSI_factor = False
            if negatives < self.settings.MSI_factor_threshold:
                self.settings.include_MSI_factor = False
        else:
            self.settings.include_MSI_factor = False
            self.settings.analysis_type = 'feature_only'

    def _autoset_tissue_factor(self):
        # select tissue based on the features
        tissue_name = self.features.colnames.tissue
        self.tissue_factor = self.features.df[tissue_name]
        if len(self.tissue_factor.unique()) == 1:
            # there is only one tissue
            tissue = self.tissue_factor.unique()[0]
            self.settings.analysis_type = tissue
            self.settings.directory = tissue
        else:
            # this is a PANCAN analysis
            self.settings.analysis_type = 'PANCAN'

    def _autoset_media_factor(self):

        if self.settings.analysis_type != 'PANCAN':
            self.settings.include_media_factor = False
        elif self.features.found_media is True:
            self.settings.include_media_factor = True
            colname = self.features.colnames.media
            self.media_factor = self.features.df[colname]
        else:
            self.settings.include_media_factor = False

    def set_cancer_type(self, ctype=None):
        """Select only a set of tissues.

        Input IC50 may be PANCAN (several cancer tissues).
        This  function can be used to select a subset of tissues.
        This function changes the :attr:`ic50` dataframe and possibly
        the feature as well if some are not relevant anymore (sum of the
        column is zero for instance).

        """
        if ctype is None:
            return

        if ctype == 'PANCAN':
            # Nothing to do, keep everything
            return

        if isinstance(ctype, str):
            ctype = [str(ctype)]

        for this in ctype:
            assert this in self.features.tissues, "%s not found" % ctype

        # keep only features that correspond to the tissue
        self.features.keep_tissue_in(ctype)

        self.ic50.df = self.ic50.df.ix[self.features.df.index]
        self.init()

    def read_settings(self, settings):
        """Read settings and update cancer type if set"""
        self.settings.from_json(settings)
        self.set_cancer_type(self.settings.analysis_type)

    def init(self):
        # Some preprocessing to speed up data access
        ic50_parse = self.ic50.df.copy().unstack().dropna()
        # for each drug, we store the IC50s (Y) and corresponding indices
        # of cosmic identifiers
        self.ic50_dict = dict([
            (d, {'indices': ic50_parse.ix[d].index,
             'Y': ic50_parse.ix[d].values}) for d in self.ic50.drugIds])

        # save the tissues
        self._autoset_tissue_factor()

        # and MSI (Microsatellite instability) status of the samples.
        self._autoset_msi_factor()

        # and (growth) media factor
        self._autoset_media_factor()

        # dictionaries to speed up code.
        self.features_dict = {}
        self.msi_dict = {}
        self.tissue_dict = {}
        self.media_dict = {}
        # fill the dictionaries for each drug once for all
        for drug_name in self.ic50.drugIds:
            indices = self.ic50_dict[drug_name]['indices']
            # if we were to store all drugs /features, this takes
            # 1Gb of memory for 265 drugs and 680 features. This is
            # therefore not scalable, especially for multiprocessing.
            if self.settings.low_memory is True:
                pass
            else:
                self.features_dict[drug_name] = self.features.df.ix[indices]

            # MSI, media and tissue are not large data files and can be store
            if self.features.found_msi:
                self.msi_dict[drug_name] = self.msi_factor.ix[indices]
            if self.features.found_media:
                self.media_dict[drug_name] = self.media_factor.ix[indices]

            self.tissue_dict[drug_name] = self.tissue_factor.ix[indices]

        # some preprocessing for the OLS computation.
        # We create the dummies for the tissue factor once for all
        # Note that to agree with R convention, we have to resort the column
        # to agree with R convention that is a<B==b<c instead of
        # where A<B<C<a<b<c (in R)
        self._tissue_dummies = pd.get_dummies(self.tissue_factor)
        columns = self._tissue_dummies.columns
        columns = sorted(columns, key=lambda s: s.lower())
        columns = ['C(tissue)[T.' + x + ']' for x in columns]
        self._tissue_dummies.columns = columns

        if self.settings.include_media_factor:
            self._media_dummies = pd.get_dummies(self.media_factor)
            columns = self._media_dummies.columns
            columns = ['C(media)[T.' + x + ']' for x in columns]
            self._media_dummies.columns = columns
            for col in columns:
                self._tissue_dummies[col] = self._media_dummies[col]

        N = len(self._tissue_dummies)
        self._tissue_dummies['C(msi)[T.1]'] = [1]*N
        self._tissue_dummies['feature'] = [1] * N
        self._tissue_dummies.insert(0, 'Intercept', [1] * N)

        # drop first feature in the tissues that seems to be used as a
        # reference in the regression
        tissues = [x for x in self._tissue_dummies.columns if 'tissue' in x]
        self._tissue_dummies.drop(tissues[0], axis=1, inplace=True)

        if self.settings.include_media_factor:
            media = [x for x in self._tissue_dummies.columns if 'media' in x]
            self._tissue_dummies.drop(media[0], axis=1, inplace=True)

        # reset the buffer.
        self.individual_anova = {}


        if self.verbose and self._init_called is False:
            for this in ['tissue', 'media', 'msi', 'feature']:
                if this in self._get_analysis_mode():
                    print(this.upper() + " FACTOR : included")
                else:
                    print(this.upper() + " FACTOR : NOT included")
        self._init_called = True

    def _get_cosmics(self):
        return self.ic50.cosmicIds
    def _set_cosmics(self, cosmics):
        self.ic50.cosmicIds = cosmics
        self.features.cosmicIds = cosmics
        self.init()
        self.individual_anova = {}
    cosmicIds = property(_get_cosmics, _set_cosmics,
        doc="get/set the cosmic identifiers in the IC50 and feature matrices")

    def _get_drug_names(self):
        return self.ic50.drugIds
    def _set_drug_names(self, drugs):
        self.ic50.drugIds = drugs
        self.init()
        # not need to init this again ? self.individual_anova = {}
    drugIds = property(_get_drug_names, _set_drug_names,
            doc="Get/Set drug identifers")

    def _get_feature_names(self):
        shift = self.features.shift
        return self.features.features[shift:]
    def _set_features_names(self, features):
        self.features.features = features
        self.init()
        self.individual_anova = {}
    feature_names = property(_get_feature_names, _set_features_names,
            doc="Get/Set feature names")

    def _get_analysis_mode(self):
        modes = []
        if self.settings.analysis_type == 'PANCAN':
            modes.append('tissue')

        if self.settings.include_MSI_factor is True:
            modes.append('msi')

        if self.settings.include_media_factor is True:
            modes.append('media')

        modes.append('feature')
        return modes

    def diagnostics(self):
        """Return dataframe with information about the analysis

        """
        n_drugs = len(self.ic50.drugIds)
        n_features = len(self.features.features) - self.features.shift
        n_combos = n_drugs * n_features
        feasible = 0
        pb = Progress(n_drugs, 1)
        counter = 0
        for drug in self.ic50.drugIds:
            for feature in self.features.features[self.features.shift:]:
                dd = self._get_one_drug_one_feature_data(drug, feature,
                        diagnostic_only=True)
                if dd.status is True:
                    feasible += 1
            counter += 1
            pb.animate(counter)

        results = {
                'n_drug': n_drugs,
                'n_combos': n_combos,
                'feasible_tests': feasible,
                'percentage_feasible_tests': float(feasible)/n_combos*100}
        return results


    def read_drug_decode(self, filename=None):
        """Read file with the DRUG information

        .. seealso:: :class:`gdsctools.readers.DrugDecode`
        """
        # Read the DRUG decoder file into a DrugDecode/Reader instance
        self.drug_decode = readers.DrugDecode(filename)

    def __str__(self):
        txt = self.ic50.__str__()
        txt += "\n" + self.features.__str__()
        return txt

    def __repr__(self):
        txt = self.__str__()
        return txt

示例#10

0

显示文件

文件： elastic_net.py 项目： howard-lightfoot/gdsctools

    def anova_all(self, animate=True, drugs=None):
        """Run all ANOVA tests for all drugs and all features.

        :param drugs: you may select a subset of drugs
        :param animate: shows the progress bar
        :return: an :class:`~gdsctools.anova_results.ANOVAResults`
            instance with the dataframe
            stored in an attribute called **df**

        Loops over all drugs calling :meth:`anova_one_drug` for each
        drug and concatenating all results together. Note that once all
        data are gathered, an extra column containing the FDR corrections
        is added to the dataframe using :meth:`add_pvalues_correction`
        method. An extra column  named "ASSOC_ID" is also added with
        a unique identifer sorted by ascending FDR.

        .. note:: A thorough comparison with version v17 give the same FDR
            results (difference ~1e-6); Note however that the qvalue results
            differ by about 0.3% due to different smoothing in R and Python.
        """
        # drop DRUG where number of IC50 (non-null) is below 5
        # axis=0 is default but we emphasize that sum is over
        # column (i.e. drug
        vv = (self.ic50.df.isnull() == False).sum(axis=0)
        # FIXME: should be in one_drug_one_feature ??
        drug_names = vv.index[vv >= self.settings.minimum_nonna_ic50]

        # if user provided a list of drugs, use them:
        if drugs is not None:
            # todo: check valifity of the drug names
            drug_names = drugs[:]

        pb = Progress(len(drug_names), 1)
        drug_names = list(drug_names)
        pylab.shuffle(drug_names)
        if animate is True:
            pb.animate(0)

        for i, drug_name in enumerate(drug_names):
            if drug_name in self.individual_anova.keys():
                pass
            else:
                res = self.anova_one_drug(drug_name,
                                          animate=False,
                                          output='dataframe')
                self.individual_anova[drug_name] = res
            if animate is True:
                pb.animate(i + 1)
        print("\n")
        if len(self.individual_anova) == 0:
            return ANOVAResults()

        df = pd.concat(self.individual_anova, ignore_index=True)

        if len(df) == 0:
            return df
        # sort all data by ANOVA p-values
        try:
            df.sort_values('ANOVA_FEATURE_pval', inplace=True)
        except:
            df.sort('ANOVA_FEATURE_pval', inplace=True)

        # all ANOVA have been computed individually for each drug and each
        # feature. Now, we need to compute the multiple testing corrections
        if self.settings.pvalue_correction_level == 'global':
            df = self.add_pvalues_correction(df)

        # insert a unique identifier as first column
        df.insert(0, 'ASSOC_ID', range(1, len(df) + 1))

        self.df = df
        # order the column names as defined in the __init__ method
        df = df[self.column_names]
        df.reset_index(inplace=True, drop=True)

        results = ANOVAResults()
        results.df = df
        results.settings = ANOVASettings(**self.settings)
        return results

示例#11

0

显示文件

文件： anova.py 项目： Donnyvdm/gdsctools

    def anova_one_drug(self, drug_id, animate=True, output='object'):
        """Computes ANOVA for a given drug across all features

        :param str drug_id: a valid drug identifier.
        :param animate: shows the progress bar
        :return: a dataframe

        Calls :meth:`anova_one_drug_one_feature` for each feature.
        """
        # drop first and second columns that are made of strings
        # works under python2 but not python 3. Assume that the 2 first
        #columns are the sample name and tissue feature
        # Then, we keep only cases with at least 3 features.
        # MSI could be used but is not like in original R code.
        features = self.features.df.copy()
        # need to skip the FACTOR to keep only features
        shift = self.features.shift

        features = features[features.columns[shift:]]
        # FIXME what about features with less than 3 zeros ?
        mask = features.sum(axis=0) >= 3

        # TODO: MSI, tissues, name must always be kept
        #
        selected_features = features[features.columns[mask]]

        # scan all features for a given drug
        assert drug_id in self.ic50.df.columns
        N = len(selected_features.columns)
        pb = Progress(N, 10)
        res = {}
        #
        for i, feature in enumerate(selected_features.columns):
            # production True, means we do not want to create a DataFrame
            # for each call to the anova_one_drug_one_feature function
            # Instead, we require dictionaries
            this = self.anova_one_drug_one_feature(drug_id,
                                                   feature,
                                                   production=True)
            if this['ANOVA_FEATURE_pval'] is not None:
                res[feature] = this
            if animate is True:
                pb.animate(i + 1)

        # if production is False:
        # df = pid.concat(res, ignore_index=True)
        df = pd.DataFrame.from_records(res)
        df = df.T

        df = ANOVAResults().astype(df)
        if len(df) == 0:
            return df

        # append DRUG_NAME/DRUG_TARGET columns
        df = self.drug_decode.drug_annotations(df)

        # TODO: drop rows where ANOVA_FEATURE_PVAL is None
        if output != 'object':
            df = self.add_pvalues_correction(df)
            return df
        else:
            df = self.add_pvalues_correction(df)
            res = ANOVAResults(df, self.settings)
            res.settings = ANOVASettings(**self.settings)
            return res

示例#12

0

显示文件

    def create_data_packages_for_companies(self, companies=None):
        """Creates a data package for each company found in the DrugDecode file
        """
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################

        # companies must be just one name (one string) or a list of strings
        # By default, takes all companies found in DrugDecode
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        if len(companies) == 0:
            raise ValueError(
                "Could not find any companies in the DrugDecode file")

        # The main directory
        self.mkdir(self.company_directory)

        # Loop over all companies, retrieving information built
        # in analyse() method, selecting for each TCGA all information
        # for that company only (and public drugs)
        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print(
                purple("\n=========== Analysing company %s out of %s (%s)" %
                       (ii + 1, Ncomp, company)))
            self.mkdir(self.company_directory + os.sep + company)

            # Handle each TCGA case separately
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print(brown("  ------- building TCGA %s sub directory" % tcga))

                # Read the results previously computed either
                try:
                    results_df = self.results[tcga].df.copy()
                except:
                    results_path = "%s/%s/OUTPUT/results.csv" % (
                        self.main_directory, tcga)
                    results_df = ANOVAResults(results_path)

                # MAke sure the results are formatted correctly
                results = ANOVAResults(results_df)

                # Get the DrugDecode information for that company only
                drug_decode_company = self.drug_decode.df.query(
                    "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)

                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)

                # Filter the results to keep only public drugs and that
                # company. Make sure this is integers
                results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)

                mask = [
                    True if x in drug_decode_company.df.index else False
                    for x in results.df.DRUG_ID
                ]

                results.df = results.df.ix[mask]

                # We read the IC50 again
                try:
                    self.ic50 = IC50(self.ic50_filename)
                except:
                    self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

                # And create an ANOVA instance. This is not to do the analyse
                # again but to hold various information
                an = ANOVA(self.ic50,
                           gf_filename,
                           drug_decode_company,
                           verbose=False)

                def drug_to_keep(drug):
                    to_keep = drug in drug_decode_company.df.index
                    return to_keep

                an.ic50.df = an.ic50.df.select(drug_to_keep, axis=1)

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = self.company_directory + os.sep + company + os.sep + tcga
                an.settings.analysis_type = tcga

                # Now we create the report
                self.report = ANOVAReport(an,
                                          results,
                                          drug_decode=drug_decode_company,
                                          verbose=self.verbose)
                self.report.company = company
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)
                self.report.create_html_features()
                self.report.create_html_drugs()
                self.report.create_html_associations()

示例#13

0

显示文件

文件： gdsc.py 项目： howard-lightfoot/gdsctools

    def create_data_packages_for_companies(self, companies=None):
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print("\n\n========= Analysing company %s out of %s (%s)" %
                  (ii + 1, Ncomp, company))
            self.mkdir(company)
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print("---------------- for TCGA %s" % tcga)

                # Read the results previously computed
                try:
                    results_df = self.results[tcga].df.copy()
                except:
                    results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                    print("Downloading results from %s" % results_path)
                    results_df = ANOVAResults(results_path)

                results = ANOVAResults(results_df)

                # Get a DrugDecode for that company
                drug_decode_company = self.drug_decode.df.query(
                    "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)

                # filter results using the new drug decode
                drug_ids_in_results = get_drug_id(results.df.DRUG_ID)

                mask = [
                    True if x in drug_decode_company.df.index else False
                    for x in drug_ids_in_results
                ]

                results.df = results.df.ix[mask]

                # Just to create an instance with the subset of drug_decode
                # and correct settings. This is also used to store
                # the entire input data set. So, we must remove all drugs
                # not relevant for the analysis of this company
                an = ANOVA(self.ic50_filename, gf_filename,
                           drug_decode_company)

                def drug_to_keep(drug):
                    to_keep = get_drug_id(drug) in drug_decode_company.df.index
                    return to_keep

                an.ic50.df = an.ic50.df.select(drug_to_keep, axis=1)

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = company + os.sep + tcga
                an.settings.analysis_type = tcga
                self.report = ANOVAReport(an, results)
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)

                if self.debug is False:
                    self.report.create_html_features()
                    self.report.create_html_associations()

                    # For now, we just copy all DRUG images from
                    # the analysis made in ALL
                    from easydev import shellcmd, Progress
                    print("\nCopying drug files")
                    drug_ids = results.df.DRUG_ID.unique()
                    pb = Progress(len(drug_ids))
                    for i, drug_id in enumerate(drug_ids):
                        # copy the HTML
                        filename = "%s.html" % drug_id
                        source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                        dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep)
                        cmd = "cp %s%s %s" % (source, filename, dest)
                        shellcmd(cmd, verbose=False)
                        #copy the images
                        filename = "volcano_%s.*" % drug_id
                        source = "ALL%s%s%simages%s" % (os.sep, tcga, os.sep,
                                                        os.sep)
                        dest = "%s%s%s%simages%s" % (company, os.sep, tcga,
                                                     os.sep, os.sep)
                        cmd = "cp %s%s %s" % (source, filename, dest)
                        shellcmd(cmd, verbose=False)
                        pb.animate(i + 1)