Exemplo n.º 1
0
    def __init__(
        self,
        data=None,
        row_method="complete",
        column_method="complete",
        row_metric="euclidean",
        column_metric="euclidean",
        cmap="yellow_black_blue",
        col_side_colors=None,
        row_side_colors=None,
        verbose=True,
    ):
        """.. rubric:: constructor

        :param data: a dataframe or possibly a numpy matrix. A copy (obtained
            with ``data.copy()``) is stored; if None, nothing is stored and
            the `df` attribute has to be filled manually.
        :param row_method: linkage method for the rows (complete by default)
        :param column_method: linkage method for the columns (complete by
            default). See linkage module for details
        :param row_metric: metric for the rows (euclidean by default)
        :param column_metric: metric for the columns (euclidean by default)
        :param cmap: colormap. any matplotlib accepted or combo of colors as
            defined in colormap package (pypi)
        :param col_side_colors: replacement for the default list of column
            side colors (ignored if empty or None)
        :param row_side_colors: replacement for the default list of row
            side colors (ignored if empty or None)
        :param verbose: if exactly True, print a reminder when no data is
            provided

        :raises AttributeError: if `data` has no ``copy`` method
        """
        if data is None:
            # Nothing to store yet; only remind the user when asked to.
            if verbose is True:
                print(
                    "No data provided, please fill the `df` attribute manually"
                )
        else:
            # Work on a copy since the data may be reshuffled later on.
            try:
                self._df = data.copy()
            except AttributeError as err:
                print("input must be a pandas data frame or numpy matrix")
                raise err

        # linkage settings, one pair per axis
        self._row_method, self._column_method = row_method, column_method
        self._column_metric, self._row_metric = column_metric, row_metric

        # some default parameters
        default_palette = ["r", "g", "b", "y", "w", "k", "m"]
        self.cluster_criterion = "distance"
        self.params = easydev.AttrDict()
        # user-provided colors win over the defaults; each axis gets its own
        # list object when falling back on the default palette
        self.params.col_side_colors = (
            col_side_colors if col_side_colors else list(default_palette)
        )
        self.params.row_side_colors = (
            row_side_colors if row_side_colors else list(default_palette)
        )
        self.params.cmap = cmap

        self.category_row = {}
        self.category_column = {}
Exemplo n.º 2
0
    def __init__(self,
                 data=None,
                 row_method='complete',
                 column_method='complete',
                 row_metric='euclidean',
                 column_metric='euclidean',
                 cmap='yellow_black_blue',
                 col_side_colors=None,
                 row_side_colors=None,
                 verbose=True):
        """.. rubric:: constructor

        :param data: a dataframe or possibly a numpy matrix. A copy (obtained
            with ``data.copy()``) is stored; if None, nothing is stored and
            the `df` attribute has to be filled manually.
        :param row_method: linkage method for the rows (complete by default)
        :param column_method: linkage method for the columns (complete by
            default)
        :param row_metric: metric for the rows (euclidean by default)
        :param column_metric: metric for the columns (euclidean by default)
        :param cmap: name of the colormap, stored in ``params.cmap``
        :param col_side_colors: replacement for the default list of column
            side colors (ignored if empty or None)
        :param row_side_colors: replacement for the default list of row
            side colors (ignored if empty or None)
        :param verbose: if exactly True, print a reminder when no data is
            provided

        :raises AttributeError: if `data` is not None and has no ``copy``
            method

        .. todo:: if row_method id none, no ordering in the dendogram

        """
        if data is None:
            # No data is a valid state whatever the verbosity. Previously,
            # data=None with verbose=False fell through to ``None.copy()``
            # and raised a misleading AttributeError.
            if verbose is True:
                print(
                    "No data provided, please fill the `df` attribute manually"
                )
        else:
            # should be a copy since it may be reshuffled
            try:
                self._df = data.copy()
            except AttributeError as err:
                print("input must be a pandas data frame or numpy matrix")
                raise err

        self._row_method = row_method
        self._column_method = column_method

        self._column_metric = column_metric
        self._row_metric = row_metric

        # some default parameters
        self.cluster_criterion = 'distance'
        self.params = easydev.AttrDict()
        self.params.col_side_colors = ['r', 'g', 'b', 'y', 'w', 'k', 'm']
        self.params.row_side_colors = ['r', 'g', 'b', 'y', 'w', 'k', 'm']
        self.params.cmap = cmap

        self.category_row = None
        self.category_column = None

        # user-provided colors replace the defaults
        if col_side_colors:
            self.params.col_side_colors = col_side_colors
        if row_side_colors:
            self.params.row_side_colors = row_side_colors
Exemplo n.º 3
0
def _build_testing():
    """Build the collection of test data sets.

    Each entry is a ``Data`` instance whose ``filename`` is obtained by
    passing the file's base name to ``_gsf`` and whose ``description`` is a
    short text describing the file.

    :return: an ``easydev.AttrDict`` with one attribute per test data set
    """
    # (attribute name, file base name, description), in registration order
    entries = (
        ('drug_test_tsv',
         'test_drug_decode.tsv',
         'drug_decode in TSV format'),
        ('drug_test_csv',
         'test_drug_decode.csv',
         'drug_decode in CSV format'),
        ('ic50_test_csv',
         'test_ic50_11_50.csv',
         'A 10drug/50 cell lines IC50 test file in CSV format'),
        ('genomic_features_csv',
         'test_genomic_features.csv',
         'A 50 cell lines by 20 features GenomicFeature in CSV format'),
        ('ic50_test',
         'test_IC50.csv',
         'A 10drug/1000 cell lines IC50 test file in CSV format'),
        ('ic50_test_header_no_drug_prefix',
         'test_IC50_header2.csv',
         'An IC50 test (header with column without Drug_ prefix)'),
        ('ic50_test_header_drug_prefix_only',
         'test_IC50_header1.csv',
         'An IC50 test (header with column with Drug_ prefix only)'),
        ('ic50_test_header_mixed_drug_prefix',
         'test_IC50_header3.csv',
         'An IC50 test (header with mixed prefixes i.e. Drug_ or not)'),
        ('genomic_features_bare_csv',
         'test_genomic_features_bare.csv',
         "A 50 cell lines by 17 features without MSI/tissue/sample"),
    )

    testing = easydev.AttrDict()
    for attribute, basename, description in entries:
        record = Data()
        record.filename = _gsf(basename)
        record.description = description
        setattr(testing, attribute, record)
    return testing
Exemplo n.º 4
0
    def __init__(self,
                 data=None,
                 method='complete',
                 metric='euclidean',
                 cmap='yellow_black_blue',
                 col_side_colors=None,
                 side_colors=None,
                 verbose=True,
                 horizontal=True):
        """.. rubric:: constructor

        :param data: a dataframe or possibly a numpy matrix. A copy (obtained
            with ``data.copy()``) is stored; if None, nothing is stored and
            the `df` attribute has to be filled manually.
        :param method: complete by default
        :param metric: euclidean by default
        :param cmap: colormap. any matplotlib accepted or combo of colors as
            defined in colormap package (pypi)
        :param col_side_colors: accepted but not used by this constructor
        :param side_colors: replacement for the default list of side colors
            (ignored if empty or None)
        :param verbose: if exactly True, print a reminder when no data is
            provided
        :param horizontal: stored as the :attr:`horizontal` attribute
            (True by default)

        :raises AttributeError: if `data` has no ``copy`` method

        """
        # should be a copy since it may be reshuffled
        try:
            if data is None and verbose is True:
                print(
                    "No data provided, please fill the `df` attribute manually"
                )
            elif data is None:
                pass
            else:
                self._df = data.copy()
        except AttributeError as err:
            print("input must be a pandas data frame or numpy matrix")
            raise err

        self._method = method
        self._metric = metric
        # Honour the caller's choice; the attribute used to be hard-coded to
        # True, silently ignoring horizontal=False.
        self.horizontal = horizontal

        # some default parameters
        self.cluster_criterion = 'distance'
        self.params = easydev.AttrDict()
        self.params.side_colors = ['r', 'g', 'b', 'y', 'w', 'k', 'm']
        self.params.cmap = cmap

        self.category = {}

        # user-provided colors replace the defaults
        if side_colors:
            self.params.side_colors = side_colors
Exemplo n.º 5
0
class GenomicFeatures(Reader, CosmicRows):
    """Read Matrix with Genomic Features

    These are the compulsory column names required (note the spaces):

        - 'COSMIC_ID'
        - 'TISSUE_FACTOR'
        - 'MSI_FACTOR'

    If one of the following column is found, it is removed (deprecated)::

        - 'SAMPLE_NAME'
        - 'Sample Name'
        - 'Sample_Name'
        - 'CELL_LINE'

    and features can be also encoded with the following convention:

        - columns ending in "_mut" to encode a gene mutation (e.g., BRAF_mut)
        - columns starting with "gain_cna"
        - columns starting with "loss_cna"

    Those columns will be removed:

        - starting with `Drug_`, which are supposedly from the IC50 matrix

    ::

        >>> from gdsctools import GenomicFeatures
        >>> gf = GenomicFeatures()
        >>> print(gf)
        Genomic features distribution
        Number of unique tissues 27
        Number of unique features 677 with
        - Mutation: 270
        - CNA (gain): 116
        - CNA (loss): 291

    .. versionchanged:: 0.9.10
        The header's columns' names have changed to be more consistent.
        Previous names are deprecated but still accepted.

    .. versionchanged:: 0.9.15
        If a tissue is empty, it is replaced by UNDEFINED.
        We also strip the spaces to make sure there is "THIS" and "THIS " are
        the same.

    """
    # Names of the special (non-feature) columns
    colnames = easydev.AttrDict()
    colnames.cosmic = 'COSMIC_ID'
    colnames.tissue = 'TISSUE_FACTOR'
    colnames.msi = 'MSI_FACTOR'
    colnames.media = 'MEDIA_FACTOR'

    def __init__(self, filename=None, empty_tissue_name="UNDEFINED"):
        """.. rubric:: Constructor

        If no file is provided, using the default file provided in the
        package that is made of 1001 cell lines times 680 features.

        :param filename: input passed to the parent constructor. If None,
            ``gdsctools.datasets.genomic_features`` is used.
        :param str empty_tissue_name: if a tissue name is let empty, replace
            it with this string.

        :raises ValueError: if no COSMIC identifier column/index is found
            (see :meth:`_interpret_cosmic`)

        """
        # first reset the filename to the shared data (if not provided)
        if filename is None:
            from gdsctools.datasets import genomic_features
            filename = genomic_features

        super(GenomicFeatures, self).__init__(filename)

        # FIXME Remove columns related to Drug if any. Can be removed in
        # the future
        self.df = self.df[[
            x for x in self.df.columns if not x.startswith('Drug_')
        ]]

        # Deprecated sample-name columns are dropped. A new frame is assigned
        # rather than dropping in place on a frame derived from a selection.
        deprecated = ['Sample Name', 'SAMPLE_NAME', 'Sample_Name', 'CELL_LINE']
        found = [x for x in deprecated if x in self.df.columns]
        if found:
            self.df = self.df.drop(found, axis=1)

        # Rename deprecated headers (e.g. "COSMIC ID" into "COSMIC_ID").
        # Only the exact column is renamed so that other columns that merely
        # contain the old name as a substring are left untouched.
        renaming = {
            'Tissue Factor Value': 'TISSUE_FACTOR',
            'MS-instability Factor Value': 'MSI_FACTOR',
            'COSMIC ID': 'COSMIC_ID'
        }
        for old, new in renaming.items():
            if old in self.df.columns:
                # The message is already formatted: no extra positional
                # argument must be given to the logging call.
                colorlog.warning(
                    "'%s' column name is deprecated " % old +
                    " since 0.9.10. Please replace with '%s'" % new)
                self.df.columns = [
                    new if x == old else x for x in self.df.columns
                ]

        # "CL" is accepted as an alias of the COSMIC column, but only if the
        # COSMIC column itself is absent. Exact match only: a substring
        # replacement would corrupt features such as "CLDN1_mut".
        if ("CL" in self.df.columns
                and self.colnames.cosmic not in self.df.columns):
            self.df.columns = [
                self.colnames.cosmic if x == "CL" else x
                for x in self.df.columns
            ]

        # There are 3 special columns to hold the factors
        self._special_names = []

        # If tissue factor is not provided, we create and fill it with dummies.
        # Otherwise, we need to change a lot in the original code in ANOVA
        if self.colnames.tissue not in self.df.columns:
            colorlog.warning(
                "column named '%s' not found" % self.colnames.tissue)
            self.df[self.colnames.tissue] = ['UNDEFINED'] * len(self.df)
        self._special_names.append(self.colnames.tissue)

        self.found_msi = self.colnames.msi in self.df.columns
        if self.found_msi is False:
            colorlog.warning("column named '%s' not found" % self.colnames.msi)
        else:
            self._special_names.append(self.colnames.msi)

        # The media column is optional: no warning if missing
        self.found_media = self.colnames.media in self.df.columns
        if self.found_media:
            self._special_names.append(self.colnames.media)

        # order columns and index
        self._order()

        # use the COSMIC identifiers as (integer, sorted) index
        self._interpret_cosmic()

        # validation provided outside of this class (not defined here)
        self.check()

        self._fix_empty_tissues(empty_tissue_name)

    def _fix_empty_tissues(self, name="UNDEFINED"):
        """Replace missing values of the tissue column by `name`

        Sometimes, tissues may be empty so a NaN is present. This leads to
        errors in ANOVA or Regression so we replace them.

        :param str name: replacement for the missing tissue names
        """
        tissue = self.colnames.tissue
        if self.df[tissue].isnull().sum() > 0:
            # NOTE(review): this is the only call using `logger` (the rest of
            # the class uses `colorlog`); confirm it is defined in the module.
            logger.warning(
                "Some tissues were empty strings and renamed as %s!" % name)
        # Assign back instead of a chained fillna(inplace=True), which may
        # act on a temporary object and leave the frame unchanged.
        self.df[tissue] = self.df[tissue].fillna(name)

    def _get_shift(self):
        # number of special (non-feature) columns
        return len(self._special_names)

    shift = property(_get_shift)

    def _interpret_cosmic(self):
        """Set the COSMIC identifiers as a sorted integer index

        :raises ValueError: if the COSMIC column is neither a column nor the
            name of the index
        """
        if self.colnames.cosmic in self.df.columns:
            self.df.set_index(self.colnames.cosmic, inplace=True)
        elif self.colnames.cosmic == self.df.index.name:
            pass
        else:
            error_msg = "the features input file must contains a column " +\
                " named %s" % self.colnames.cosmic
            raise ValueError(error_msg)
        # element-wise conversion first, then enforce an integer dtype
        self.df.index = [int(x) for x in self.df.index]
        self.df.index = self.df.index.astype(int)
        self.df.index.name = "COSMIC_ID"
        self.df.sort_index(inplace=True)

    def fill_media_factor(self):
        """Given the COSMIC identifiers, fills the MEDIA_FACTOR column

        If already populated, replaced by new content.

        """
        from gdsctools import COSMICInfo
        c = COSMICInfo()
        self.df[self.colnames.media] = [
            c.get(x).SCREEN_MEDIUM for x in self.df.index
        ]
        self.found_media = True
        if self.colnames.media not in self._special_names:
            self._special_names.append(self.colnames.media)
        self._order()

    def _order(self):
        # special columns first, then all other columns in their current order
        others = [x for x in self.df.columns if x not in self._special_names]
        self.df = self.df[self._special_names + others]

    def _get_features(self):
        return list(self.df.columns)

    def _set_features(self, features):
        # Restrict the dataframe to the given features; the special columns
        # are always kept. Names ending in FACTOR are discarded from the
        # request since special columns are added back explicitly.
        known = set(self.df.columns)
        for feature in features:
            if feature not in known:
                raise ValueError('Unknown feature name %s' % feature)
        features = [x for x in features if not x.endswith('FACTOR')]
        features = self._special_names + features
        self.df = self.df[features]
        self._order()

    features = property(_get_features,
                        _set_features,
                        doc="return list of features")

    def _get_tissues(self):
        return list(self.df[self.colnames.tissue])

    tissues = property(_get_tissues, doc='return list of tissues')

    def _get_unique_tissues(self):
        return list(self.df[self.colnames.tissue].unique())

    unique_tissues = property(_get_unique_tissues,
                              doc='return list of unique tissues')

    def plot(self):
        """Histogram of the tissues found

        Draws a pie chart (figure 1) and a horizontal bar chart (figure 2)
        of the number of rows per tissue.

        :return: the counts per tissue sorted in decreasing order, or None
            if there is no tissue column.

        .. plot::
            :include-source:
            :width: 80%

            from gdsctools import GenomicFeatures
            gf = GenomicFeatures() # use the default file
            gf.plot()


        """
        if self.colnames.tissue not in self.df.columns:
            return
        data = pd.get_dummies(self.df[self.colnames.tissue]).sum()
        data.index = [x.replace("_", " ") for x in data.index]
        try:
            # sort_values returns a new object: the result must be kept
            data = data.sort_values(ascending=False)
        except AttributeError:
            # old pandas versions without sort_values (in-place sort)
            data.sort(ascending=False)
        pylab.figure(1)
        pylab.clf()
        labels = list(data.index)
        pylab.pie(data, labels=labels)
        pylab.figure(2)
        data.plot(kind='barh')
        pylab.grid()
        pylab.xlabel('Occurences')

        # keep the try to prevent MacOS issue (best effort on purpose)
        try:
            pylab.tight_layout()
        except Exception:
            pass
        return data

    def __str__(self):
        txt = 'Genomic features distribution\n'
        # The tissue summary is built aside so that a failure does not leave
        # a half-written sentence in the final text.
        try:
            tissues = list(self.df[self.colnames.tissue].unique())
            Ntissue = len(tissues)
            tissue_txt = 'Number of unique tissues {0}'.format(Ntissue)
            if Ntissue == 1:
                tissue_txt += ' ({0})\n'.format(tissues[0])
            elif Ntissue < 10:
                tissue_txt += '\nHere are the tissues: '
                tissue_txt += ",".join(tissues) + "\n"
            else:
                tissue_txt += '\nHere are the first 10 tissues: '
                tissue_txt += ", ".join(tissues[0:10]) + "\n"
        except Exception:
            tissue_txt = 'No information about tissues\n'
        txt += tissue_txt

        if self.found_msi:
            txt += "MSI column: yes\n"
        else:
            txt += "MSI column: no\n"

        if self.found_media:
            txt += "MEDIA column: yes\n"
        else:
            txt += "MEDIA column: no\n"

        # the special columns (tissue, MSI, media) are not features
        Nfeatures = len(self.features)
        txt += '\nThere are {0} unique features distributed as\n'.format(
            Nfeatures - self.shift)

        n_mutations = len([x for x in self.df.columns if x.endswith("_mut")])
        txt += "- Mutation: {}\n".format(n_mutations)

        n_gain = len([x for x in self.df.columns if x.startswith("gain_cna")])
        txt += "- CNA (gain): {}\n".format(n_gain)
        n_loss = len([x for x in self.df.columns if x.startswith("loss_cna")])
        txt += "- CNA (loss): {}".format(n_loss)
        return txt

    def drop_tissue_in(self, tissues):
        """Drop tissues from the list

        :param list tissues: a list of tissues to drop. If you have only
            one tissue, can be provided as a string. Since rows are removed
            some features (columns) may now be empty (all zeros). If so, those
            columns are dropped (except for the special columns (e.g, MSI).

        """
        tissues = easydev.to_list(tissues)
        mask = ~self.df[self.colnames.tissue].isin(tissues)
        self.df = self.df[mask]
        self._cleanup()

    def keep_tissue_in(self, tissues):
        """Drop tissues not in the list

        :param list tissues: a list of tissues to keep. If you have only
            one tissue, can be provided as a string. Since rows are removed
            some features (columns) may now be empty (all zeros). If so, those
            columns are dropped (except for the special columns (e.g, MSI).

        """
        tissues = easydev.to_list(tissues)
        mask = self.df[self.colnames.tissue].isin(tissues)
        self.df = self.df[mask]
        self._cleanup()

    def _cleanup(self, required_features=0):
        """Drop feature columns whose sum is <= `required_features`

        The special columns are never dropped.
        """
        to_ignore = self._special_names
        # sub-frame ignoring the informative columns
        view = self.df[[x for x in self.df.columns if x not in to_ignore]]

        todrop = list(view.columns[view.sum() <= required_features])

        # self.df is usually a row selection here: assign a new frame rather
        # than dropping in place (avoids the pandas view/copy warning)
        self.df = self.df.drop(todrop, axis=1)

    def __repr__(self):
        Nc = len(self.cosmicIds)
        Nf = len(self.features) - self.shift
        try:
            Nt = len(set(self.tissues))
        except Exception:
            Nt = '?'
        return "GenomicFeatures <Nc={0}, Nf={1}, Nt={2}>".format(Nc, Nf, Nt)

    def compress_identical_features(self):
        """Merge duplicated columns/features

        Columns duplicated are merged as follows. The first column is kept,
        others are dropped but to keep track of those dropped, the column name
        is renamed by concatenating the columns's names. The separator is a
        double underscore.

        :return: dictionary mapping the name of each kept column to its new
            (concatenated) name

        ::

            gf = GenomicFeatures()
            gf.compress_identical_features()
            # You can now access to the column as follows (arbitrary example)
            gf.df['ARHGAP26_mut__G3BP2_mut']
        """

        # let us identify the duplicates as True/False (one row per feature)
        datatr = self.df.transpose()
        is_repeat = datatr.duplicated()
        duplicated_no_first = datatr[is_repeat]
        try:
            duplicated = datatr[datatr.duplicated(keep=False)]
        except TypeError:
            # pandas 0.16 (no `keep` argument)
            duplicated = datatr[datatr.duplicated(take_last=False)]

        # first occurrence of each group of duplicates
        repeats = set(duplicated_no_first.index)
        tokeep = [x for x in duplicated.index if x not in repeats]

        # Let us create a groupby strategy
        groups = {}
        # Let us now add the corresponding duplicates
        for feature in tokeep:
            # Find all rows identical to this feature (label-based lookup;
            # .ix has been removed from pandas)
            matches = (duplicated.loc[feature] == duplicated).all(axis=1)
            groups[feature] = "__".join(duplicated.index[matches])

        # This drops all duplicated columns (the first is kept, others are
        # dropped). Columns are selected with a boolean mask instead of a
        # transpose/drop_duplicates/transpose round trip, which would turn
        # every column into object dtype.
        # We want to keep the column names informative that is if there were
        # duplicates, we rename the column kept with the concatenation of all
        # the corresponding duplicates
        self.df = self.df.loc[:, ~is_repeat.values].rename(columns=groups)
        print("compressed %s groups of duplicates" % len(groups))
        return groups

    def get_TCGA(self):
        """Return the TCGA column of ``COSMICInfo`` for the current index

        Label-based selection: with recent pandas versions, an identifier
        missing from the ``COSMICInfo`` dataframe raises a KeyError.
        """
        from gdsctools.cosmictools import COSMICInfo
        c = COSMICInfo()
        tcga = c.df.loc[self.df.index].TCGA
        return tcga