示例#1
0
    def fit(self, X, **kwargs):
        """
        Validate that X holds a single feature, resolve the feature name,
        and draw the histogram.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x 1
            A matrix of n instances with 1 feature

        kwargs : dict
            Accepted by the signature but not forwarded to ``draw`` here.

        Returns
        -------
        self : instance
            The visualizer itself.

        Raises
        ------
        YellowbrickValueError
            If X is a DataFrame with more than one column.
        """
        frame_input = is_dataframe(X)

        # Only DataFrame inputs are checked for width.
        if frame_input:
            _, n_columns = X.shape
            if n_columns > 1:
                raise YellowbrickValueError((
                    "X needs to be an ndarray or DataFrame with one feature, "
                    "please select one feature from the DataFrame"
                ))

        # Resolve a feature name only when the user did not provide one:
        # the DataFrame columns if available, otherwise a generic label.
        if self.feature is None:
            self.feature = X.columns if frame_input else ['x']

        self.draw(X)
        return self
示例#2
0
    def fit(self, X, y, **kwargs):
        """
        Validate X and y, resolve the feature and target names, and draw
        the joint plot.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x 1
            A matrix of n instances with 1 feature

        y : ndarray or Series of length n
            An array or series of the target value

        kwargs : dict
            Keyword arguments forwarded to the ``draw`` method.

        Returns
        -------
        self : instance
            The visualizer itself.

        Raises
        ------
        YellowbrickValueError
            If X is a DataFrame with more than one column, or if y is None.
        """
        frame_input = is_dataframe(X)

        # Only DataFrame inputs are checked for width.
        if frame_input:
            _, n_columns = X.shape
            if n_columns > 1:
                raise YellowbrickValueError((
                    "X needs to be an ndarray or DataFrame with one feature, "
                    "please select one feature from the DataFrame"
                ))

        # A target is mandatory for this plot.
        if y is None:
            raise YellowbrickValueError((
                "Joint plots are useful for classification and regression "
                "problems, which require a target variable"
            ))

        # Resolve the feature name only when the user did not provide one.
        if self.feature is None:
            self.feature = X.columns if frame_input else ['x']

        # Same for the target name, which has a fixed generic default.
        if self.target is None:
            self.target = ['y']

        self.draw(X, y, **kwargs)
        return self
示例#3
0
    def fit(self, X, y=None, **kwargs):
        """
        The fit method is the primary drawing input for the
        visualization since it has both the X and y data required for the
        viz and the transform method does not.

        Stores the raw data on the instance (``self.X``, ``self.y``), picks
        up feature names from a DataFrame when none are set, and delegates
        to the parent class ``fit``.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : dict
            Keyword arguments forwarded to the parent ``fit``

        Returns
        -------
        self : instance
            Returns the instance of the transformer/visualizer
        """
        if is_dataframe(X):
            # Keep the underlying array; column names become feature names
            # only when the user has not already provided them.
            self.X = X.values
            if self.features_ is None:
                self.features_ = X.columns
        else:
            self.X = X

        self.y = y

        super(MissingDataVisualizer, self).fit(X, y, **kwargs)

        # Return self as documented so that fit() can be chained.
        return self
示例#4
0
    def fit(self, X, y=None, **kwargs):
        """
        The fit method is the primary drawing input for the
        visualization since it has both the X and y data required for the
        viz and the transform method does not.

        The input data is stored on the instance as ``self.X`` and
        ``self.y``; DataFrame column names are used as feature names when
        ``self.features_`` is None. The parent class ``fit`` is then called.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : dict
            Keyword arguments forwarded to the parent ``fit``

        Returns
        -------
        self : instance
            Returns the instance of the transformer/visualizer
        """
        if is_dataframe(X):
            # Store the plain array and fall back to the column names
            # only if no feature names have been set.
            self.X = X.values
            if self.features_ is None:
                self.features_ = X.columns
        else:
            self.X = X

        self.y = y

        super(MissingDataVisualizer, self).fit(X, y, **kwargs)

        # The docstring promises self; without this the call returned None.
        return self
示例#5
0
    def rank(self, X, algorithm=None):
        """
        Returns the feature ranking.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        algorithm : str or None
            The ranking mechanism to use, or None for the default
            (``self.ranking_``). The name is matched case-insensitively.

        Returns
        -------
        ranks : ndarray
            An n-dimensional, symmetric array of rank scores, where n is the
            number of features. E.g. for 1D ranking, it is (n,), for a
            2D ranking it is (n,n) and so forth.

        Raises
        ------
        YellowbrickValueError
            If the algorithm is not a key of ``self.ranking_methods``.
        """
        algorithm = algorithm or self.ranking_
        algorithm = algorithm.lower()

        if algorithm not in self.ranking_methods:
            raise YellowbrickValueError(
                "'{}' is unrecognized ranking method".format(algorithm))

        # Extract matrix from dataframe if necessary. DataFrame.as_matrix()
        # was removed in pandas 1.0; .values is the equivalent accessor.
        if is_dataframe(X):
            X = X.values

        return self.ranking_methods[algorithm](X)
示例#6
0
    def rank(self, X, algorithm=None):
        """
        Returns the feature ranking.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        algorithm : str or None
            The ranking mechanism to use, or None for the default
            (``self.ranking_``). The name is lower-cased before lookup.

        Returns
        -------
        ranks : ndarray
            An n-dimensional, symmetric array of rank scores, where n is the
            number of features. E.g. for 1D ranking, it is (n,), for a
            2D ranking it is (n,n) and so forth.

        Raises
        ------
        YellowbrickValueError
            If the algorithm is not a key of ``self.ranking_methods``.
        """
        algorithm = algorithm or self.ranking_
        algorithm = algorithm.lower()

        if algorithm not in self.ranking_methods:
            raise YellowbrickValueError(
                "'{}' is unrecognized ranking method".format(algorithm)
            )

        # Extract the underlying array from a dataframe. The former
        # DataFrame.as_matrix() no longer exists in pandas >= 1.0, so the
        # equivalent .values attribute is used instead.
        if is_dataframe(X):
            X = X.values

        return self.ranking_methods[algorithm](X)
示例#7
0
    def fit(self, X, y=None, **kwargs):
        """
        The fit method is the primary drawing input for the scatter
        visualization since it has both the X and y data required for the
        viz and the transform method does not.

        Exactly two columns are selected from X (see below), the class
        labels are derived from y when not already set, and the result is
        drawn.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances. If it has exactly two columns it is used
            as is; otherwise ``self.features_`` must name the two columns.

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : dict
            Pass generic arguments to the drawing method

        Returns
        -------
        self : instance
            Returns the instance of the transformer/visualizer

        Raises
        ------
        YellowbrickValueError
            If two columns cannot be selected from X.
        """
        _, ncols = X.shape

        if ncols == 2:
            X_two_cols = X
            if self.features_ is None:
                self.features_ = ["Feature One", "Feature Two"]

        # Select the named columns from a dataframe. DataFrame.as_matrix()
        # was removed in pandas 1.0; .values is the equivalent accessor.
        elif self.features_ is not None and is_dataframe(X):
            X_two_cols = X[self.features_].values

        # handle numpy named/ structured array
        elif self.features_ is not None and is_structured_array(X):
            X_selected = X[self.features_]
            X_two_cols = X_selected.view((np.float64, len(X_selected.dtype.names)))

        # handle features that are numeric columns in ndarray matrix
        elif self.features_ is not None and has_ndarray_int_columns(self.features_, X):
            f_one, f_two = self.features_
            X_two_cols = X[:, [int(f_one), int(f_two)]]

        else:
            raise YellowbrickValueError("""
                ScatterVisualizer only accepts two features, please
                explicitly set these two features in the init kwargs or
                pass a matrix/ dataframe in with only two columns.""")

        # Store the classes for the legend if they're None.
        if self.classes_ is None:
            # TODO: Is this the most efficient method?
            self.classes_ = [str(label) for label in np.unique(y)]

        # Draw the instances
        self.draw(X_two_cols, y, **kwargs)

        # Fit always returns self.
        return self
示例#8
0
    def fit(self, X, y=None, **kwargs):
        """
        The fit method is the primary drawing input for the scatter
        visualization since it has both the X and y data required for the
        viz and the transform method does not.

        Two columns are selected from X, class labels are derived from y
        when ``self.classes_`` is None, and the selection is drawn.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances. A two-column input is used unchanged;
            otherwise ``self.features_`` must identify the two columns.

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : dict
            Pass generic arguments to the drawing method

        Returns
        -------
        self : instance
            Returns the instance of the transformer/visualizer

        Raises
        ------
        YellowbrickValueError
            If two columns cannot be selected from X.
        """
        _, ncols = X.shape

        if ncols == 2:
            X_two_cols = X
            if self.features_ is None:
                self.features_ = ["Feature One", "Feature Two"]

        # Dataframe: pick the named columns. as_matrix() is gone from
        # pandas >= 1.0, so use the equivalent .values attribute.
        elif self.features_ is not None and is_dataframe(X):
            X_two_cols = X[self.features_].values

        # handle numpy named/ structured array; the copy is taken before
        # the fields are reinterpreted as a float64 matrix
        elif self.features_ is not None and is_structured_array(X):
            X_selected = X[self.features_]
            X_two_cols = X_selected.copy().view((np.float64, len(X_selected.dtype.names)))

        # handle features that are numeric columns in ndarray matrix
        elif self.features_ is not None and has_ndarray_int_columns(self.features_, X):
            f_one, f_two = self.features_
            X_two_cols = X[:, [int(f_one), int(f_two)]]

        else:
            raise YellowbrickValueError("""
                ScatterVisualizer only accepts two features, please
                explicitly set these two features in the init kwargs or
                pass a matrix/ dataframe in with only two columns.""")

        # Store the classes for the legend if they're None.
        if self.classes_ is None:
            # TODO: Is this the most efficient method?
            self.classes_ = [str(label) for label in np.unique(y)]

        # Draw the instances
        self.draw(X_two_cols, y, **kwargs)

        # Fit always returns self.
        return self
示例#9
0
    def fit(self, X, y=None, **kwargs):
        """
        Fits the estimator to discover the feature importances described by
        the data, then draws those importances as a bar plot.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : dict
            Keyword arguments passed to the fit method of the estimator.

        Returns
        -------
        self : visualizer
            The fit method must always return self to support pipelines.
        """
        super(FeatureImportances, self).fit(X, y, **kwargs)

        # Get the feature importances from the model
        self.feature_importances_ = self._find_importances_param()

        # Apply absolute value filter before normalization
        if self.absolute:
            self.feature_importances_ = np.abs(self.feature_importances_)

        # Normalize features relative to the largest magnitude. Using the
        # absolute maximum keeps the sign of every value intact even when
        # the largest-magnitude importance is negative. The result is
        # assigned to a new array rather than scaled in place, so the array
        # returned by _find_importances_param is not modified.
        if self.relative:
            maxv = np.abs(self.feature_importances_).max()
            self.feature_importances_ = self.feature_importances_ / maxv * 100.0

        # Create labels for the feature importances
        # NOTE: this code is duplicated from MultiFeatureVisualizer
        if self.labels is None:
            # Use column names if a dataframe
            if is_dataframe(X):
                self.features_ = np.array(X.columns)

            # Otherwise use the column index as the labels
            else:
                _, ncols = X.shape
                self.features_ = np.arange(0, ncols)
        else:
            self.features_ = np.array(self.labels)

        # Sort the features and their importances (ascending importance)
        sort_idx = np.argsort(self.feature_importances_)
        self.features_ = self.features_[sort_idx]
        self.feature_importances_ = self.feature_importances_[sort_idx]

        # Draw the feature importances
        self.draw()
        return self
示例#10
0
    def fit(self, X, y=None, **kwargs):
        """
        The fit method is the primary drawing input for the
        visualization since it has both the X and y data required for the
        viz and the transform method does not.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : dict
            Pass generic arguments to the drawing method

        Returns
        -------
        self : instance
            Returns the instance of the transformer/visualizer
        """
        # TODO: This class is identical to the Parallel Coordinates version,
        # so hoist this functionality to a higher level class that is extended
        # by both RadViz and ParallelCoordinates.

        # Get the number of columns (also requires X to be two-dimensional)
        _, ncols = X.shape

        # Store the classes for the legend if they're None. The distinct
        # labels are sorted because plain set iteration order is arbitrary,
        # which would make the class order (and anything keyed on it)
        # unpredictable.
        if self.classes_ is None:
            self.classes_ = [str(label) for label in sorted(set(y))]

        # Handle the feature names if they're None.
        if self.features_ is None:

            # If X is a data frame, get the columns off it.
            if is_dataframe(X):
                self.features_ = X.columns

            # Otherwise create numeric labels for each column.
            else:
                self.features_ = [str(cdx) for cdx in range(ncols)]

        # Draw the instances
        self.draw(X, y, **kwargs)

        # Fit always returns self.
        return self
示例#11
0
    def fit(self, X, y=None, **kwargs):
        """
        The fit method is the primary drawing input for the parallel coords
        visualization since it has both the X and y data required for the
        viz and the transform method does not.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : dict
            Pass generic arguments to the drawing method

        Returns
        -------
        self : instance
            Returns the instance of the transformer/visualizer
        """
        # Get the number of columns (also requires X to be two-dimensional)
        _, ncols = X.shape

        # Store the classes for the legend if they're None. Sorting the
        # distinct labels gives a stable class order; iterating a bare set
        # yields them in arbitrary order.
        if self.classes_ is None:
            self.classes_ = [str(label) for label in sorted(set(y))]

        # Handle the feature names if they're None.
        if self.features_ is None:

            # If X is a data frame, get the columns off it.
            if is_dataframe(X):
                self.features_ = X.columns

            # Otherwise create numeric labels for each column.
            else:
                self.features_ = [
                    str(cdx) for cdx in range(ncols)
                ]

        # Draw the instances
        self.draw(X, y, **kwargs)

        # Fit always returns self.
        return self
示例#12
0
    def _create_labels_for_features(self, X):
        """
        Populate ``self.features_`` with one label per feature.

        User-supplied ``self.labels`` take precedence; otherwise the labels
        are the DataFrame column names, or the integer column indices of X.

        NOTE: this code is duplicated from MultiFeatureVisualizer
        """
        # Explicit labels always win over anything inferred from X.
        if self.labels is not None:
            self.features_ = np.array(self.labels)
            return

        if is_dataframe(X):
            self.features_ = np.array(X.columns)
        else:
            # No names available: fall back to 0..ncols-1
            _, n_columns = X.shape
            self.features_ = np.arange(0, n_columns)
    def _create_labels_for_features(self, X):
        """
        Set ``self.features_`` to an array of feature labels.

        The source of the labels is, in order of preference: the
        user-provided ``self.labels``, the column names of a DataFrame X,
        or the integer column positions of X.

        NOTE: this code is duplicated from MultiFeatureVisualizer
        """
        if self.labels is not None:
            resolved = np.array(self.labels)
        elif is_dataframe(X):
            # Use column names if a dataframe
            resolved = np.array(X.columns)
        else:
            # Otherwise use the column index as the labels
            _, n_columns = X.shape
            resolved = np.arange(0, n_columns)

        self.features_ = resolved
示例#14
0
    def fit(self, X, y=None, **kwargs):
        """
        Prepare the data and draw it.

        The parent ``fit`` is called first, then pandas inputs are converted
        to arrays, the instances are subsampled, optionally normalized, and
        finally passed to ``draw``.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : dict
            Pass generic arguments to the drawing method

        Returns
        -------
        self : instance
            Returns the instance of the transformer/visualizer
        """
        # Let the parent class resolve features, classes, and colors
        super(ParallelCoordinates, self).fit(X, y)

        # One tick position per feature
        self._increments = np.arange(len(self.features_))

        # Unwrap pandas containers into their underlying arrays
        X = X.values if is_dataframe(X) else X
        y = y.values if is_series(y) else y

        # Reduce the number of instances to draw
        X, y = self._subsample(X, y)

        # Apply the configured normalizer, if any
        if self.normalize is not None:
            scaler = self.NORMALIZERS[self.normalize]
            X = scaler.fit_transform(X)

        self.draw(X, y, **kwargs)
        return self
示例#15
0
    def fit(self, X, y=None):
        """
        Resolve the feature names into ``self.features_`` and call the
        parent ``fit``.

        Feature names come from ``self.features`` when provided (their
        count must match the number of columns in X), otherwise from the
        DataFrame columns, otherwise from the integer column indices.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Feature dataset to be transformed.

        y : array-like, shape (n_samples,)
            Optional dependent target data associated with X.

        Returns
        -------
        self : MultiFeatureVisualizer
            Returns the visualizer/transformer for use in Pipelines and chaining.

        Raises
        ------
        YellowbrickValueError
            If the number of user-specified features differs from the
            number of columns in X.
        """
        n_columns = X.shape[1]

        if self.features is None:
            # Nothing specified: infer names from the data, or fall back
            # to numeric labels for each column.
            if is_dataframe(X):
                self.features_ = np.array(X.columns)
            else:
                self.features_ = np.arange(0, n_columns)

        # TODO: allow the user specified features to filter the dataset
        elif len(self.features) != n_columns:
            raise YellowbrickValueError((
                "number of supplied feature names does not match the number "
                "of columns in the training data."))

        else:
            self.features_ = np.array(self.features)

        # Ensure super is called and fit is returned
        super(MultiFeatureVisualizer, self).fit(X, y)
        return self
示例#16
0
    def fit(self, X, y=None, **kwargs):
        """
        Record the feature names of X on the visualizer.

        Existing ``self.features_`` are left untouched; otherwise they are
        taken from the DataFrame columns or generated as string column
        indices.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values (not used here)

        kwargs : dict
            Accepted by the signature but not used by this method

        Returns
        -------
        self : instance
            Returns the instance of the transformer/visualizer
        """
        # TODO: This class is identical to the Parallel Coordinates version,
        # so hoist this functionality to a higher level class that is extended
        # by both RadViz and ParallelCoordinates.

        # Unpacking the shape also requires X to be two-dimensional.
        _, n_columns = X.shape

        # Names were already provided: nothing to infer.
        if self.features_ is not None:
            return self

        if is_dataframe(X):
            self.features_ = X.columns
        else:
            # Numeric labels "0", "1", ... for each column
            self.features_ = list(map(str, range(n_columns)))

        return self
示例#17
0
    def fit(self, X, y=None, **kwargs):
        """
        Store the data on the visualizer, resolve the feature names, and
        draw.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : dict
            Pass generic arguments to the drawing method

        Returns
        -------
        self : instance
            Returns the instance of the transformer/visualizer
        """
        # Do not call super here - the data visualizer has been refactored
        # to provide increased functionality that is not yet compatible with
        # the current implementation. This mimics the previous functionality.
        # TODO: Refactor MissingDataVisualizer to make use of new features.
        frame_input = is_dataframe(X)

        # Start from the user-specified names; fall back to the DataFrame
        # columns only when none were given.
        self.features_ = self.features
        if frame_input and self.features_ is None:
            self.features_ = X.columns

        # Keep the raw array for DataFrames, the input itself otherwise.
        self.X = X.values if frame_input else X
        self.y = y

        self.draw(X, y, **kwargs)
        return self
示例#18
0
    def fit(self, X, y=None, **fit_params):
        """
        Determine the feature names for X when they have not been set.

        If ``self.features_`` is None it becomes the DataFrame column names,
        or the integer column indices for other inputs. ``y`` and
        ``fit_params`` are not used here.

        This method must return self.
        """
        # Names already known: nothing to infer.
        if self.features_ is not None:
            return self

        if is_dataframe(X):
            # If X is a data frame, get the columns off it.
            self.features_ = np.array(X.columns)
        else:
            # Otherwise create numeric labels for each column.
            _, n_columns = X.shape
            self.features_ = np.arange(0, n_columns)

        return self
示例#19
0
    def fit(self, X, y=None, **fit_params):
        """
        Fill in ``self.features_`` from X when it is still None.

        DataFrame inputs contribute their column names; any other input
        gets the integer indices of its columns. Neither ``y`` nor
        ``fit_params`` is used by this method.

        This method must return self.
        """
        names_missing = self.features_ is None

        if names_missing and is_dataframe(X):
            # Column names come straight from the data frame.
            self.features_ = np.array(X.columns)
        elif names_missing:
            # No names available: label columns 0..ncols-1.
            _, n_columns = X.shape
            self.features_ = np.arange(0, n_columns)

        return self
示例#20
0
    def _select_feature_columns(self, X):
        """
        Select the two feature columns to plot from X.

        Parameters
        ----------
        X : ndarray, structured array or DataFrame
            The input data. When X already has exactly two columns it is
            returned unchanged; otherwise ``self.features_`` is used to pick
            the two columns.

        Returns
        -------
        X_two_cols : ndarray or the original X
            The data restricted to two features.

        Raises
        ------
        YellowbrickValueError
            If two columns cannot be selected from X.
        """
        # A one-dimensional input is viewed as float64 and reshaped to one
        # row per instance, only in order to count its columns.
        if len(X.shape) == 1:
            X_flat = X.view(np.float64).reshape(len(X), -1)
        else:
            X_flat = X

        _, ncols = X_flat.shape

        if ncols == 2:
            # NOTE(review): X itself (not X_flat) is returned here - confirm
            # that downstream code accepts a one-dimensional input.
            X_two_cols = X
            if self.features_ is None:
                self.features_ = ["Feature One", "Feature Two"]

        # Select the named columns from a dataframe. DataFrame.as_matrix()
        # was removed in pandas 1.0; .values is the equivalent accessor.
        elif self.features_ is not None and is_dataframe(X):
            X_two_cols = X[self.features_].values

        # handle numpy named/ structured array
        elif self.features_ is not None and is_structured_array(X):
            X_selected = X[self.features_]
            X_two_cols = X_selected.view(np.float64).reshape(
                len(X_selected), -1)

        # handle features that are numeric columns in ndarray matrix
        elif self.features_ is not None and has_ndarray_int_columns(
                self.features_, X):
            f_one, f_two = self.features_
            X_two_cols = X[:, [int(f_one), int(f_two)]]

        else:
            raise YellowbrickValueError("""
                ScatterVisualizer only accepts two features, please
                explicitly set these two features in the init kwargs or
                pass a matrix/ dataframe in with only two columns.""")

        return X_two_cols
示例#21
0
    def _select_feature_columns(self, X):
        """
        Select the two feature columns to plot from X.

        Parameters
        ----------
        X : ndarray, structured array or DataFrame
            The input data. A two-column input is returned unchanged;
            otherwise ``self.features_`` identifies the two columns to keep.

        Returns
        -------
        X_two_cols : ndarray or the original X
            The data restricted to two features.

        Raises
        ------
        YellowbrickValueError
            If two columns cannot be selected from X.
        """
        # A one-dimensional input is copied, viewed as float64 and reshaped
        # to one row per instance, only in order to count its columns.
        if len(X.shape) == 1:
            X_flat = X.copy().view(np.float64).reshape(len(X), -1)
        else:
            X_flat = X

        _, ncols = X_flat.shape

        if ncols == 2:
            # NOTE(review): X itself (not X_flat) is returned here - confirm
            # that downstream code accepts a one-dimensional input.
            X_two_cols = X
            if self.features_ is None:
                self.features_ = ["Feature One", "Feature Two"]

        # Dataframe: pick the named columns. as_matrix() is gone from
        # pandas >= 1.0, so use the equivalent .values attribute.
        elif self.features_ is not None and is_dataframe(X):
            X_two_cols = X[self.features_].values

        # handle numpy named/ structured array
        elif self.features_ is not None and is_structured_array(X):
            X_selected = X[self.features_]
            X_two_cols = X_selected.copy().view(np.float64).reshape(len(X_selected), -1)

        # handle features that are numeric columns in ndarray matrix
        elif self.features_ is not None and has_ndarray_int_columns(self.features_, X):
            f_one, f_two = self.features_
            X_two_cols = X[:, [int(f_one), int(f_two)]]

        else:
            raise YellowbrickValueError("""
                ScatterVisualizer only accepts two features, please
                explicitly set these two features in the init kwargs or
                pass a matrix/ dataframe in with only two columns.""")

        return X_two_cols
示例#22
0
    def draw(self, X, y, **kwargs):
        """
        Called from the fit method, this method creates the parallel
        coordinates canvas and draws each instance and vertical lines on it.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : sequence of length n
            The label of each instance; each value is used as an index into
            ``self.classes_``.

        kwargs : dict
            Extra keyword arguments passed to every ``ax.plot`` call.
        """
        # Convert from dataframe. DataFrame.as_matrix() was removed in
        # pandas 1.0; .values is the equivalent accessor.
        if is_dataframe(X):
            X = X.values

        # Choose a subset of samples: an int is an absolute count (capped at
        # the number of rows), a float is a fraction of the rows.
        # TODO: allow selection of a random subset of samples instead of head
        if isinstance(self.sample, int):
            self.n_samples = min([self.sample, len(X)])
        elif isinstance(self.sample, float):
            self.n_samples = int(len(X) * self.sample)
        X = X[:self.n_samples, :]

        # Normalize
        if self.normalize is not None:
            X = self.normalizers[self.normalize].fit_transform(X)

        # Only the number of columns is needed below
        _, ncols = X.shape

        # Create the xticks for each column
        # TODO: Allow the user to specify this feature
        x = list(range(ncols))

        # Create the colors
        # TODO: Allow both colormap, listed colors, and palette definition
        # TODO: Make this an independent function or property for override!
        color_values = resolve_colors(
            n_colors=len(self.classes_), colormap=self.colormap, colors=self.color
        )
        colors = dict(zip(self.classes_, color_values))

        # Track which labels are already in the legend
        used_legends = set()

        # TODO: Make this function compatible with DataFrames!
        # TODO: Make an independent function to allow addition of instances!
        for idx, row in enumerate(X):
            # TODO: How to map classmap to labels?
            label = y[idx] # Get the label for the row
            label = self.classes_[label]

            # Only the first line of each class carries a legend label, so
            # the legend shows one entry per class.
            if label not in used_legends:
                used_legends.add(label)
                self.ax.plot(x, row, color=colors[label], alpha=0.25, label=label, **kwargs)
            else:
                self.ax.plot(x, row, color=colors[label], alpha=0.25, **kwargs)

        # Add the vertical lines
        # TODO: Make an independent function for override!
        if self.show_vlines:
            for idx in x:
                self.ax.axvline(idx, **self.vlines_kwds)

        # Set the limits
        self.ax.set_xticks(x)
        self.ax.set_xticklabels(self.features_)
        self.ax.set_xlim(x[0], x[-1])
示例#23
0
    def draw(self, X, y, **kwargs):
        """
        Called from the fit method, this method creates the radviz canvas and
        draws each instance as a class or target colored point, whose location
        is determined by the feature data set.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : sequence of length n
            The label of each instance; each value is used as an index into
            ``self.classes_``.

        kwargs : dict
            Extra keyword arguments passed to every ``ax.scatter`` call.
        """
        # Work on the underlying array of a dataframe
        if is_dataframe(X):
            X = X.values

        # Warn about missing values, then filter them out before plotting
        nan_warnings.warn_if_nans_exist(X)
        X, y = nan_warnings.filter_missing(X, y)

        # Only the number of features is needed below
        _, n_features = X.shape

        # The plot lives inside the unit circle
        self.ax.set_xlim([-1,1])
        self.ax.set_ylim([-1,1])

        # Assign one color per class
        # TODO: Allow both colormap, listed colors, and palette definition
        # TODO: Make this an independent function or property for override!
        palette = resolve_colors(
            n_colors=len(self.classes_), colormap=self.colormap, colors=self.color
        )
        self._colors = dict(zip(self.classes_, palette))

        # Per class, a pair of coordinate lists (xs, ys) to scatter later
        points = {kls: ([], []) for kls in self.classes_}

        # Anchor point of each feature axis, evenly spaced on the circle
        # TODO: make this an independent function for override
        angles = [
            2.0 * np.pi * (i / float(n_features))
            for i in range(n_features)
        ]
        s = np.array([(np.cos(t), np.sin(t)) for t in angles])

        # Place each normalized instance at the weighted mean of the anchors,
        # using its feature values as weights
        for i, row in enumerate(self.normalize(X)):
            weights = np.repeat(np.expand_dims(row, axis=1), 2, axis=1)
            xy = (s * weights).sum(axis=0) / row.sum()
            xs, ys = points[self.classes_[y[i]]]

            xs.append(xy[0])
            ys.append(xy[1])

        # One scatter call per class so each class gets a legend entry
        # TODO: store these plots to add more instances to later
        # TODO: make this a separate function
        for kls in self.classes_:
            xs, ys = points[kls]
            self.ax.scatter(
                xs, ys, color=self._colors[kls],
                label=str(kls), alpha=self.alpha, **kwargs
            )

        # Outline of the unit circle
        # TODO: Make this a separate function (along with labeling)
        self.ax.add_patch(patches.Circle(
            (0.0, 0.0), radius=1.0, facecolor='none', edgecolor='grey', linewidth=.5
        ))

        # Mark and label each feature anchor
        for xy, name in zip(s, self.features_):
            self.ax.add_patch(patches.Circle(xy, radius=0.025, facecolor='#777777'))

            # Push the text away from the circle centre: left/right depends
            # on the sign of x, above/below on the sign of y
            if xy[0] < 0.0:
                ha, dx = 'right', -0.025
            else:
                ha, dx = 'left', 0.025

            if xy[1] < 0.0:
                va, dy = 'top', -0.025
            else:
                va, dy = 'bottom', 0.025

            self.ax.text(xy[0] + dx, xy[1] + dy, name, ha=ha, va=va, size='small')

        self.ax.axis('equal')
示例#24
0
    def fit(self, X, y=None, **kwargs):
        """
        Fits the estimator to discover the feature importances described by
        the data, then draws those importances as a bar plot.

        Sets ``feature_importances_``, ``classes_`` and ``features_`` on the
        visualizer. When the visualizer is not a classifier, ``classes_`` is
        set to None and ``stack`` is forced to False.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : dict
            Keyword arguments passed to the fit method of the estimator.

        Returns
        -------
        self : visualizer
            The fit method must always return self to support pipelines.
        """
        super(FeatureImportances, self).fit(X, y, **kwargs)

        # Get the feature importances from the model
        self.feature_importances_ = self._find_importances_param()

        # Get the classes from the model; stacking only makes sense when
        # there are classes to stack by
        if is_classifier(self):
            self.classes_ = self._find_classes_param()
        else:
            self.classes_ = None
            self.stack = False

        # If we are not stacking and feature importances is a multidim array,
        # we're expecting a shape of (n_classes, n_features), therefore we
        # flatten by taking the average by column to get shape (n_features,)
        # (see LogisticRegression)
        if not self.stack and self.feature_importances_.ndim > 1:
            self.feature_importances_ = np.mean(self.feature_importances_,
                                                axis=0)

        # Apply absolute value filter before normalization
        if self.absolute:
            self.feature_importances_ = np.abs(self.feature_importances_)

        # Normalize features relative to the maximum magnitude
        if self.relative:
            maxv = np.abs(self.feature_importances_).max()

            # Rebind to a new array instead of using `/=` and `*=`: the array
            # obtained above may still be the very object held by the fitted
            # estimator (NOTE(review): confirm in _find_importances_param),
            # and in-place true division also fails for integer arrays.
            # A zero maximum means every importance is zero, so there is
            # nothing to scale and dividing would only produce NaNs.
            if maxv != 0:
                self.feature_importances_ = (
                    self.feature_importances_ / maxv * 100.0
                )

        # Create labels for the feature importances
        # NOTE: this code is duplicated from MultiFeatureVisualizer
        if self.labels is None:
            # Use column names if a dataframe
            if is_dataframe(X):
                self.features_ = np.array(X.columns)

            # Otherwise use the column index as the labels
            else:
                _, ncols = X.shape
                self.features_ = np.arange(0, ncols)
        else:
            self.features_ = np.array(self.labels)

        # Sort the features and their importances (ascending); when stacked
        # the ordering is driven by the mean importance across classes
        if self.stack:
            sort_idx = np.argsort(np.mean(self.feature_importances_, 0))
            self.features_ = self.features_[sort_idx]
            self.feature_importances_ = self.feature_importances_[:, sort_idx]
        else:
            sort_idx = np.argsort(self.feature_importances_)
            self.features_ = self.features_[sort_idx]
            self.feature_importances_ = self.feature_importances_[sort_idx]

        # Draw the feature importances
        self.draw()
        return self
示例#25
0
    def draw(self, X, y, **kwargs):
        """
        Called from the fit method, this method creates the radviz canvas and
        draws each instance as a class or target colored point, whose location
        is determined by the feature data set.

        Parameters
        ----------
        X : ndarray or DataFrame
            The instances to plot; a DataFrame is reduced to its ``values``.

        y : indexable
            Targets; each ``y[i]`` is looked up in ``self._label_encoder`` to
            obtain the class label of row ``i``.

        kwargs : dict
            Extra keyword arguments forwarded to ``self.ax.scatter``.

        Returns
        -------
        ax : the axes stored on ``self.ax`` that were drawn on.
        """
        # Work on the raw array when handed a dataframe
        if is_dataframe(X):
            X = X.values

        # Warn about nans, then drop the affected instances
        nan_warnings.warn_if_nans_exist(X)
        X, y = nan_warnings.filter_missing(X, y)

        # Only the number of features is needed from the shape
        _, n_features = X.shape

        # Fix the canvas to the square enclosing the unit circle
        self.ax.set_xlim([-1, 1])
        self.ax.set_ylim([-1, 1])

        # Per-label coordinate buckets for the scatter plots
        xs_by_label = {}
        ys_by_label = {}
        for label in self.classes_:
            xs_by_label[label] = []
            ys_by_label[label] = []

        # Place one anchor per feature, evenly spaced around the circumference
        # TODO: make this an independent function for override
        angles = [
            2.0 * np.pi * (idx / float(n_features))
            for idx in range(n_features)
        ]
        anchors = np.array([
            (np.cos(theta), np.sin(theta)) for theta in angles
        ])

        # Each instance lands on the average of the anchors, weighted by its
        # values as produced by self.normalize
        for idx, row in enumerate(self.normalize(X)):
            column = np.expand_dims(row, axis=1)
            weights = np.repeat(column, 2, axis=1)
            point = (anchors * weights).sum(axis=0) / row.sum()

            label = self._label_encoder[y[idx]]
            xs_by_label[label].append(point[0])
            ys_by_label[label].append(point[1])

        # One scatter call per class so that every class gets a legend entry
        # TODO: store these plots to add more instances to later
        # TODO: make this a separate function
        for label in self.classes_:
            color = self.get_colors([label])[0]
            self.ax.scatter(
                xs_by_label[label],
                ys_by_label[label],
                color=color,
                label=label,
                alpha=self.alpha,
                **kwargs
            )

        # Outline of the unit circle the anchors sit on
        # TODO: Make this a separate function (along with labeling)
        outline = patches.Circle(
            (0.0, 0.0),
            radius=1.0,
            facecolor="none",
            edgecolor="grey",
            linewidth=0.5,
        )
        self.ax.add_patch(outline)

        # Mark and label every feature anchor
        for anchor, name in zip(anchors, self.features_):
            marker = patches.Circle(anchor, radius=0.025, facecolor="#777777")
            self.ax.add_patch(marker)

            # Push the text away from the circle: to the left/right depending
            # on the sign of x and below/above depending on the sign of y
            if anchor[0] < 0.0:
                dx, ha = -0.025, "right"
            else:
                dx, ha = 0.025, "left"

            if anchor[1] < 0.0:
                dy, va = -0.025, "top"
            else:
                dy, va = 0.025, "bottom"

            self.ax.text(
                anchor[0] + dx, anchor[1] + dy, name,
                ha=ha, va=va, size="small",
            )

        self.ax.axis("equal")
        return self.ax
示例#26
0
    def draw(self, X, y, **kwargs):
        """
        Called from the fit method, this method creates the radviz canvas and
        draws each instance as a class or target colored point, whose location
        is determined by the feature data set.

        Parameters
        ----------
        X : ndarray or DataFrame
            The instances to plot; a DataFrame is reduced to its ``values``.

        y : indexable
            Targets; each ``y[i]`` is used as an index into ``self.classes_``
            to obtain the class of row ``i``.

        kwargs : dict
            Extra keyword arguments forwarded to ``self.ax.scatter``.

        Returns
        -------
        ax : the axes stored on ``self.ax`` that were drawn on.
        """
        # Convert from dataframe
        if is_dataframe(X):
            X = X.values

        # Clean out nans and warn the user that they aren't plotted
        nan_warnings.warn_if_nans_exist(X)
        X, y = nan_warnings.filter_missing(X, y)

        # Get the number of features from the shape of the data
        _, ncols = X.shape

        # Set the axes limits
        self.ax.set_xlim([-1, 1])
        self.ax.set_ylim([-1, 1])

        # Create the colors
        # TODO: Allow both colormap, listed colors, and palette definition
        # TODO: Make this an independent function or property for override!
        color_values = resolve_colors(n_colors=len(self.classes_),
                                      colormap=self.colormap,
                                      colors=self.color)
        self._colors = dict(zip(self.classes_, color_values))

        # Create a data structure to hold scatter plot representations
        to_plot = {kls: [[], []] for kls in self.classes_}

        # Compute the arcs around the circumference for each feature axis
        # TODO: make this an independent function for override
        s = np.array([
            (np.cos(t), np.sin(t))
            for t in [2.0 * np.pi * (i / float(ncols)) for i in range(ncols)]
        ])

        # Compute the locations of the scatter plot for each class
        # Normalize the data first to plot along the 0, 1 axis
        for i, row in enumerate(self.normalize(X)):
            row_ = np.repeat(np.expand_dims(row, axis=1), 2, axis=1)
            xy = (s * row_).sum(axis=0) / row.sum()
            kls = self.classes_[y[i]]

            to_plot[kls][0].append(xy[0])
            to_plot[kls][1].append(xy[1])

        # Add the scatter plots from the to_plot function
        # TODO: store these plots to add more instances to later
        # TODO: make this a separate function
        for kls in self.classes_:
            self.ax.scatter(to_plot[kls][0],
                            to_plot[kls][1],
                            color=self._colors[kls],
                            label=str(kls),
                            alpha=self.alpha,
                            **kwargs)

        # Add the circular axis path
        # TODO: Make this a separate function (along with labeling)
        self.ax.add_patch(
            patches.Circle((0.0, 0.0),
                           radius=1.0,
                           facecolor='none',
                           edgecolor='grey',
                           linewidth=.5))

        # Add the feature names
        for xy, name in zip(s, self.features_):
            # Add the patch indicating the location of the axis
            self.ax.add_patch(
                patches.Circle(xy, radius=0.025, facecolor='#777777'))

            # Offset the feature name away from the axis marker: left/right
            # by the sign of x, below/above by the sign of y
            if xy[0] < 0.0:
                dx, ha = -0.025, 'right'
            else:
                dx, ha = 0.025, 'left'

            if xy[1] < 0.0:
                dy, va = -0.025, 'top'
            else:
                dy, va = 0.025, 'bottom'

            self.ax.text(xy[0] + dx,
                         xy[1] + dy,
                         name,
                         ha=ha,
                         va=va,
                         size='small')

        self.ax.axis('equal')

        # Hand the axes back to the caller, as the other draw methods do
        return self.ax
示例#27
0
    def fit(self, X, y=None, **kwargs):
        """
        Fits the estimator to discover the feature importances described by
        the data, then draws those importances as a bar plot.

        Sets ``feature_importances_`` and ``features_`` on the visualizer,
        both sorted by ascending importance.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : dict
            Keyword arguments passed to the fit method of the estimator.

        Returns
        -------
        self : visualizer
            The fit method must always return self to support pipelines.
        """
        super(FeatureImportances, self).fit(X, y, **kwargs)

        # Get the feature importances from the model
        self.feature_importances_ = self._find_importances_param()

        # If feature importances is a multidim array, we're expecting a shape of
        # (n_classes, n_features) therefore we flatten by taking the average by
        # column to get shape (n_features,)  (see LogisticRegression)
        if self.feature_importances_.ndim > 1:
            self.feature_importances_ = np.mean(self.feature_importances_,
                                                axis=0)

        # TODO - as an alternative to the above flattening approach, explore an
        # alternative visualizer that uses the array shape to create a stacked
        # bar chart of feature importances for each class/feature combination

        # Apply absolute value filter before normalization
        if self.absolute:
            self.feature_importances_ = np.abs(self.feature_importances_)

        # Normalize features relative to the maximum magnitude
        if self.relative:
            # Use the largest absolute value: with signed importances (e.g.
            # negative coefficients) the plain maximum can be smaller than
            # the largest magnitude, or even negative, which would push
            # values past +/-100 or flip their signs.
            maxv = np.abs(self.feature_importances_).max()

            # Rebind to a new array instead of using `/=` and `*=`: the array
            # obtained above may still be the very object held by the fitted
            # estimator (NOTE(review): confirm in _find_importances_param),
            # and in-place true division also fails for integer arrays.
            # A zero maximum means every importance is zero, so there is
            # nothing to scale and dividing would only produce NaNs.
            if maxv != 0:
                self.feature_importances_ = (
                    self.feature_importances_ / maxv * 100.0
                )

        # Create labels for the feature importances
        # NOTE: this code is duplicated from MultiFeatureVisualizer
        if self.labels is None:
            # Use column names if a dataframe
            if is_dataframe(X):
                self.features_ = np.array(X.columns)

            # Otherwise use the column index as the labels
            else:
                _, ncols = X.shape
                self.features_ = np.arange(0, ncols)
        else:
            self.features_ = np.array(self.labels)

        # Sort the features and their importances (ascending)
        sort_idx = np.argsort(self.feature_importances_)
        self.features_ = self.features_[sort_idx]
        self.feature_importances_ = self.feature_importances_[sort_idx]

        # Draw the feature importances
        self.draw()
        return self
示例#28
0
    def fit(self, X, y=None, **kwargs):
        """
        Fits the estimator to discover the feature importances described by
        the data, then draws those importances as a bar plot.

        Sets ``feature_importances_``, ``classes_`` and ``features_`` on the
        visualizer. When the visualizer is not a classifier, ``classes_`` is
        set to None and ``stack`` is forced to False. A ``YellowbrickWarning``
        is issued when multi-dimensional importances are averaged because
        stacking is off.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : dict
            Keyword arguments passed to the fit method of the estimator.

        Returns
        -------
        self : visualizer
            The fit method must always return self to support pipelines.
        """
        # Super call fits the underlying estimator if it's not already fitted
        super(FeatureImportances, self).fit(X, y, **kwargs)

        # Get the feature importances from the model
        self.feature_importances_ = self._find_importances_param()

        # Get the classes from the model; stacking only makes sense when
        # there are classes to stack by
        if is_classifier(self):
            self.classes_ = self._find_classes_param()
        else:
            self.classes_ = None
            self.stack = False

        # If we are not stacking and feature importances is a multidim array,
        # we're expecting a shape of (n_classes, n_features), therefore we
        # flatten by taking the average by column to get shape (n_features,)
        # (see LogisticRegression)
        if not self.stack and self.feature_importances_.ndim > 1:
            self.feature_importances_ = np.mean(self.feature_importances_,
                                                axis=0)
            warnings.warn(
                ("detected multi-dimensional feature importances but stack=False, "
                 "using mean to aggregate them."),
                YellowbrickWarning,
            )

        # Apply absolute value filter before normalization
        if self.absolute:
            self.feature_importances_ = np.abs(self.feature_importances_)

        # Normalize features relative to the maximum magnitude
        if self.relative:
            maxv = np.abs(self.feature_importances_).max()

            # Rebind to a new array instead of using `/=` and `*=`: the array
            # obtained above may still be the very object held by the fitted
            # estimator (NOTE(review): confirm in _find_importances_param),
            # and in-place true division also fails for integer arrays.
            # A zero maximum means every importance is zero, so there is
            # nothing to scale and dividing would only produce NaNs.
            if maxv != 0:
                self.feature_importances_ = (
                    self.feature_importances_ / maxv * 100.0
                )

        # Create labels for the feature importances
        # NOTE: this code is duplicated from MultiFeatureVisualizer
        if self.labels is None:
            # Use column names if a dataframe
            if is_dataframe(X):
                self.features_ = np.array(X.columns)

            # Otherwise use the column index as the labels
            else:
                _, ncols = X.shape
                self.features_ = np.arange(0, ncols)
        else:
            self.features_ = np.array(self.labels)

        # Sort the features and their importances (ascending); when stacked
        # the ordering is driven by the mean importance across classes
        if self.stack:
            sort_idx = np.argsort(np.mean(self.feature_importances_, 0))
            self.features_ = self.features_[sort_idx]
            self.feature_importances_ = self.feature_importances_[:, sort_idx]
        else:
            sort_idx = np.argsort(self.feature_importances_)
            self.features_ = self.features_[sort_idx]
            self.feature_importances_ = self.feature_importances_[sort_idx]

        # Draw the feature importances
        self.draw()
        return self
示例#29
0
    def fit(self, X, y=None, **kwargs):
        """
        Prepare the data and drawing state, then delegate to the parent fit.

        Fit is the primary drawing input for the visualization since it has
        both the X and y data required for the viz and the transform method
        does not. It fills in ``features_`` and ``classes_`` when they are
        None, builds the class-to-color mapping and the per-feature tick
        positions, subsamples the data and optionally normalizes it.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : dict
            Pass generic arguments to the drawing method

        Returns
        -------
        self : instance
            Returns the instance of the transformer/visualizer
        """
        # Strip pandas containers down to arrays, remembering the column
        # names in case they are needed as feature labels
        if is_dataframe(X):
            column_names, X = X.columns, X.values
        else:
            column_names = None

        if is_series(y):
            y = y.values

        # Feature labels: keep what is already set, otherwise prefer the
        # dataframe columns and fall back to integer column positions
        if self.features_ is None:
            if column_names is not None:
                self.features_ = np.array(column_names)
            else:
                self.features_ = np.arange(0, X.shape[1])

        # One tick per feature
        self._increments = np.arange(len(self.features_))

        # Discover the classes from the full target (before sampling) so that
        # every class is represented in the color mapping
        # NOTE: np.unique also specifies the ordering of the classes
        if self.classes_ is None:
            self.classes_ = list(map(str, np.unique(y)))

        # Map each class to a color
        # TODO: Allow both colormap, listed colors, and palette definition
        # TODO: Make this an independent function or property for override!
        palette = resolve_colors(
            n_colors=len(self.classes_),
            colormap=self.colormap,
            colors=self.color,
        )
        self._colors = {kls: color for kls, color in zip(self.classes_, palette)}

        # Subsample instances
        X, y = self._subsample(X, y)

        # Normalize instances with the configured normalizer, if any
        if self.normalize is not None:
            normalizer = self.NORMALIZERS[self.normalize]
            X = normalizer.fit_transform(X)

        # The super method calls draw and returns self
        return super(ParallelCoordinates, self).fit(X, y, **kwargs)