示例#1
0
文件: featureset.py 项目: nimmen/skll
class FeatureSet(object):

    """
    Encapsulation of all of the features, values, and metadata about a given
    set of data.

    This replaces ``ExamplesTuple`` from older versions.

    :param name: The name of this feature set.
    :type name: str
    :param ids: Example IDs for this set.  A list is converted to an array.
    :type ids: np.array
    :param labels: Labels for this set.  A list is converted to an array.  If
                   omitted while ``features`` is given, every label is set to
                   NaN.
    :type labels: np.array
    :param features: The features for each instance represented as either a
                     list of dictionaries or an array-like (if `vectorizer` is
                     also specified).
    :type features: list of dict or array-like
    :param vectorizer: Vectorizer that created feature matrix.  If
                       ``features`` is a list and no vectorizer is given, a
                       sparse ``NewDictVectorizer`` is created and fitted.
    :type vectorizer: DictVectorizer or FeatureHasher

    :raises ValueError: If ``features`` is given and ``ids`` is ``None``, or
                        if the number of IDs or labels differs from the
                        number of feature rows.

    .. note::
       If ids, labels, and/or features are not None, the number of rows in
       each array must be equal.
    """

    def __init__(self, name, ids, labels=None, features=None,
                 vectorizer=None):
        super(FeatureSet, self).__init__()
        self.name = name
        if isinstance(ids, list):
            ids = np.array(ids)
        self.ids = ids
        if isinstance(labels, list):
            labels = np.array(labels)
        self.labels = labels
        self.features = features
        self.vectorizer = vectorizer
        # Convert list of dicts to a feature matrix
        if isinstance(self.features, list):
            if self.vectorizer is None:
                self.vectorizer = NewDictVectorizer(sparse=True)
            self.features = self.vectorizer.fit_transform(self.features)
        if self.features is not None:
            num_feats = self.features.shape[0]
            if self.ids is None:
                raise ValueError('A list of IDs is required')
            num_ids = self.ids.shape[0]
            if num_feats != num_ids:
                raise ValueError(('Number of IDs (%s) does not equal '
                                  'number of feature rows (%s)') % (num_ids,
                                                                    num_feats))
            if self.labels is None:
                # Unlabeled data is represented by one NaN label per row.
                self.labels = np.full(num_feats, np.nan)
            num_labels = self.labels.shape[0]
            if num_feats != num_labels:
                raise ValueError(('Number of labels (%s) does not equal '
                                  'number of feature rows (%s)') % (num_labels,
                                                                    num_feats))

    def __contains__(self, value):
        """
        Check if example ID is in set
        """
        return value in self.ids

    def __eq__(self, other):
        """
        Check whether two featuresets are the same.

        Comparing against an object that is not a ``FeatureSet`` returns
        ``NotImplemented`` so that Python falls back to its default handling.

        .. note::
           We consider feature values to be equal if any differences are in the
           sixth decimal place or higher.

        .. note::
           The indices of both underlying sparse feature matrices are sorted
           in place as a side effect of the comparison.
        """
        if not isinstance(other, FeatureSet):
            return NotImplemented

        # We need to sort the indices for the underlying
        # feature sparse matrix in case we haven't done
        # so already.
        if not self.features.has_sorted_indices:
            self.features.sort_indices()
        if not other.features.has_sorted_indices:
            other.features.sort_indices()

        return bool(
            self.ids.shape == other.ids.shape and
            self.labels.shape == other.labels.shape and
            self.features.shape == other.features.shape and
            # A different number of stored values means the matrices
            # differ; checking this first keeps the element-wise
            # comparisons below from failing on mismatched lengths.
            self.features.data.shape == other.features.data.shape and
            (self.ids == other.ids).all() and
            (self.labels == other.labels).all() and
            np.allclose(self.features.data, other.features.data,
                        rtol=1e-6) and
            (self.features.indices == other.features.indices).all() and
            (self.features.indptr == other.features.indptr).all() and
            self.vectorizer == other.vectorizer)

    def __iter__(self):
        """
        Iterate through (ID, label, feature_dict) tuples in feature set.

        Nothing is yielded when the feature set has no features.

        :raises ValueError: If features are present but the vectorizer is not
                            a ``DictVectorizer``.
        """
        if self.features is None:
            return
        if not isinstance(self.vectorizer, DictVectorizer):
            raise ValueError('FeatureSets can only be iterated through if '
                             'they use a DictVectorizer for their feature '
                             'vectorizer.')
        for id_, label_, feats in zip(self.ids, self.labels, self.features):
            # When calling inverse_transform we have to add [0] to get the
            # results for the current instance because it always returns a
            # 2D array
            yield (id_, label_,
                   self.vectorizer.inverse_transform(feats)[0])

    def __len__(self):
        """
        :returns: The number of examples: the number of feature rows, or the
                  number of IDs when there is no feature matrix (0 if there
                  are no IDs either).
        """
        if self.features is not None:
            return self.features.shape[0]
        return 0 if self.ids is None else len(self.ids)

    def __add__(self, other):
        """
        Combine two feature sets to create a new one.  Both sets must contain
        the same IDs; the rows of ``other`` are reordered to match the ID
        order of ``self`` before the feature columns are concatenated.

        :raises ValueError: If the sets of IDs differ, the vectorizer types
                            differ, FeatureHashers use different
                            ``n_features``, feature names overlap, or the
                            labels conflict.
        """

        # Check that the sets of IDs are equal
        if set(self.ids) != set(other.ids):
            raise ValueError('IDs are not the same in each feature set')
        # Compute the relative ordering of IDs for merging the features
        # and labels.
        ids_indices = {id_: index for index, id_ in enumerate(other.ids)}
        relative_order = [ids_indices[self_id] for self_id in self.ids]

        # Initialize the new feature set with a name and the IDs.
        new_set = FeatureSet('+'.join(sorted([self.name, other.name])),
                             deepcopy(self.ids))

        # Combine feature matrices and vectorizers.
        if not isinstance(self.vectorizer, type(other.vectorizer)):
            raise ValueError('Cannot combine FeatureSets because they are '
                             'not both using the same type of feature '
                             'vectorizer (e.g., DictVectorizer, '
                             'FeatureHasher)')
        uses_feature_hasher = isinstance(self.vectorizer, FeatureHasher)
        if uses_feature_hasher:
            if (self.vectorizer.n_features !=
                    other.vectorizer.n_features):
                raise ValueError('Cannot combine FeatureSets that uses '
                                 'FeatureHashers with different values of '
                                 'n_features setting.')
        else:
            # Check for duplicate feature names.
            if (set(self.vectorizer.feature_names_) &
                    set(other.vectorizer.feature_names_)):
                raise ValueError('Cannot combine FeatureSets because they '
                                 'have duplicate feature names.')
        num_feats = self.features.shape[1]

        new_set.features = sp.hstack([self.features,
                                      other.features[relative_order]],
                                     'csr')
        new_set.vectorizer = deepcopy(self.vectorizer)
        if not uses_feature_hasher:
            # Columns from ``other`` come after the columns of ``self``, so
            # their vocabulary indices are shifted by the width of ``self``.
            for feat_name, index in other.vectorizer.vocabulary_.items():
                new_set.vectorizer.vocabulary_[feat_name] = (index +
                                                             num_feats)
            other_names = other.vectorizer.feature_names_
            new_set.vectorizer.feature_names_.extend(other_names)

        # If either set has labels, check that they don't conflict.
        if self.has_labels:
            # labels should be the same for each FeatureSet, so store once.
            if other.has_labels and \
                    not np.all(self.labels == other.labels[relative_order]):
                raise ValueError('Feature sets have conflicting labels for '
                                 'examples with the same ID.')
            new_set.labels = deepcopy(self.labels)
        else:
            new_set.labels = deepcopy(other.labels[relative_order])

        return new_set

    def filter(self, ids=None, labels=None, features=None, inverse=False):
        """
        Removes or keeps features and/or examples from the Featureset depending
        on the passed in parameters.  The FeatureSet is modified in place.

        :param ids: Examples to keep in the FeatureSet. If `None`, no ID
                    filtering takes place.
        :type ids: list of str/float
        :param labels: labels that we want to retain examples for. If `None`,
                        no label filtering takes place.
        :type labels: list of str/float
        :param features: Features to keep in the FeatureSet. To help with
                         filtering string-valued features that were converted
                         to sequences of boolean features when read in, any
                         features in the FeatureSet that contain a `=` will be
                         split on the first occurrence and the prefix will be
                         checked to see if it is in `features`.
                         If `None`, no feature filtering takes place.
                         Cannot be used if FeatureSet uses a FeatureHasher for
                         vectorization.
        :type features: list of str
        :param inverse: Instead of keeping features and/or examples in lists,
                        remove them.
        :type inverse: bool

        :raises ValueError: If ``features`` is given and the vectorizer is a
                            ``FeatureHasher``.
        """
        # Construct mask that indicates which examples to keep
        mask = np.ones(len(self), dtype=bool)
        if ids is not None:
            mask = np.logical_and(mask, np.in1d(self.ids, ids))
        if labels is not None:
            mask = np.logical_and(mask, np.in1d(self.labels, labels))

        if inverse and (labels is not None or ids is not None):
            mask = np.logical_not(mask)

        # Remove examples not in mask (labels/features may be absent)
        self.ids = self.ids[mask]
        if self.labels is not None:
            self.labels = self.labels[mask]
        if self.features is not None:
            self.features = self.features[mask, :]

        # Filter features
        if features is not None:
            if isinstance(self.vectorizer, FeatureHasher):
                raise ValueError('FeatureSets with FeatureHasher vectorizers'
                                 ' cannot be filtered by feature.')
            # An explicit integer dtype keeps the index array usable even
            # when nothing matches (an empty array would default to float).
            columns = np.array(sorted({feat_num for feat_name, feat_num in
                                       iteritems(self.vectorizer.vocabulary_)
                                       if (feat_name in features or
                                           feat_name.split('=', 1)[0] in
                                           features)}),
                               dtype=int)
            if inverse:
                all_columns = np.arange(self.features.shape[1])
                columns = all_columns[np.logical_not(np.in1d(all_columns,
                                                             columns))]
            self.features = self.features[:, columns]
            self.vectorizer.restrict(columns, indices=True)

    def filtered_iter(self, ids=None, labels=None, features=None,
                      inverse=False):
        """
        A version of ``__iter__`` that retains only the specified features
        and/or examples from the output.

        :param ids: Examples in the FeatureSet to keep. If `None`, no ID
                    filtering takes place.
        :type ids: list of str/float
        :param labels: labels that we want to retain examples for. If `None`,
                       no label filtering takes place.
        :type labels: list of str/float
        :param features: Features in the FeatureSet to keep. To help with
                         filtering string-valued features that were converted
                         to sequences of boolean features when read in, any
                         features in the FeatureSet that contain a `=` will be
                         split on the first occurrence and the prefix will be
                         checked to see if it is in `features`.
                         Cannot be used if FeatureSet uses a FeatureHasher for
                         vectorization.  As implemented, when this is `None`
                         and ``inverse`` is false an empty feature dict is
                         yielded for every example; when it is `None` and
                         ``inverse`` is true the full feature dict is yielded.
        :type features: list of str
        :param inverse: Instead of keeping features and/or examples in lists,
                        remove them.
        :type inverse: bool

        :raises ValueError: If features are present but the vectorizer is not
                            a ``DictVectorizer``.
        """
        if self.features is not None and not isinstance(self.vectorizer,
                                                        DictVectorizer):
            raise ValueError('FeatureSets can only be iterated through if they'
                             ' use a DictVectorizer for their feature '
                             'vectorizer.')

        for id_, label_, feats in zip(self.ids, self.labels, self.features):
            # Skip instances with IDs not in filter
            if ids is not None and (id_ in ids) == inverse:
                continue
            # Skip instances with labels not in filter
            if labels is not None and (label_ in labels) == inverse:
                continue
            feat_dict = self.vectorizer.inverse_transform(feats)[0]
            if features is not None:
                feat_dict = {name: value for name, value in
                             iteritems(feat_dict) if
                             (inverse != (name in features or
                                          name.split('=', 1)[0] in features))}
            elif not inverse:
                # NOTE(review): with no feature list and ``inverse=False``
                # every example gets an empty feature dict, which differs
                # from how ``filter`` treats ``features=None``.  Behavior is
                # preserved here; confirm the intent with callers.
                feat_dict = {}
            yield id_, label_, feat_dict

    def __sub__(self, other):
        """
        :returns: a copy of ``self`` with all features in ``other`` removed.
        """
        new_set = deepcopy(self)
        new_set.filter(features=other.vectorizer.feature_names_,
                       inverse=True)
        return new_set

    @property
    def has_labels(self):
        """
        :returns: Whether or not this FeatureSet has labels.  This is
                  ``False`` when ``labels`` is ``None``, or when the labels
                  are floating point and either empty or have a NaN minimum
                  (i.e. contain a NaN); otherwise ``True``.
        """
        labels = self.labels
        if labels is None:
            return False
        # ``np.floating`` covers every float width, not just float64.
        if not np.issubdtype(labels.dtype, np.floating):
            return True
        # ``np.min`` raises on an empty array, so handle that case first.
        if labels.size == 0:
            return False
        return not np.isnan(np.min(labels))

    def __str__(self):
        """
        :returns: a string representation of FeatureSet
        """
        return str(self.__dict__)

    def __repr__(self):
        """
        :returns:  a string representation of FeatureSet
        """
        return repr(self.__dict__)

    def __getitem__(self, value):
        """
        :returns: A specific example by row number as an
                  ``(id, label, feature_dict)`` tuple, or if given a slice,
                  a new FeatureSet containing a subset of the data.  The
                  label is ``None`` when there are no labels and the feature
                  dict is empty when there are no features.
        """
        # Check if we're slicing
        if isinstance(value, slice):
            sliced_ids = self.ids[value]
            sliced_feats = (self.features[value] if self.features is not None
                            else None)
            sliced_labels = (self.labels[value] if self.labels is not None
                             else None)
            return FeatureSet('{}_{}'.format(self.name, value), sliced_ids,
                              features=sliced_feats, labels=sliced_labels,
                              vectorizer=self.vectorizer)
        else:
            label = self.labels[value] if self.labels is not None else None
            # Only touch the feature matrix when there is one.
            if self.features is not None:
                feats = self.features[value, :]
                features = self.vectorizer.inverse_transform(feats)[0]
            else:
                features = {}
            return self.ids[value], label, features
示例#2
0
class FeatureSet(object):

    """
    Encapsulation of all of the features, values, and metadata about a given
    set of data.

    This replaces ``ExamplesTuple`` from older versions.

    :param name: The name of this feature set.
    :type name: str
    :param ids: Example IDs for this set.  A list is converted to an array.
    :type ids: np.array
    :param labels: Labels for this set.  A list is converted to an array.  If
                   omitted while ``features`` is given, every label is set to
                   NaN.
    :type labels: np.array
    :param features: The features for each instance represented as either a
                     list of dictionaries or an array-like (if `vectorizer` is
                     also specified).
    :type features: list of dict or array-like
    :param vectorizer: Vectorizer that created feature matrix.  If
                       ``features`` is a list and no vectorizer is given, a
                       sparse ``NewDictVectorizer`` is created and fitted.
    :type vectorizer: DictVectorizer or FeatureHasher

    :raises ValueError: If ``features`` is given and ``ids`` is ``None``, or
                        if the number of IDs or labels differs from the
                        number of feature rows.

    .. note::
       If ids, labels, and/or features are not None, the number of rows in
       each array must be equal.
    """

    def __init__(self, name, ids, labels=None, features=None,
                 vectorizer=None):
        super(FeatureSet, self).__init__()
        self.name = name
        if isinstance(ids, list):
            ids = np.array(ids)
        self.ids = ids
        if isinstance(labels, list):
            labels = np.array(labels)
        self.labels = labels
        self.features = features
        self.vectorizer = vectorizer
        # Convert list of dicts to a feature matrix
        if isinstance(self.features, list):
            if self.vectorizer is None:
                self.vectorizer = NewDictVectorizer(sparse=True)
            self.features = self.vectorizer.fit_transform(self.features)
        if self.features is not None:
            num_feats = self.features.shape[0]
            if self.ids is None:
                raise ValueError('A list of IDs is required')
            num_ids = self.ids.shape[0]
            if num_feats != num_ids:
                raise ValueError(('Number of IDs (%s) does not equal '
                                  'number of feature rows (%s)') % (num_ids,
                                                                    num_feats))
            if self.labels is None:
                # Unlabeled data is represented by one NaN label per row.
                self.labels = np.full(num_feats, np.nan)
            num_labels = self.labels.shape[0]
            if num_feats != num_labels:
                raise ValueError(('Number of labels (%s) does not equal '
                                  'number of feature rows (%s)') % (num_labels,
                                                                    num_feats))

    def __contains__(self, value):
        """
        Check if example ID is in set
        """
        return value in self.ids

    def __eq__(self, other):
        """
        Check whether two featuresets are the same.

        Comparing against an object that is not a ``FeatureSet`` returns
        ``NotImplemented`` so that Python falls back to its default handling.

        .. note::
           We consider feature values to be equal if any differences are in the
           sixth decimal place or higher.

        .. note::
           The indices of both underlying sparse feature matrices are sorted
           in place as a side effect of the comparison.
        """
        if not isinstance(other, FeatureSet):
            return NotImplemented

        # We need to sort the indices for the underlying
        # feature sparse matrix in case we haven't done
        # so already.
        if not self.features.has_sorted_indices:
            self.features.sort_indices()
        if not other.features.has_sorted_indices:
            other.features.sort_indices()

        return bool(
            self.ids.shape == other.ids.shape and
            self.labels.shape == other.labels.shape and
            self.features.shape == other.features.shape and
            # A different number of stored values means the matrices
            # differ; checking this first keeps the element-wise
            # comparisons below from failing on mismatched lengths.
            self.features.data.shape == other.features.data.shape and
            (self.ids == other.ids).all() and
            (self.labels == other.labels).all() and
            np.allclose(self.features.data, other.features.data,
                        rtol=1e-6) and
            (self.features.indices == other.features.indices).all() and
            (self.features.indptr == other.features.indptr).all() and
            self.vectorizer == other.vectorizer)

    def __iter__(self):
        """
        Iterate through (ID, label, feature_dict) tuples in feature set.

        Nothing is yielded when the feature set has no features.

        :raises ValueError: If features are present but the vectorizer is not
                            a ``DictVectorizer``.
        """
        if self.features is None:
            return
        if not isinstance(self.vectorizer, DictVectorizer):
            raise ValueError('FeatureSets can only be iterated through if '
                             'they use a DictVectorizer for their feature '
                             'vectorizer.')
        for id_, label_, feats in zip(self.ids, self.labels, self.features):
            # When calling inverse_transform we have to add [0] to get the
            # results for the current instance because it always returns a
            # 2D array
            yield (id_, label_,
                   self.vectorizer.inverse_transform(feats)[0])

    def __len__(self):
        """
        :returns: The number of examples: the number of feature rows, or the
                  number of IDs when there is no feature matrix (0 if there
                  are no IDs either).
        """
        if self.features is not None:
            return self.features.shape[0]
        return 0 if self.ids is None else len(self.ids)

    def __add__(self, other):
        """
        Combine two feature sets to create a new one.  Both sets must contain
        the same IDs; the rows of ``other`` are reordered to match the ID
        order of ``self`` before the feature columns are concatenated.

        :raises ValueError: If the sets of IDs differ, the vectorizer types
                            differ, FeatureHashers use different
                            ``n_features``, feature names overlap, or the
                            labels conflict.
        """

        # Check that the sets of IDs are equal
        if set(self.ids) != set(other.ids):
            raise ValueError('IDs are not the same in each feature set')
        # Compute the relative ordering of IDs for merging the features
        # and labels.
        ids_indices = {id_: index for index, id_ in enumerate(other.ids)}
        relative_order = [ids_indices[self_id] for self_id in self.ids]

        # Initialize the new feature set with a name and the IDs.
        new_set = FeatureSet('+'.join(sorted([self.name, other.name])),
                             deepcopy(self.ids))

        # Combine feature matrices and vectorizers.
        if not isinstance(self.vectorizer, type(other.vectorizer)):
            raise ValueError('Cannot combine FeatureSets because they are '
                             'not both using the same type of feature '
                             'vectorizer (e.g., DictVectorizer, '
                             'FeatureHasher)')
        uses_feature_hasher = isinstance(self.vectorizer, FeatureHasher)
        if uses_feature_hasher:
            if (self.vectorizer.n_features !=
                    other.vectorizer.n_features):
                raise ValueError('Cannot combine FeatureSets that uses '
                                 'FeatureHashers with different values of '
                                 'n_features setting.')
        else:
            # Check for duplicate feature names.
            if (set(self.vectorizer.feature_names_) &
                    set(other.vectorizer.feature_names_)):
                raise ValueError('Cannot combine FeatureSets because they '
                                 'have duplicate feature names.')
        num_feats = self.features.shape[1]

        new_set.features = sp.hstack([self.features,
                                      other.features[relative_order]],
                                     'csr')
        new_set.vectorizer = deepcopy(self.vectorizer)
        if not uses_feature_hasher:
            # Columns from ``other`` come after the columns of ``self``, so
            # their vocabulary indices are shifted by the width of ``self``.
            for feat_name, index in other.vectorizer.vocabulary_.items():
                new_set.vectorizer.vocabulary_[feat_name] = (index +
                                                             num_feats)
            other_names = other.vectorizer.feature_names_
            new_set.vectorizer.feature_names_.extend(other_names)

        # If either set has labels, check that they don't conflict.
        if self.has_labels:
            # labels should be the same for each FeatureSet, so store once.
            if other.has_labels and \
                    not np.all(self.labels == other.labels[relative_order]):
                raise ValueError('Feature sets have conflicting labels for '
                                 'examples with the same ID.')
            new_set.labels = deepcopy(self.labels)
        else:
            new_set.labels = deepcopy(other.labels[relative_order])

        return new_set

    def filter(self, ids=None, labels=None, features=None, inverse=False):
        """
        Removes or keeps features and/or examples from the Featureset depending
        on the passed in parameters.  The FeatureSet is modified in place.

        :param ids: Examples to keep in the FeatureSet. If `None`, no ID
                    filtering takes place.
        :type ids: list of str/float
        :param labels: labels that we want to retain examples for. If `None`,
                        no label filtering takes place.
        :type labels: list of str/float
        :param features: Features to keep in the FeatureSet. To help with
                         filtering string-valued features that were converted
                         to sequences of boolean features when read in, any
                         features in the FeatureSet that contain a `=` will be
                         split on the first occurrence and the prefix will be
                         checked to see if it is in `features`.
                         If `None`, no feature filtering takes place.
                         Cannot be used if FeatureSet uses a FeatureHasher for
                         vectorization.
        :type features: list of str
        :param inverse: Instead of keeping features and/or examples in lists,
                        remove them.
        :type inverse: bool

        :raises ValueError: If ``features`` is given and the vectorizer is a
                            ``FeatureHasher``.
        """
        # Construct mask that indicates which examples to keep
        mask = np.ones(len(self), dtype=bool)
        if ids is not None:
            mask = np.logical_and(mask, np.in1d(self.ids, ids))
        if labels is not None:
            mask = np.logical_and(mask, np.in1d(self.labels, labels))

        if inverse and (labels is not None or ids is not None):
            mask = np.logical_not(mask)

        # Remove examples not in mask (labels/features may be absent)
        self.ids = self.ids[mask]
        if self.labels is not None:
            self.labels = self.labels[mask]
        if self.features is not None:
            self.features = self.features[mask, :]

        # Filter features
        if features is not None:
            if isinstance(self.vectorizer, FeatureHasher):
                raise ValueError('FeatureSets with FeatureHasher vectorizers'
                                 ' cannot be filtered by feature.')
            # An explicit integer dtype keeps the index array usable even
            # when nothing matches (an empty array would default to float).
            columns = np.array(sorted({feat_num for feat_name, feat_num in
                                       iteritems(self.vectorizer.vocabulary_)
                                       if (feat_name in features or
                                           feat_name.split('=', 1)[0] in
                                           features)}),
                               dtype=int)
            if inverse:
                all_columns = np.arange(self.features.shape[1])
                columns = all_columns[np.logical_not(np.in1d(all_columns,
                                                             columns))]
            self.features = self.features[:, columns]
            self.vectorizer.restrict(columns, indices=True)

    def filtered_iter(self, ids=None, labels=None, features=None,
                      inverse=False):
        """
        A version of ``__iter__`` that retains only the specified features
        and/or examples from the output.

        :param ids: Examples in the FeatureSet to keep. If `None`, no ID
                    filtering takes place.
        :type ids: list of str/float
        :param labels: labels that we want to retain examples for. If `None`,
                       no label filtering takes place.
        :type labels: list of str/float
        :param features: Features in the FeatureSet to keep. To help with
                         filtering string-valued features that were converted
                         to sequences of boolean features when read in, any
                         features in the FeatureSet that contain a `=` will be
                         split on the first occurrence and the prefix will be
                         checked to see if it is in `features`.
                         Cannot be used if FeatureSet uses a FeatureHasher for
                         vectorization.  As implemented, when this is `None`
                         and ``inverse`` is false an empty feature dict is
                         yielded for every example; when it is `None` and
                         ``inverse`` is true the full feature dict is yielded.
        :type features: list of str
        :param inverse: Instead of keeping features and/or examples in lists,
                        remove them.
        :type inverse: bool

        :raises ValueError: If features are present but the vectorizer is not
                            a ``DictVectorizer``.
        """
        if self.features is not None and not isinstance(self.vectorizer,
                                                        DictVectorizer):
            raise ValueError('FeatureSets can only be iterated through if they'
                             ' use a DictVectorizer for their feature '
                             'vectorizer.')

        for id_, label_, feats in zip(self.ids, self.labels, self.features):
            # Skip instances with IDs not in filter
            if ids is not None and (id_ in ids) == inverse:
                continue
            # Skip instances with labels not in filter
            if labels is not None and (label_ in labels) == inverse:
                continue
            feat_dict = self.vectorizer.inverse_transform(feats)[0]
            if features is not None:
                feat_dict = {name: value for name, value in
                             iteritems(feat_dict) if
                             (inverse != (name in features or
                                          name.split('=', 1)[0] in features))}
            elif not inverse:
                # NOTE(review): with no feature list and ``inverse=False``
                # every example gets an empty feature dict, which differs
                # from how ``filter`` treats ``features=None``.  Behavior is
                # preserved here; confirm the intent with callers.
                feat_dict = {}
            yield id_, label_, feat_dict

    def __sub__(self, other):
        """
        :returns: a copy of ``self`` with all features in ``other`` removed.
        """
        new_set = deepcopy(self)
        new_set.filter(features=other.vectorizer.feature_names_,
                       inverse=True)
        return new_set

    @property
    def has_labels(self):
        """
        :returns: Whether or not this FeatureSet has labels.  This is
                  ``False`` when ``labels`` is ``None``, or when the labels
                  are floating point and either empty or have a NaN minimum
                  (i.e. contain a NaN); otherwise ``True``.
        """
        labels = self.labels
        if labels is None:
            return False
        # ``np.floating`` covers every float width, not just float64.
        if not np.issubdtype(labels.dtype, np.floating):
            return True
        # ``np.min`` raises on an empty array, so handle that case first.
        if labels.size == 0:
            return False
        return not np.isnan(np.min(labels))

    def __str__(self):
        """
        :returns: a string representation of FeatureSet
        """
        return str(self.__dict__)

    def __repr__(self):
        """
        :returns:  a string representation of FeatureSet
        """
        return repr(self.__dict__)

    def __getitem__(self, value):
        """
        :returns: A specific example by row number as an
                  ``(id, label, feature_dict)`` tuple, or if given a slice,
                  a new FeatureSet containing a subset of the data.  The
                  label is ``None`` when there are no labels and the feature
                  dict is empty when there are no features.
        """
        # Check if we're slicing
        if isinstance(value, slice):
            sliced_ids = self.ids[value]
            sliced_feats = (self.features[value] if self.features is not None
                            else None)
            sliced_labels = (self.labels[value] if self.labels is not None
                             else None)
            return FeatureSet('{}_{}'.format(self.name, value), sliced_ids,
                              features=sliced_feats, labels=sliced_labels,
                              vectorizer=self.vectorizer)
        else:
            label = self.labels[value] if self.labels is not None else None
            # Only touch the feature matrix when there is one.
            if self.features is not None:
                feats = self.features[value, :]
                features = self.vectorizer.inverse_transform(feats)[0]
            else:
                features = {}
            return self.ids[value], label, features
示例#3
0
class FeatureSet(object):

    """
    Encapsulation of all of the features, values, and metadata about a given
    set of data.

    This replaces ExamplesTuple in older versions.

    :param name: The name of this feature set.
    :type name: str
    :param ids: Example IDs for this set.  If ``None`` while ``features`` is
                given, one placeholder ID is created per feature row.
    :type ids: np.array
    :param classes: Classes for this set.
    :type classes: np.array
    :param features: The features for each instance represented as either a
                     list of dictionaries or an array-like (if `vectorizer`
                     is also specified).
    :type features: list of dict or array-like
    :param vectorizer: Vectorizer that created feature matrix.
    :type vectorizer: DictVectorizer or FeatureHasher

    .. note::
       If ids, classes, and/or features are not None, the number of rows in
       each array must be equal.
    """

    def __init__(self, name, ids=None, classes=None, features=None,
                 vectorizer=None):
        super(FeatureSet, self).__init__()
        self.name = name
        if isinstance(ids, list):
            ids = np.array(ids)
        self.ids = ids
        if isinstance(classes, list):
            classes = np.array(classes)
        self.classes = classes
        self.features = features
        self.vectorizer = vectorizer
        # Convert list of dicts to numpy array
        if isinstance(self.features, list):
            if self.vectorizer is None:
                self.vectorizer = NewDictVectorizer(sparse=True)
            self.features = self.vectorizer.fit_transform(self.features)
        if self.features is not None:
            num_feats = self.features.shape[0]
            if self.ids is None:
                self.ids = np.empty(num_feats)
                self.ids.fill(None)
            num_ids = self.ids.shape[0]
            if num_feats != num_ids:
                raise ValueError(('Number of IDs (%s) does not equal '
                                  'number of feature rows (%s)') % (num_ids,
                                                                    num_feats))
            if self.classes is None:
                self.classes = np.empty(num_feats)
                self.classes.fill(None)
            num_classes = self.classes.shape[0]
            if num_feats != num_classes:
                raise ValueError(('Number of classes ({}) does not equal '
                                  'number of feature rows({})') % (num_classes,
                                                                   num_feats))

    def __contains__(self, value):
        pass

    def __iter__(self):
        '''
        Iterate through (ID, class, feature_dict) tuples in feature set.
        '''
        if self.features is not None:
            if not isinstance(self.vectorizer, DictVectorizer):
                raise ValueError('FeatureSets can only be iterated through if '
                                 'they use a DictVectorizer for their feature '
                                 'vectorizer.')
            for id_, class_, feats in zip(self.ids, self.classes,
                                          self.features):
                # When calling inverse_transform we have to add [0] to get the
                # results for the current instance because it always returns a
                # 2D array
                yield (id_, class_,
                       self.vectorizer.inverse_transform(feats)[0])
        else:
            return

    def __len__(self):
        return self.features.shape[1]

    def __add__(self, other):
        '''
        Combine two feature sets to create a new one.  This is done assuming
        they both have the same instances with the same IDs in the same order.
        '''
        new_set = FeatureSet('+'.join(sorted([self.name, other.name])))
        # Combine feature matrices and vectorizers
        if self.features is not None:
            if not isinstance(self.vectorizer, type(other.vectorizer)):
                raise ValueError('Cannot combine FeatureSets because they are '
                                 'not both using the same type of feature '
                                 'vectorizer (e.g., DictVectorizer, '
                                 'FeatureHasher)')
            feature_hasher = isinstance(self.vectorizer, FeatureHasher)
            if feature_hasher:
                if (self.vectorizer.n_features !=
                        other.vectorizer.n_features):
                    raise ValueError('Cannot combine FeatureSets that uses '
                                     'FeatureHashers with different values of '
                                     'n_features setting.')
            else:
                # Check for duplicate feature names
                if (set(self.vectorizer.feature_names_) &
                        set(other.vectorizer.feature_names_)):
                    raise ValueError('Cannot combine FeatureSets because they '
                                     'have duplicate feature names.')
            num_feats = self.features.shape[1]
            new_set.features = sp.hstack([self.features, other.features],
                                         'csr')
            new_set.vectorizer = deepcopy(self.vectorizer)
            if not feature_hasher:
                for feat_name, index in other.vectorizer.vocabulary_.items():
                    new_set.vectorizer.vocabulary_[feat_name] = (index +
                                                                 num_feats)
                other_names = other.vectorizer.feature_names_
                new_set.vectorizer.feature_names_.extend(other_names)
        else:
            new_set.features = deepcopy(other.features)
            new_set.vectorizer = deepcopy(other.vectorizer)

        # Check that IDs are in the same order
        if self.has_ids:
            if other.has_ids and not np.all(self.ids == other.ids):
                raise ValueError('IDs are not in the same order in each '
                                 'feature set')
            else:
                new_set.ids = deepcopy(self.ids)
        else:
            new_set.ids = deepcopy(other.ids)

        # If either set has labels, check that they don't conflict
        if self.has_classes:
            # Classes should be the same for each ExamplesTuple, so store once
            if other.has_classes and not np.all(self.classes == other.classes):
                raise ValueError('Feature sets have conflicting labels for '
                                 'examples with the same ID.')
            else:
                new_set.classes = deepcopy(self.classes)
        else:
            new_set.classes = deepcopy(other.classes)
        return new_set

    def filter(self, ids=None, classes=None, features=None, inverse=False):
        '''
        Removes or keeps features and/or examples from the Featureset depending
        on the passed in parameters.

        :param ids: Examples to keep in the FeatureSet. If `None`, no ID
                    filtering takes place.
        :type ids: list of str/float
        :param classes: Classes that we want to retain examples for. If `None`,
                        no class filtering takes place.
        :type classes: list of str/float
        :param features: Features to keep in the FeatureSet. To help with
                         filtering string-valued features that were converted
                         to sequences of boolean features when read in, any
                         features in the FeatureSet that contain a `=` will be
                         split on the first occurrence and the prefix will be
                         checked to see if it is in `features`.
                         If `None`, no feature filtering takes place.
                         Cannot be used if FeatureSet uses a FeatureHasher for
                         vectorization.
        :type features: list of str
        :param inverse: Instead of keeping features and/or examples in lists,
                        remove them.
        :type inverse: bool
        '''
        # Construct mask that indicates which examples to keep
        mask = np.ones(len(self), dtype=bool)
        if ids is not None:
            mask = np.logical_and(mask, np.logical_not(np.in1d(self.ids, ids)))
        if classes is not None:
            mask = np.logical_and(mask, np.logical_not(np.in1d(self.classes,
                                                               classes)))
        if inverse:
            mask = np.logical_not(mask)

        # Remove examples not in mask
        self.ids = self.ids[mask]
        self.classes = self.classes[mask]
        self.features = self.features[mask, :]

        # Filter features
        if features is not None:
            if isinstance(self.vectorizer, FeatureHasher):
                raise ValueError('FeatureSets with FeatureHasher vectorizers'
                                 ' cannot be filtered by feature.')
            columns = np.array(sorted({feat_num for feat_name, feat_num in
                                       iteritems(self.vectorizer.vocabulary_)
                                       if (feat_name in features or
                                           feat_name.split('=', 1)[0] in
                                           features)}))
            if inverse:
                columns = ~columns
            self.features = self.features[:, columns]
            self.vectorizer.restrict(columns)

    def filtered_iter(self, ids=None, classes=None, features=None,
                      inverse=False):
        '''
        A version of ``__iter__`` that retains only the specified features
        and/or examples from the output.

        :param ids: Examples in the FeatureSet to keep. If `None`, no ID
                    filtering takes place.
        :type ids: list of str/float
        :param classes: Classes that we want to retain examples for. If `None`,
                        no class filtering takes place.
        :type classes: list of str/float
        :param features: Features in the FeatureSet to keep. To help with
                         filtering string-valued features that were converted
                         to sequences of boolean features when read in, any
                         features in the FeatureSet that contain a `=` will be
                         split on the first occurrence and the prefix will be
                         checked to see if it is in `features`.
                         If `None`, no feature filtering takes place.
                         Cannot be used if FeatureSet uses a FeatureHasher for
                         vectorization.
        :type features: list of str
        :param inverse: Instead of keeping features and/or examples in lists,
                        remove them.
        :type inverse: bool
        '''
        if self.features is not None and not isinstance(self.vectorizer,
                                                        DictVectorizer):
            raise ValueError('FeatureSets can only be iterated through if they'
                             ' use a DictVectorizer for their feature '
                             'vectorizer.')

        for id_, class_, feats in zip(self.ids, self.classes, self.features):
            # Skip instances with IDs not in filter
            if ids is not None and (id_ in ids) == inverse:
                continue
            # Skip instances with classes not in filter
            if classes is not None and (class_ in classes) == inverse:
                continue
            feat_dict = self.vectorizer.inverse_transform(feats)[0]
            if features is not None:
                feat_dict = {name: value for name, value in
                             iteritems(feat_dict) if
                             (inverse != (name in features) or
                              (name.split('=', 1)[0] in features))}
            elif not inverse:
                feat_dict = {}
            yield id_, class_, feat_dict


    def __sub__(self, other):
        '''
        Return a copy of ``self`` with all features in ``other`` removed.
        '''
        new_set = deepcopy(self)
        new_set.filter(features=other.features, inverse=True)
        return new_set

    @property
    def has_classes(self):
        '''
        Whether or not this FeatureSet has any finite classes.
        '''
        if self.classes is not None:
            return not (np.issubdtype(self.classes.dtype, float) and
                        np.isnan(np.min(self.classes)))
        else:
            return False

    @property
    def has_ids(self):
        '''
        Whether or not this FeatureSet has any finite IDs.
        '''
        if self.ids is not None:
            return not (np.issubdtype(self.ids.dtype, float) and
                        np.isnan(np.min(self.ids)))
        else:
            return False

    @property
    def feat_vectorizer(self):
        ''' Backward compatible name for vectorizer '''
        warn('FeatureSet.feat_vectorizer will be removed in SKLL 1.0.0. '
             'Please switch to using FeatureSet.vectorizer to access the '
             'feature vectorizer.', DeprecationWarning)
        return self.vectorizer

    def __str__(self):
        ''' Return a string representation of FeatureSet '''
        return str(self.__dict__)

    def __repr__(self):
        ''' Return a string representation of FeatureSet '''
        return repr(self.__dict__)
示例#4
0
class FeatureSet(object):
    """
    Encapsulation of all of the features, values, and metadata about a given
    set of data. This replaces `ExamplesTuple` from older versions of SKLL.

    Parameters
    ----------
    name : str
        The name of this feature set.
    ids : np.array
        Example IDs for this set.
    labels : np.array, optional
        labels for this set.
        Defaults to ``None``.
    features : list of dict or array-like, optional
        The features for each instance represented as either a
        list of dictionaries or an array-like (if `vectorizer` is
        also specified).
        Defaults to ``None``.
    vectorizer : DictVectorizer or FeatureHasher, optional
        Vectorizer which will be used to generate the feature matrix.
        Defaults to ``None``.

    Warnings
    --------
    FeatureSets can only be equal if the order of the instances is
    identical because these are stored as lists/arrays. Since scikit-learn's
    `DictVectorizer` automatically sorts the underlying feature matrix
    if it is sparse, we do not do any sorting before checking for equality.
    This is not a problem because we _always_ use sparse matrices with
    `DictVectorizer` when creating FeatureSets.

    Notes
    -----
    If ids, labels, and/or features are not None, the number of rows in
    each array must be equal.
    """
    def __init__(self, name, ids, labels=None, features=None, vectorizer=None):
        super(FeatureSet, self).__init__()
        self.name = name
        if isinstance(ids, list):
            ids = np.array(ids)
        self.ids = ids
        if isinstance(labels, list):
            labels = np.array(labels)
        self.labels = labels
        self.features = features
        self.vectorizer = vectorizer
        # Convert list of dicts to numpy array
        if isinstance(self.features, list):
            if self.vectorizer is None:
                self.vectorizer = NewDictVectorizer(sparse=True)
            self.features = self.vectorizer.fit_transform(self.features)
        if self.features is not None:
            num_feats = self.features.shape[0]
            if self.ids is None:
                raise ValueError('A list of IDs is required')
            num_ids = self.ids.shape[0]
            if num_feats != num_ids:
                raise ValueError(
                    ('Number of IDs (%s) does not equal '
                     'number of feature rows (%s)') % (num_ids, num_feats))
            if self.labels is None:
                self.labels = np.empty(num_feats)
                self.labels.fill(None)
            num_labels = self.labels.shape[0]
            if num_feats != num_labels:
                raise ValueError(
                    ('Number of labels (%s) does not equal '
                     'number of feature rows (%s)') % (num_labels, num_feats))

    def __contains__(self, value):
        """
        Check if example ID is in the FeatureSet.

        Parameters
        ----------
        value
            The value to check.
        """
        return value in self.ids

    def __eq__(self, other):
        """
        Check whether two featuresets are the same.

        Parameters
        ----------
        other : skll.FeatureSet
            The other ``FeatureSet`` to check equivalence with.

        Note
        ----
        We consider feature values to be equal if any differences are in the
        sixth decimal place or higher.
        """

        return (self.ids.shape == other.ids.shape
                and self.labels.shape == other.labels.shape
                and self.features.shape == other.features.shape
                and (self.ids == other.ids).all()
                and (self.labels == other.labels).all() and np.allclose(
                    self.features.data, other.features.data, rtol=1e-6)
                and (self.features.indices == other.features.indices).all()
                and (self.features.indptr == other.features.indptr).all()
                and self.vectorizer == other.vectorizer)

    def __iter__(self):
        """
        Iterate through (ID, label, feature_dict) tuples in feature set.
        """
        if self.features is not None:
            if not isinstance(self.vectorizer, DictVectorizer):
                raise ValueError('FeatureSets can only be iterated through if '
                                 'they use a DictVectorizer for their feature '
                                 'vectorizer.')
            for id_, label_, feats in zip(self.ids, self.labels,
                                          self.features):

                # reshape to a 2D matrix if we are not using a sparse matrix
                # to store the features
                feats = feats.reshape(1,
                                      -1) if not sp.issparse(feats) else feats

                # When calling inverse_transform we have to add [0] to get the
                # results for the current instance because it always returns a
                # 2D array
                yield (id_, label_,
                       self.vectorizer.inverse_transform(feats)[0])
        else:
            return

    def __len__(self):
        """
        The number of rows in the ``FeatureSet`` instance.
        """
        return self.features.shape[0]

    def __add__(self, other):
        """
        Combine two feature sets to create a new one.  This is done assuming
        they both have the same instances with the same IDs in the same order.

        Parameters
        ----------
        other : skll.FeatureSet
            The other ``FeatureSet`` to add to this one.

        Raises
        ------
        ValueError
            If IDs are not in the same order in each ``FeatureSet`` instance.
        ValueError
            If vectorizers are different between the two ``FeatureSet`` instances.
        ValueError
            If there are duplicate feature names.
        ValueError
            If there are conflicting labels.
        """

        # Check that the sets of IDs are equal
        if set(self.ids) != set(other.ids):
            raise ValueError('IDs are not in the same order in each '
                             'feature set')
        # Compute the relative ordering of IDs for merging the features
        # and labels.
        ids_indices = dict((y, x) for x, y in enumerate(other.ids))
        relative_order = [ids_indices[self_id] for self_id in self.ids]

        # Initialize the new feature set with a name and the IDs.
        new_set = FeatureSet('+'.join(sorted([self.name, other.name])),
                             deepcopy(self.ids))

        # Combine feature matrices and vectorizers.
        if not isinstance(self.vectorizer, type(other.vectorizer)):
            raise ValueError('Cannot combine FeatureSets because they are '
                             'not both using the same type of feature '
                             'vectorizer (e.g., DictVectorizer, '
                             'FeatureHasher)')
        uses_feature_hasher = isinstance(self.vectorizer, FeatureHasher)
        if uses_feature_hasher:
            if (self.vectorizer.n_features != other.vectorizer.n_features):
                raise ValueError('Cannot combine FeatureSets that uses '
                                 'FeatureHashers with different values of '
                                 'n_features setting.')
        else:
            # Check for duplicate feature names.
            if (set(self.vectorizer.feature_names_)
                    & set(other.vectorizer.feature_names_)):
                raise ValueError('Cannot combine FeatureSets because they '
                                 'have duplicate feature names.')
        num_feats = self.features.shape[1]

        new_set.features = sp.hstack(
            [self.features, other.features[relative_order]], 'csr')
        new_set.vectorizer = deepcopy(self.vectorizer)
        if not uses_feature_hasher:
            for feat_name, index in other.vectorizer.vocabulary_.items():
                new_set.vectorizer.vocabulary_[feat_name] = (index + num_feats)
            other_names = other.vectorizer.feature_names_
            new_set.vectorizer.feature_names_.extend(other_names)

        # If either set has labels, check that they don't conflict.
        if self.has_labels:
            # labels should be the same for each FeatureSet, so store once.
            if other.has_labels and \
                    not np.all(self.labels == other.labels[relative_order]):
                raise ValueError('Feature sets have conflicting labels for '
                                 'examples with the same ID.')
            new_set.labels = deepcopy(self.labels)
        else:
            new_set.labels = deepcopy(other.labels[relative_order])

        return new_set

    def filter(self, ids=None, labels=None, features=None, inverse=False):
        """
        Removes or keeps features and/or examples from the `Featureset` depending
        on the parameters. Filtering is done in-place.

        Parameters
        ----------
        ids : list of str/float, optional
            Examples to keep in the FeatureSet. If `None`, no ID
            filtering takes place.
            Defaults to ``None``.
        labels : list of str/float, optional
            Labels that we want to retain examples for. If `None`,
            no label filtering takes place.
            Defaults to ``None``.
        features : list of str, optional
            Features to keep in the FeatureSet. To help with
            filtering string-valued features that were converted
            to sequences of boolean features when read in, any
            features in the FeatureSet that contain a `=` will be
            split on the first occurrence and the prefix will be
            checked to see if it is in `features`.
            If `None`, no feature filtering takes place.
            Cannot be used if FeatureSet uses a FeatureHasher for
            vectorization.
            Defaults to ``None``.
        inverse : bool, optional
            Instead of keeping features and/or examples in lists,
            remove them.
            Defaults to ``False``.

        Raises
        ------
        ValueError
            If attempting to use features to filter a ``FeatureSet`` that
            uses a ``FeatureHasher`` vectorizer.
        """
        # Construct mask that indicates which examples to keep
        mask = np.ones(len(self), dtype=bool)
        if ids is not None:
            mask = np.logical_and(mask, np.in1d(self.ids, ids))
        if labels is not None:
            mask = np.logical_and(mask, np.in1d(self.labels, labels))

        if inverse and (labels is not None or ids is not None):
            mask = np.logical_not(mask)

        # Remove examples not in mask
        self.ids = self.ids[mask]
        self.labels = self.labels[mask]
        self.features = self.features[mask, :]

        # Filter features
        if features is not None:
            if isinstance(self.vectorizer, FeatureHasher):
                raise ValueError('FeatureSets with FeatureHasher vectorizers'
                                 ' cannot be filtered by feature.')
            columns = np.array(
                sorted({
                    feat_num
                    for feat_name, feat_num in
                    self.vectorizer.vocabulary_.items()
                    if (feat_name in features
                        or feat_name.split('=', 1)[0] in features)
                }))
            if inverse:
                all_columns = np.arange(self.features.shape[1])
                columns = all_columns[np.logical_not(
                    np.in1d(all_columns, columns))]
            self.features = self.features[:, columns]
            self.vectorizer.restrict(columns, indices=True)

    def filtered_iter(self,
                      ids=None,
                      labels=None,
                      features=None,
                      inverse=False):
        """
        A version of `__iter__` that retains only the specified features
        and/or examples from the output.

        Parameters
        ----------
        ids : list of str/float, optional
            Examples to keep in the ``FeatureSet``. If ``None``, no ID
            filtering takes place.
            Defaults to ``None``.
        labels : list of str/float, optional
            Labels that we want to retain examples for. If ``None``,
            no label filtering takes place.
            Defaults to ``None``.
        features : list of str, optional
            Features to keep in the ``FeatureSet``. To help with
            filtering string-valued features that were converted
            to sequences of boolean features when read in, any
            features in the ``FeatureSet`` that contain a `=` will be
            split on the first occurrence and the prefix will be
            checked to see if it is in ``features``.
            If `None`, no feature filtering takes place.
            Cannot be used if ``FeatureSet`` uses a FeatureHasher for
            vectorization.
            Defaults to ``None``.
        inverse : bool, optional
            Instead of keeping features and/or examples in lists,
            remove them.
            Defaults to ``False``.

        Yields
        ------
        id_ : str
            The ID of the example.
        label_ : str
            The label of the example.
        feat_dict : dict
            The feature dictionary, with feature name as the key
            and example value as the value.

        Raises
        ------
        ValueError
            If the vectorizer is not a `DictVectorizer`.
        """
        if self.features is not None and not isinstance(
                self.vectorizer, DictVectorizer):
            raise ValueError('FeatureSets can only be iterated through if they'
                             ' use a DictVectorizer for their feature '
                             'vectorizer.')

        for id_, label_, feats in zip(self.ids, self.labels, self.features):
            # Skip instances with IDs not in filter
            if ids is not None and (id_ in ids) == inverse:
                continue
            # Skip instances with labels not in filter
            if labels is not None and (label_ in labels) == inverse:
                continue

            # reshape to a 2D matrix if we are not using a sparse matrix
            # to store the features
            feats = feats.reshape(1, -1) if not sp.issparse(feats) else feats
            feat_dict = self.vectorizer.inverse_transform(feats)[0]
            if features is not None:
                feat_dict = {
                    name: value
                    for name, value in feat_dict.items() if (inverse != (
                        name in features or name.split('=', 1)[0] in features))
                }
            elif not inverse:
                feat_dict = {}
            yield id_, label_, feat_dict

    def __sub__(self, other):
        """
        Subset ``FeatureSet`` instance by removing all the features from the
        other ``FeatureSet`` instance.

        Parameters
        ----------
        other : skll.FeatureSet
            The other ``FeatureSet`` containing the features that should
            be removed from this ``FeatureSet``.

        Returns
        -------
        A copy of `self` with all features in `other` removed.
        """
        new_set = deepcopy(self)
        new_set.filter(features=other.vectorizer.feature_names_, inverse=True)
        return new_set

    @property
    def has_labels(self):
        """
        Check if ``FeatureSet`` has finite labels.

        Returns
        -------
        has_labels : bool
            Whether or not this FeatureSet has any finite labels.
        """
        # make sure that labels is not None or a list of Nones
        if self.labels is not None and not all(label is None
                                               for label in self.labels):
            # then check that they are not a list of NaNs
            return not (np.issubdtype(self.labels.dtype, np.floating)
                        and np.isnan(np.min(self.labels)))
        else:
            return False

    def __str__(self):
        """
        Returns
        -------
        A string representation of ``FeatureSet``.
        """
        return str(self.__dict__)

    def __repr__(self):
        """
        Returns
        -------
        A string representation of ``FeatureSet``.
        """
        return repr(self.__dict__)

    def __getitem__(self, value):
        """
        Parameters
        ----------
        value
            The value to retrieve.

        Returns
        -------
        A specific example by row number or, if given a slice,
        a new ``FeatureSet`` instance containing a subset of the data.
        """
        # Check if we're slicing
        if isinstance(value, slice):
            sliced_ids = self.ids[value]
            sliced_feats = (self.features[value]
                            if self.features is not None else None)
            sliced_labels = (self.labels[value]
                             if self.labels is not None else None)
            return FeatureSet('{}_{}'.format(self.name, value),
                              sliced_ids,
                              features=sliced_feats,
                              labels=sliced_labels,
                              vectorizer=self.vectorizer)
        else:
            label = self.labels[value] if self.labels is not None else None
            feats = self.features[value, :]
            features = (self.vectorizer.inverse_transform(feats)[0]
                        if self.features is not None else {})
            return self.ids[value], label, features

    @staticmethod
    def split_by_ids(fs, ids_for_split1, ids_for_split2=None):
        """
        Split the ``FeatureSet`` into two new ``FeatureSet`` instances based on
        the given IDs for the two splits.

        Parameters
        ----------
        fs : skll.FeatureSet
            The ``FeatureSet`` instance to split.
        ids_for_split1 : list of int
            A list of example IDs which will be split out into
            the first ``FeatureSet`` instance. Note that the
            FeatureSet instance will respect the order of the
            specified IDs.
        ids_for_split2 : list of int, optional
            An optional ist of example IDs which will be
            split out into the second ``FeatureSet`` instance.
            Note that the ``FeatureSet`` instance will respect
            the order of the specified IDs. If this is
            not specified, then the second ``FeatureSet``
            instance will contain the complement of the
            first set of IDs sorted in ascending order.
            Defaults to ``None``.

        Returns
        -------
        fs1 : skll.FeatureSet
            The first ``FeatureSet``.
        fs2 : skll.FeatureSet
            The second ``FeatureSet``.
        """

        # Note: an alternative way to implement this is to make copies
        # of the given FeatureSet instance and then use the `filter()`
        # method but that wastes too much memory since it requires making
        # two copies of the original FeatureSet which may be huge. With
        # the current implementation, we are creating new objects but
        # they should be much smaller than the original FeatureSet.
        ids1 = fs.ids[ids_for_split1]
        labels1 = fs.labels[ids_for_split1]
        features1 = fs.features[ids_for_split1]
        if ids_for_split2 is None:
            ids2 = fs.ids[~np.in1d(fs.ids, ids_for_split1)]
            labels2 = fs.labels[~np.in1d(fs.ids, ids_for_split1)]
            features2 = fs.features[~np.in1d(fs.ids, ids_for_split1)]
        else:
            ids2 = fs.ids[ids_for_split2]
            labels2 = fs.labels[ids_for_split2]
            features2 = fs.features[ids_for_split2]

        fs1 = FeatureSet('{}_1'.format(fs.name),
                         ids1,
                         labels=labels1,
                         features=features1,
                         vectorizer=fs.vectorizer)
        fs2 = FeatureSet('{}_2'.format(fs.name),
                         ids2,
                         labels=labels2,
                         features=features2,
                         vectorizer=fs.vectorizer)
        return fs1, fs2

    @staticmethod
    def from_data_frame(df, name, labels_column=None, vectorizer=None):
        """
        Helper function to create a ``FeatureSet`` instance from a `pandas.DataFrame`.
        Will raise an Exception if pandas is not installed in your environment.
        The ``ids`` in the ``FeatureSet`` will be the index from the given frame.

        Parameters
        ----------
        df : pd.DataFrame
            The pandas.DataFrame object to use as a ``FeatureSet``.
        name : str
            The name of the output ``FeatureSet`` instance.
        labels_column : str, optional
            The name of the column containing the labels (data to predict).
            Defaults to ``None``.
        vectorizer : DictVectorizer or FeatureHasher, optional
            Vectorizer which will be used to generate the feature matrix.
            Defaults to ``None``.

        Returns
        -------
        feature_set : skll.FeatureSet
            A ``FeatureSet`` instance generated from from the given data frame.
        """
        if labels_column:
            feature_columns = [
                column for column in df.columns if column != labels_column
            ]
            labels = df[labels_column].tolist()
        else:
            feature_columns = df.columns
            labels = None

        features = df[feature_columns].to_dict(orient='records')
        return FeatureSet(name,
                          ids=df.index.tolist(),
                          labels=labels,
                          features=features,
                          vectorizer=vectorizer)
class FeatureSet(object):

    """
    Encapsulation of all of the features, values, and metadata about a given
    set of data. This replaces `ExamplesTuple` from older versions of SKLL.

    Parameters
    ----------
    name : str
        The name of this feature set.
    ids : np.array
        Example IDs for this set.
    labels : np.array, optional
        labels for this set.
        Defaults to ``None``.
    features : list of dict or array-like, optional
        The features for each instance represented as either a
        list of dictionaries or an array-like (if `vectorizer` is
        also specified).
        Defaults to ``None``.
    vectorizer : DictVectorizer or FeatureHasher, optional
        Vectorizer which will be used to generate the feature matrix.
        Defaults to ``None``.

    Warnings
    --------
    FeatureSets can only be equal if the order of the instances is
    identical because these are stored as lists/arrays. Since scikit-learn's
    `DictVectorizer` automatically sorts the underlying feature matrix
    if it is sparse, we do not do any sorting before checking for equality.
    This is not a problem because we _always_ use sparse matrices with
    `DictVectorizer` when creating FeatureSets.

    Notes
    -----
    If ids, labels, and/or features are not None, the number of rows in
    each array must be equal.
    """

    def __init__(self, name, ids, labels=None, features=None,
                 vectorizer=None):
        super(FeatureSet, self).__init__()
        self.name = name
        if isinstance(ids, list):
            ids = np.array(ids)
        self.ids = ids
        if isinstance(labels, list):
            labels = np.array(labels)
        self.labels = labels
        self.features = features
        self.vectorizer = vectorizer
        # Convert list of dicts to numpy array
        if isinstance(self.features, list):
            if self.vectorizer is None:
                self.vectorizer = NewDictVectorizer(sparse=True)
            self.features = self.vectorizer.fit_transform(self.features)
        if self.features is not None:
            num_feats = self.features.shape[0]
            if self.ids is None:
                raise ValueError('A list of IDs is required')
            num_ids = self.ids.shape[0]
            if num_feats != num_ids:
                raise ValueError(('Number of IDs (%s) does not equal '
                                  'number of feature rows (%s)') % (num_ids,
                                                                    num_feats))
            if self.labels is None:
                self.labels = np.empty(num_feats)
                self.labels.fill(None)
            num_labels = self.labels.shape[0]
            if num_feats != num_labels:
                raise ValueError(('Number of labels (%s) does not equal '
                                  'number of feature rows (%s)') % (num_labels,
                                                                    num_feats))

    def __contains__(self, value):
        """
        Check if example ID is in the FeatureSet.

        Parameters
        ----------
        value
            The value to check.
        """
        return value in self.ids

    def __eq__(self, other):
        """
        Check whether two featuresets are the same.

        Parameters
        ----------
        other : skll.FeatureSet
            The other ``FeatureSet`` to check equivalence with.

        Note
        ----
        We consider feature values to be equal if any differences are in the
        sixth decimal place or higher.
        """

        return (self.ids.shape == other.ids.shape and
                self.labels.shape == other.labels.shape and
                self.features.shape == other.features.shape and
                (self.ids == other.ids).all() and
                (self.labels == other.labels).all() and
                np.allclose(self.features.data, other.features.data,
                            rtol=1e-6) and
                (self.features.indices == other.features.indices).all() and
                (self.features.indptr == other.features.indptr).all() and
                self.vectorizer == other.vectorizer)

    def __iter__(self):
        """
        Iterate through (ID, label, feature_dict) tuples in feature set.
        """
        if self.features is not None:
            if not isinstance(self.vectorizer, DictVectorizer):
                raise ValueError('FeatureSets can only be iterated through if '
                                 'they use a DictVectorizer for their feature '
                                 'vectorizer.')
            for id_, label_, feats in zip(self.ids, self.labels, self.features):

                # reshape to a 2D matrix if we are not using a sparse matrix
                # to store the features
                feats = feats.reshape(1, -1) if not sp.issparse(feats) else feats

                # When calling inverse_transform we have to add [0] to get the
                # results for the current instance because it always returns a
                # 2D array
                yield (id_, label_, self.vectorizer.inverse_transform(feats)[0])
        else:
            return

    def __len__(self):
        """
        The number of rows in the ``FeatureSet`` instance.
        """
        return self.features.shape[0]

    def __add__(self, other):
        """
        Combine two feature sets to create a new one.  This is done assuming
        they both have the same instances with the same IDs in the same order.

        Parameters
        ----------
        other : skll.FeatureSet
            The other ``FeatureSet`` to add to this one.

        Raises
        ------
        ValueError
            If IDs are not in the same order in each ``FeatureSet`` instance.
        ValueError
            If vectorizers are different between the two ``FeatureSet`` instances.
        ValueError
            If there are duplicate feature names.
        ValueError
            If there are conflicting labels.
        """

        # Check that the sets of IDs are equal
        if set(self.ids) != set(other.ids):
            raise ValueError('IDs are not in the same order in each '
                             'feature set')
        # Compute the relative ordering of IDs for merging the features
        # and labels.
        ids_indices = dict((y, x) for x, y in enumerate(other.ids))
        relative_order = [ids_indices[self_id] for self_id in self.ids]

        # Initialize the new feature set with a name and the IDs.
        new_set = FeatureSet('+'.join(sorted([self.name, other.name])),
                             deepcopy(self.ids))

        # Combine feature matrices and vectorizers.
        if not isinstance(self.vectorizer, type(other.vectorizer)):
            raise ValueError('Cannot combine FeatureSets because they are '
                             'not both using the same type of feature '
                             'vectorizer (e.g., DictVectorizer, '
                             'FeatureHasher)')
        uses_feature_hasher = isinstance(self.vectorizer, FeatureHasher)
        if uses_feature_hasher:
            if (self.vectorizer.n_features !=
                    other.vectorizer.n_features):
                raise ValueError('Cannot combine FeatureSets that uses '
                                 'FeatureHashers with different values of '
                                 'n_features setting.')
        else:
            # Check for duplicate feature names.
            if (set(self.vectorizer.feature_names_) &
                    set(other.vectorizer.feature_names_)):
                raise ValueError('Cannot combine FeatureSets because they '
                                 'have duplicate feature names.')
        num_feats = self.features.shape[1]

        new_set.features = sp.hstack([self.features,
                                      other.features[relative_order]],
                                     'csr')
        new_set.vectorizer = deepcopy(self.vectorizer)
        if not uses_feature_hasher:
            for feat_name, index in other.vectorizer.vocabulary_.items():
                new_set.vectorizer.vocabulary_[feat_name] = (index +
                                                             num_feats)
            other_names = other.vectorizer.feature_names_
            new_set.vectorizer.feature_names_.extend(other_names)

        # If either set has labels, check that they don't conflict.
        if self.has_labels:
            # labels should be the same for each FeatureSet, so store once.
            if other.has_labels and \
                    not np.all(self.labels == other.labels[relative_order]):
                raise ValueError('Feature sets have conflicting labels for '
                                 'examples with the same ID.')
            new_set.labels = deepcopy(self.labels)
        else:
            new_set.labels = deepcopy(other.labels[relative_order])

        return new_set

    def filter(self, ids=None, labels=None, features=None, inverse=False):
        """
        Removes or keeps features and/or examples from the `Featureset` depending
        on the parameters. Filtering is done in-place.

        Parameters
        ----------
        ids : list of str/float, optional
            Examples to keep in the FeatureSet. If `None`, no ID
            filtering takes place.
            Defaults to ``None``.
        labels : list of str/float, optional
            Labels that we want to retain examples for. If `None`,
            no label filtering takes place.
            Defaults to ``None``.
        features : list of str, optional
            Features to keep in the FeatureSet. To help with
            filtering string-valued features that were converted
            to sequences of boolean features when read in, any
            features in the FeatureSet that contain a `=` will be
            split on the first occurrence and the prefix will be
            checked to see if it is in `features`.
            If `None`, no feature filtering takes place.
            Cannot be used if FeatureSet uses a FeatureHasher for
            vectorization.
            Defaults to ``None``.
        inverse : bool, optional
            Instead of keeping features and/or examples in lists,
            remove them.
            Defaults to ``False``.

        Raises
        ------
        ValueError
            If attempting to use features to filter a ``FeatureSet`` that
            uses a ``FeatureHasher`` vectorizer.
        """
        # Construct mask that indicates which examples to keep
        mask = np.ones(len(self), dtype=bool)
        if ids is not None:
            mask = np.logical_and(mask, np.in1d(self.ids, ids))
        if labels is not None:
            mask = np.logical_and(mask, np.in1d(self.labels, labels))

        if inverse and (labels is not None or ids is not None):
            mask = np.logical_not(mask)

        # Remove examples not in mask
        self.ids = self.ids[mask]
        self.labels = self.labels[mask]
        self.features = self.features[mask, :]

        # Filter features
        if features is not None:
            if isinstance(self.vectorizer, FeatureHasher):
                raise ValueError('FeatureSets with FeatureHasher vectorizers'
                                 ' cannot be filtered by feature.')
            columns = np.array(sorted({feat_num for feat_name, feat_num in
                                       iteritems(self.vectorizer.vocabulary_)
                                       if (feat_name in features or
                                           feat_name.split('=', 1)[0] in
                                           features)}))
            if inverse:
                all_columns = np.arange(self.features.shape[1])
                columns = all_columns[np.logical_not(np.in1d(all_columns,
                                                             columns))]
            self.features = self.features[:, columns]
            self.vectorizer.restrict(columns, indices=True)

    def filtered_iter(self, ids=None, labels=None, features=None,
                      inverse=False):
        """
        A version of `__iter__` that retains only the specified features
        and/or examples from the output.

        Parameters
        ----------
        ids : list of str/float, optional
            Examples to keep in the ``FeatureSet``. If ``None``, no ID
            filtering takes place.
            Defaults to ``None``.
        labels : list of str/float, optional
            Labels that we want to retain examples for. If ``None``,
            no label filtering takes place.
            Defaults to ``None``.
        features : list of str, optional
            Features to keep in the ``FeatureSet``. To help with
            filtering string-valued features that were converted
            to sequences of boolean features when read in, any
            features in the ``FeatureSet`` that contain a `=` will be
            split on the first occurrence and the prefix will be
            checked to see if it is in ``features``.
            If `None`, no feature filtering takes place.
            Cannot be used if ``FeatureSet`` uses a FeatureHasher for
            vectorization.
            Defaults to ``None``.
        inverse : bool, optional
            Instead of keeping features and/or examples in lists,
            remove them.
            Defaults to ``False``.

        Yields
        ------
        id_ : str
            The ID of the example.
        label_ : str
            The label of the example.
        feat_dict : dict
            The feature dictionary, with feature name as the key
            and example value as the value.

        Raises
        ------
        ValueError
            If the vectorizer is not a `DictVectorizer`.
        """
        if self.features is not None and not isinstance(self.vectorizer,
                                                        DictVectorizer):
            raise ValueError('FeatureSets can only be iterated through if they'
                             ' use a DictVectorizer for their feature '
                             'vectorizer.')

        for id_, label_, feats in zip(self.ids, self.labels, self.features):
            # Skip instances with IDs not in filter
            if ids is not None and (id_ in ids) == inverse:
                continue
            # Skip instances with labels not in filter
            if labels is not None and (label_ in labels) == inverse:
                continue

            # reshape to a 2D matrix if we are not using a sparse matrix
            # to store the features
            feats = feats.reshape(1, -1) if not sp.issparse(feats) else feats
            feat_dict = self.vectorizer.inverse_transform(feats)[0]
            if features is not None:
                feat_dict = {name: value for name, value in
                             iteritems(feat_dict) if
                             (inverse != (name in features or
                                          name.split('=', 1)[0] in features))}
            elif not inverse:
                feat_dict = {}
            yield id_, label_, feat_dict

    def __sub__(self, other):
        """
        Return a copy of this ``FeatureSet`` without the features that are
        also present in another ``FeatureSet``.

        Parameters
        ----------
        other : skll.FeatureSet
            The ``FeatureSet`` whose feature names (taken from its
            vectorizer's ``feature_names_``) are removed from the copy.

        Returns
        -------
        A copy of `self` with all features in `other` removed.
        """
        # ``filter`` works in place, so it is applied to a deep copy in
        # order to leave this set untouched.
        reduced_set = deepcopy(self)
        names_to_drop = other.vectorizer.feature_names_
        reduced_set.filter(features=names_to_drop, inverse=True)
        return reduced_set

    @property
    def has_labels(self):
        """
        Check if ``FeatureSet`` has finite labels.

        Returns
        -------
        has_labels : bool
            Whether or not this FeatureSet has any finite labels.
        """
        # make sure that labels is not None or a list of Nones
        if self.labels is not None and not all(label is None for label in self.labels):
            # then check that they are not a list of NaNs
            return not (np.issubdtype(self.labels.dtype, np.floating) and
                        np.isnan(np.min(self.labels)))
        else:
            return False

    def __str__(self):
        """
        Returns
        -------
        A string representation of ``FeatureSet``.
        """
        return str(self.__dict__)

    def __repr__(self):
        """
        Returns
        -------
        A string representation of ``FeatureSet``.
        """
        return repr(self.__dict__)

    def __getitem__(self, value):
        """
        Parameters
        ----------
        value
            The value to retrieve.

        Returns
        -------
        A specific example by row number or, if given a slice,
        a new ``FeatureSet`` instance containing a subset of the data.
        """
        # Check if we're slicing
        if isinstance(value, slice):
            sliced_ids = self.ids[value]
            sliced_feats = (self.features[value] if self.features is not None
                            else None)
            sliced_labels = (self.labels[value] if self.labels is not None
                             else None)
            return FeatureSet('{}_{}'.format(self.name, value), sliced_ids,
                              features=sliced_feats, labels=sliced_labels,
                              vectorizer=self.vectorizer)
        else:
            label = self.labels[value] if self.labels is not None else None
            feats = self.features[value, :]
            features = (self.vectorizer.inverse_transform(feats)[0] if
                        self.features is not None else {})
            return self.ids[value], label, features

    @staticmethod
    def split_by_ids(fs, ids_for_split1, ids_for_split2=None):
        """
        Split the ``FeatureSet`` into two new ``FeatureSet`` instances based on
        the given IDs for the two splits.

        Parameters
        ----------
        fs : skll.FeatureSet
            The ``FeatureSet`` instance to split.
        ids_for_split1 : list of int
            A list of example IDs which will be split out into
            the first ``FeatureSet`` instance. Note that the
            FeatureSet instance will respect the order of the
            specified IDs.
        ids_for_split2 : list of int, optional
            An optional ist of example IDs which will be
            split out into the second ``FeatureSet`` instance.
            Note that the ``FeatureSet`` instance will respect
            the order of the specified IDs. If this is
            not specified, then the second ``FeatureSet``
            instance will contain the complement of the
            first set of IDs sorted in ascending order.
            Defaults to ``None``.

        Returns
        -------
        fs1 : skll.FeatureSet
            The first ``FeatureSet``.
        fs2 : skll.FeatureSet
            The second ``FeatureSet``.
        """

        # Note: an alternative way to implement this is to make copies
        # of the given FeatureSet instance and then use the `filter()`
        # method but that wastes too much memory since it requires making
        # two copies of the original FeatureSet which may be huge. With
        # the current implementation, we are creating new objects but
        # they should be much smaller than the original FeatureSet.
        ids1 = fs.ids[ids_for_split1]
        labels1 = fs.labels[ids_for_split1]
        features1 = fs.features[ids_for_split1]
        if ids_for_split2 is None:
            ids2 = fs.ids[~np.in1d(fs.ids, ids_for_split1)]
            labels2 = fs.labels[~np.in1d(fs.ids, ids_for_split1)]
            features2 = fs.features[~np.in1d(fs.ids, ids_for_split1)]
        else:
            ids2 = fs.ids[ids_for_split2]
            labels2 = fs.labels[ids_for_split2]
            features2 = fs.features[ids_for_split2]

        fs1 = FeatureSet('{}_1'.format(fs.name),
                         ids1,
                         labels=labels1,
                         features=features1,
                         vectorizer=fs.vectorizer)
        fs2 = FeatureSet('{}_2'.format(fs.name),
                         ids2,
                         labels=labels2,
                         features=features2,
                         vectorizer=fs.vectorizer)
        return fs1, fs2

    @staticmethod
    def from_data_frame(df, name, labels_column=None, vectorizer=None):
        """
        Helper function to create a ``FeatureSet`` instance from a `pandas.DataFrame`.
        Will raise an Exception if pandas is not installed in your environment.
        The ``ids`` in the ``FeatureSet`` will be the index from the given frame.

        Parameters
        ----------
        df : pd.DataFrame
            The pandas.DataFrame object to use as a ``FeatureSet``.
        name : str
            The name of the output ``FeatureSet`` instance.
        labels_column : str, optional
            The name of the column containing the labels (data to predict).
            Defaults to ``None``.
        vectorizer : DictVectorizer or FeatureHasher, optional
            Vectorizer which will be used to generate the feature matrix.
            Defaults to ``None``.

        Returns
        -------
        feature_set : skll.FeatureSet
            A ``FeatureSet`` instance generated from from the given data frame.
        """
        if labels_column:
            feature_columns = [column for column in df.columns if column != labels_column]
            labels = df[labels_column].tolist()
        else:
            feature_columns = df.columns
            labels = None

        features = df[feature_columns].to_dict(orient='records')
        return FeatureSet(name,
                          ids=df.index.tolist(),
                          labels=labels,
                          features=features,
                          vectorizer=vectorizer)