Пример #1
0
    def fit(self, X, unused_y=None):
        """Learn a vocabulary dictionary of all categories in X.

        Vocabularies are created lazily from the first row when
        `self.vocabularies_` is None: one per column, or a single instance
        referenced by every column when `self.share` is set. NaN values are
        skipped and never added to a vocabulary.

        Args:
            X: numpy matrix or iterable of lists/numpy arrays.
            unused_y: to match fit format signature of estimators.

        Returns:
            self
        """
        X = setup_processor_data_feeder(X)
        for row in X:
            # Create vocabularies if not given.
            if self.vocabularies_ is None:
                # If not share, one per column, else one shared across.
                if not self.share:
                    self.vocabularies_ = [
                        categorical_vocabulary.CategoricalVocabulary() for _ in row]
                else:
                    vocab = categorical_vocabulary.CategoricalVocabulary()
                    self.vocabularies_ = [vocab for _ in row]
            for idx, value in enumerate(row):
                # Skip NaNs. An equality test against np.nan can never match
                # (NaN != NaN), so detect NaN with math.isnan and accept NumPy
                # float scalars such as np.float32, which are not `float`
                # subclasses.
                if isinstance(value, (float, np.floating)) and math.isnan(value):
                    continue
                self.vocabularies_[idx].add(value)
        # Drop infrequent categories only when a positive threshold is set.
        if self.min_frequency > 0:
            for vocab in self.vocabularies_:
                vocab.trim(self.min_frequency)
        self.freeze()
        return self
 def testIntVocabulary(self):
     """Check the values get() returns for a sequence of integer lookups."""
     int_vocab = categorical_vocabulary.CategoricalVocabulary()
     # (category, expected get() result) pairs, looked up in this exact
     # order; 3 is requested twice and must yield the same result both times.
     lookups = (
         (1, 1),
         (3, 2),
         (2, 3),
         (3, 2),
         (float('nan'), 4),
     )
     for category, expected in lookups:
         self.assertEqual(int_vocab.get(category), expected)
Пример #3
0
 def testIntVocabulary(self):
     """Check get() results and the final size for integer lookups."""
     int_vocab = categorical_vocabulary.CategoricalVocabulary()
     # Lookups run in this order; the repeated 3 must return the same value.
     # The last entry is NaN: this vocab doesn't handle nan specially.
     expectations = [(1, 1), (3, 2), (2, 3), (3, 2), (float('nan'), 4)]
     for category, expected in expectations:
         self.assertEqual(int_vocab.get(category), expected)
     # Four distinct categories were requested, yet the length is 5 --
     # presumably one entry is reserved up front; confirm against the class.
     self.assertEqual(len(int_vocab), 5)
Пример #4
0
 def testCountsTrim(self):
     """Check get() results and size after add(), trim(7) and freeze()."""
     counted = categorical_vocabulary.CategoricalVocabulary()
     # Look each category up with get() first, then add to its count.
     for category, count in (('a', 10), ('c', 5)):
         counted.get(category)
         counted.add(category, count)
     # 'b' was never looked up with get(), so it is not in the vocab yet
     # and this add() is skipped.
     counted.add('b', 5)
     counted.trim(7)
     counted.freeze()
     # After trimming and freezing, both 'b' and 'c' are expected to map to 0.
     for category in ('b', 'c'):
         self.assertEqual(counted.get(category), 0)
     self.assertEqual(len(counted), 2)
Пример #5
0
 def testWordVocabulary(self):
     """Check that repeated get() calls on string categories are stable."""
     word_vocab = categorical_vocabulary.CategoricalVocabulary()
     # Two identical passes: the second must return the same values as the
     # first for both 'a' and 'b'.
     for _ in range(2):
         self.assertEqual(word_vocab.get('a'), 1)
         self.assertEqual(word_vocab.get('b'), 2)