示例#1
0
    def _fit_transform_tfidf_vectorizer(self, x, y, dataset):
        """Fit the TF-IDF vectorizer, prune its vocabulary and transform x.

        A new ``TfidfVectorizer`` is built from the config, entity parsers
        and resources held by this object and stored on
        ``self.tfidf_vectorizer``. It is fitted on ``x``; a chi2 test of the
        resulting matrix against ``y`` then selects the features whose
        p-value is strictly below ``self.config.pvalue_threshold``. When no
        feature passes the threshold, the features sharing the smallest
        p-value are kept instead. Finally the vectorizer vocabulary is
        limited to the n-grams of the selected features.

        Args:
            x: samples handed to the vectorizer's ``fit_transform`` and
                ``transform`` (``fit_transform`` of this class passes the
                utterances here).
            y: targets handed to ``chi2`` together with the TF-IDF matrix.
            dataset: forwarded to the vectorizer's ``fit_transform``.

        Returns:
            The result of ``self.tfidf_vectorizer.transform(x)``, computed
            after the vocabulary has been limited.

        Raises:
            _EmptyDatasetUtterancesError: if the fitted vectorizer ends up
                with an empty vocabulary.
        """
        self.tfidf_vectorizer = TfidfVectorizer(
            config=self.config.tfidf_vectorizer_config,
            builtin_entity_parser=self.builtin_entity_parser,
            custom_entity_parser=self.custom_entity_parser,
            resources=self.resources)
        x_tfidf = self.tfidf_vectorizer.fit_transform(x, dataset)

        if not self.tfidf_vectorizer.vocabulary:
            raise _EmptyDatasetUtterancesError(
                "Dataset is empty or with empty utterances")

        _, tfidf_pval = chi2(x_tfidf, y)
        pvalue_threshold = self.config.pvalue_threshold
        best_tfidf_features = {
            idx for idx, pval in enumerate(tfidf_pval)
            if pval < pvalue_threshold
        }
        if not best_tfidf_features:
            # No feature is significant: fall back to the feature(s) with
            # the smallest p-value. The minimum is computed once here instead
            # of once per feature inside the comprehension.
            # NOTE(review): a NaN minimum never compares equal, so the
            # selection would stay empty in that case (unchanged behavior).
            min_pval = tfidf_pval.min()
            best_tfidf_features = {
                idx for idx, pval in enumerate(tfidf_pval)
                if pval == min_pval
            }

        best_ngrams = [
            ngram
            for ngram, idx in iteritems(self.tfidf_vectorizer.vocabulary)
            if idx in best_tfidf_features
        ]
        self.tfidf_vectorizer.limit_vocabulary(best_ngrams)
        # We can't return x_tfidf[:best_tfidf_features] because of the
        # normalization in the transform of the tfidf_vectorizer
        # this would lead to inconsistent result between: fit_transform(x, y)
        # and fit(x, y).transform(x)
        return self.tfidf_vectorizer.transform(x)
示例#2
0
    def fit_transform(self, dataset, utterances, classes, none_class):
        """Fit the featurizer on the utterances and return their features.

        The dataset is validated/formatted and its language is stored on
        ``self.language``. The TF-IDF vectorizer is then fitted; when
        ``self.config.added_cooccurrence_feature_ratio`` is truthy the
        cooccurrence vectorizer is fitted as well and its features are
        horizontally stacked after the TF-IDF ones.

        Args:
            dataset: dataset passed through ``validate_and_format_dataset``
                and forwarded to the vectorizer fitting helpers.
            utterances: iterable of utterances; each one is indexed with
                ``DATA`` to extract its text.
            classes: targets forwarded to the vectorizer fitting helpers.
            none_class: forwarded to ``_fit_cooccurrence_vectorizer``.

        Returns:
            The TF-IDF features, or the ``sp.hstack`` of the TF-IDF and
            cooccurrence features when cooccurrence features are enabled.

        Raises:
            _EmptyDatasetUtterancesError: if no utterance produces any token.
        """
        dataset = validate_and_format_dataset(dataset)
        self.language = dataset[LANGUAGE]

        # Stop at the first utterance yielding at least one token; the
        # ``else`` branch only runs when the loop finishes without finding
        # one (including when there are no utterances at all).
        for utterance in utterances:
            text = get_text_from_chunks(utterance[DATA])
            if tokenize_light(text, self.language):
                break
        else:
            raise _EmptyDatasetUtterancesError(
                "Tokenized utterances are empty")

        tfidf_features = self._fit_transform_tfidf_vectorizer(
            utterances, classes, dataset)
        if not self.config.added_cooccurrence_feature_ratio:
            return tfidf_features

        self._fit_cooccurrence_vectorizer(
            utterances, classes, none_class, dataset)
        cooccurrence_features = self.cooccurrence_vectorizer.transform(
            utterances)
        return sp.hstack((tfidf_features, cooccurrence_features))