Example #1
 def test_all_languages_should_have_stop_words(self):
     # The capitalization for the CRF assumes all languages have stop_words
     # Given
     for language in get_all_languages():
         try:
             # When/Then
             get_stop_words(language)
         except:  # pylint: disable=W0702
             self.fail("%s has no stop words" % language)
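For context, here is a minimal, self-contained sketch of the contract this test enforces. This get_stop_words is a hypothetical stand-in, not the library's implementation: it returns a set for supported languages and raises otherwise, which is exactly what the test turns into a failure.

# Hypothetical stand-in for get_stop_words, illustrating the contract
# the test above relies on; the data is illustrative.
_STOP_WORDS = {
    "en": {"the", "a", "an", "of"},
    "fr": {"le", "la", "un", "de"},
}

def get_stop_words(language):
    if language not in _STOP_WORDS:
        # An unsupported language raises, which the test reports as a failure
        raise KeyError("no stop words for language %r" % language)
    return _STOP_WORDS[language]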
Example #2
 def language(self, value):
     self._language = value
     if value is None:
         self._stop_words = None
     elif self.config.ignore_stop_words:
         # Stop words must be loaded precisely so they can be filtered
         # out later
         self._stop_words = get_stop_words(self.resources)
     else:
         self._stop_words = set()
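The snippet above is presumably the body of a property setter: assigning a new language refreshes the cached stop-word set in one place. A self-contained sketch of that pattern, with illustrative names that are not the library's:

class _LanguageHolder(object):
    # Illustrative sketch of the setter pattern above: changing the
    # language refreshes the cached stop-word set in one place.
    def __init__(self, stop_words_by_language, ignore_stop_words=True):
        self._stop_words_by_language = stop_words_by_language
        self._ignore_stop_words = ignore_stop_words
        self._language = None
        self._stop_words = None

    @property
    def language(self):
        return self._language

    @language.setter
    def language(self, value):
        self._language = value
        if value is None:
            self._stop_words = None
        elif self._ignore_stop_words:
            self._stop_words = self._stop_words_by_language.get(value, set())
        else:
            self._stop_words = set()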
Example #4
    def fit(self, dataset, utterances, classes):
        utterances_texts = (get_text_from_chunks(u[DATA]) for u in utterances)
        if not any(tokenize_light(q, self.language) for q in utterances_texts):
            return None

        utterances_to_features = _get_utterances_to_features_names(
            dataset, self.language)
        normalized_utterances_to_features = defaultdict(set)
        for k, v in iteritems(utterances_to_features):
            normalized_utterances_to_features[_normalize_stem(
                k, self.language)].update(v)
        if self.unknown_words_replacement_string is not None \
                and self.unknown_words_replacement_string in \
                normalized_utterances_to_features:
            normalized_utterances_to_features.pop(
                self.unknown_words_replacement_string)
        self.entity_utterances_to_feature_names = dict(
            normalized_utterances_to_features)

        preprocessed_utterances = self.preprocess_utterances(utterances)
        # pylint: disable=C0103
        X_train_tfidf = self.tfidf_vectorizer.fit_transform(
            preprocessed_utterances)
        # pylint: enable=C0103
        features_idx = {
            self.tfidf_vectorizer.vocabulary_[word]: word
            for word in self.tfidf_vectorizer.vocabulary_
        }

        stop_words = get_stop_words(self.language)

        _, pval = chi2(X_train_tfidf, classes)
        self.best_features = [
            i for i, v in enumerate(pval) if v < self.config.pvalue_threshold
        ]
        if not self.best_features:
            self.best_features = [
                idx for idx, val in enumerate(pval) if val == pval.min()
            ]

        feature_names = {}
        for utterance_index in self.best_features:
            feature_names[utterance_index] = {
                "word": features_idx[utterance_index],
                "pval": pval[utterance_index]
            }

        for feat in feature_names:
            if feature_names[feat]["word"] in stop_words:
                if feature_names[feat]["pval"] > \
                        self.config.pvalue_threshold / 2.0:
                    self.best_features.remove(feat)

        return self
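The heart of these fit methods is a chi-squared feature-selection pass over TF-IDF features: keep features whose p-value beats a threshold, and fall back to the most discriminative feature(s) otherwise. A standalone sketch of that pattern with scikit-learn; the toy corpus and the 0.4 threshold are illustrative assumptions, not the library's defaults.

# Standalone sketch of the chi2 feature-selection pattern used above;
# data and threshold are illustrative.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2

texts = ["turn on the light", "turn off the light",
         "play some music", "play my playlist"]
classes = [0, 0, 1, 1]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)
# Map each feature index back to its word, as features_idx does above
index_to_word = {i: w for w, i in vectorizer.vocabulary_.items()}

_, pval = chi2(X, classes)
pvalue_threshold = 0.4  # illustrative value
best = [i for i, p in enumerate(pval) if p < pvalue_threshold]
if not best:
    # Fall back to the most discriminative feature(s)
    best = [i for i, p in enumerate(pval) if p == pval.min()]
print([(index_to_word[i], round(pval[i], 3)) for i in best])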
Example #5
    def fit(self, dataset, utterances, classes):
        utterances_to_features = _get_utterances_to_features_names(
            dataset, self.language)
        normalized_utterances_to_features = defaultdict(set)
        for k, v in iteritems(utterances_to_features):
            normalized_utterances_to_features[
                _normalize_stem(k, self.language)].update(v)
        if self.unknown_words_replacement_string is not None \
                and self.unknown_words_replacement_string in \
                normalized_utterances_to_features:
            normalized_utterances_to_features.pop(
                self.unknown_words_replacement_string)
        self.entity_utterances_to_feature_names = dict(
            normalized_utterances_to_features)

        if all(not "".join(tokenize_light(q, self.language)) for q in
               utterances):
            return None
        preprocessed_utterances = self.preprocess_utterances(utterances)
        # pylint: disable=C0103
        X_train_tfidf = self.tfidf_vectorizer.fit_transform(
            preprocessed_utterances)
        # pylint: enable=C0103
        list_index_words = {
            self.tfidf_vectorizer.vocabulary_[word]: word
            for word in self.tfidf_vectorizer.vocabulary_
        }

        stop_words = get_stop_words(self.language)

        _, pval = chi2(X_train_tfidf, classes)
        self.best_features = [i for i, v in enumerate(pval) if
                              v < self.config.pvalue_threshold]
        if not self.best_features:
            self.best_features = [idx for idx, val in enumerate(pval) if
                                  val == pval.min()]

        feature_names = {}
        for utterance_index in self.best_features:
            feature_names[utterance_index] = {
                "word": list_index_words[utterance_index],
                "pval": pval[utterance_index]}

        for feat in feature_names:
            if feature_names[feat]["word"] in stop_words:
                if feature_names[feat]["pval"] > \
                        self.config.pvalue_threshold / 2.0:
                    self.best_features.remove(feat)

        return self
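The opening block of these fit methods merges feature names whose keys collapse to the same normalized form. A minimal sketch of that merge, where lowercasing stands in for the real per-language _normalize_stem:

# Sketch of the key-normalization merge above; lowercasing is a toy
# stand-in for the per-language _normalize_stem.
from collections import defaultdict

def _normalize_stem(text):
    return text.lower()  # toy stand-in for stemming

utterances_to_features = {
    "New York": {"entity_city"},
    "new york": {"entity_location"},
}
merged = defaultdict(set)
for key, names in utterances_to_features.items():
    merged[_normalize_stem(key)].update(names)
print(dict(merged))
# {'new york': {'entity_city', 'entity_location'}}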
Example #6
 def _extract_word_pairs(self, utterance):
     if self.config.filter_stop_words:
         stop_words = get_stop_words(self.resources)
         utterance = [t for t in utterance if t not in stop_words]
     pairs = set()
     for j, w1 in enumerate(utterance):
         max_index = None
         if self.config.window_size is not None:
             max_index = j + self.config.window_size + 1
         for w2 in utterance[j + 1:max_index]:
             key = (w1, w2)
             if not self.config.keep_order:
                 key = tuple(sorted(key))
             pairs.add(key)
     return pairs
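For reference, the same windowed pair extraction written as a standalone function, without the config object; parameter names and the sample values are illustrative:

# Standalone sketch of the windowed co-occurrence pair extraction above.
def extract_word_pairs(tokens, stop_words=frozenset(), window_size=None,
                       keep_order=False):
    tokens = [t for t in tokens if t not in stop_words]
    pairs = set()
    for j, w1 in enumerate(tokens):
        max_index = None if window_size is None else j + window_size + 1
        for w2 in tokens[j + 1:max_index]:
            key = (w1, w2) if keep_order else tuple(sorted((w1, w2)))
            pairs.add(key)
    return pairs

# With window_size=1, only adjacent (non stop-word) tokens pair up:
print(extract_word_pairs(["turn", "on", "the", "light"],
                         stop_words={"the"}, window_size=1))
# {('on', 'turn'), ('light', 'on')}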
Example #7
    def fit(self, dataset, queries, y):
        utterances_to_features = _get_utterances_to_features_names(
            dataset, self.language)
        normalized_utterances_to_features = defaultdict(set)
        for k, v in iteritems(utterances_to_features):
            normalized_utterances_to_features[_normalize_stem(
                k, self.language)].update(v)
        if self.unknown_words_replacement_string is not None \
                and self.unknown_words_replacement_string in \
                normalized_utterances_to_features:
            normalized_utterances_to_features.pop(
                self.unknown_words_replacement_string)
        self.entity_utterances_to_feature_names = dict(
            normalized_utterances_to_features)

        if all(not "".join(tokenize_light(q, self.language)) for q in queries):
            return None
        preprocessed_queries = self.preprocess_queries(queries)
        # pylint: disable=C0103
        X_train_tfidf = self.tfidf_vectorizer.fit_transform(
            preprocessed_queries)
        # pylint: enable=C0103
        list_index_words = {
            self.tfidf_vectorizer.vocabulary_[x]: x
            for x in self.tfidf_vectorizer.vocabulary_
        }

        stop_words = get_stop_words(self.language)

        _, pval = chi2(X_train_tfidf, y)
        self.best_features = [
            i for i, v in enumerate(pval) if v < self.pvalue_threshold
        ]
        if not self.best_features:
            self.best_features = [
                idx for idx, val in enumerate(pval) if val == pval.min()
            ]

        feature_names = {}
        for i in self.best_features:
            feature_names[i] = {'word': list_index_words[i], 'pval': pval[i]}

        for feat in feature_names:
            if feature_names[feat]['word'] in stop_words:
                if feature_names[feat]['pval'] > self.pvalue_threshold / 2.0:
                    self.best_features.remove(feat)

        return self
Example #8
    def fit(self, dataset, utterances, classes):
        self.fit_builtin_entity_parser_if_needed(dataset)
        self.fit_custom_entity_parser_if_needed(dataset)

        utterances_texts = (get_text_from_chunks(u[DATA]) for u in utterances)
        if not any(tokenize_light(q, self.language) for q in utterances_texts):
            return None

        preprocessed_utterances = self.preprocess_utterances(utterances)
        # pylint: disable=C0103
        X_train_tfidf = self.tfidf_vectorizer.fit_transform(
            preprocessed_utterances)
        # pylint: enable=C0103
        features_idx = {self.tfidf_vectorizer.vocabulary_[word]: word for word
                        in self.tfidf_vectorizer.vocabulary_}

        stop_words = get_stop_words(self.language)

        _, pval = chi2(X_train_tfidf, classes)
        self.best_features = [i for i, v in enumerate(pval) if
                              v < self.config.pvalue_threshold]
        if not self.best_features:
            self.best_features = [idx for idx, val in enumerate(pval) if
                                  val == pval.min()]

        feature_names = {}
        for utterance_index in self.best_features:
            feature_names[utterance_index] = {
                "word": features_idx[utterance_index],
                "pval": pval[utterance_index]}

        for feat in feature_names:
            if feature_names[feat]["word"] in stop_words:
                if feature_names[feat]["pval"] > \
                        self.config.pvalue_threshold / 2.0:
                    self.best_features.remove(feat)

        return self
Example #9
def capitalize(text, language, resources):
    tokens = tokenize_light(text, language)
    stop_words = get_stop_words(resources)
    return get_default_sep(language).join(
        t.title() if t.lower() not in stop_words else t.lower()
        for t in tokens)
Example #10
def capitalize(text, language):
    tokens = tokenize_light(text, language)
    stop_words = get_stop_words(language)
    return get_default_sep(language).join(
        t.title() if t.lower() not in stop_words
        else t.lower() for t in tokens)
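As a self-contained rendering of the same capitalization rule, with a plain whitespace tokenizer standing in for the language-aware tokenize_light and get_default_sep:

# Sketch of the rule above: title-case every token except stop words.
# The whitespace split is a stand-in for the language-aware tokenizer.
def capitalize(text, stop_words):
    tokens = text.split()
    return " ".join(
        t.lower() if t.lower() in stop_words else t.title()
        for t in tokens)

print(capitalize("the statue of liberty", {"the", "of"}))
# 'the Statue of Liberty'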