def _slot_matching_lambda(lhs_slot, rhs_slot):
    lhs_value = lhs_slot["text"]
    rhs_value = rhs_slot["rawValue"]
    if lhs_slot["entity"] != "snips/datetime":
        return lhs_value == rhs_value
    else:
        # Allow fuzzy matching when comparing datetimes
        lhs_tokens = tokenize_light(lhs_value, LANGUAGE_EN)
        rhs_tokens = tokenize_light(rhs_value, LANGUAGE_EN)
        if lhs_tokens and lhs_tokens[0].lower() in SKIPPED_DATE_PREFIXES:
            lhs_tokens = lhs_tokens[1:]
        if rhs_tokens and rhs_tokens[0].lower() in SKIPPED_DATE_PREFIXES:
            rhs_tokens = rhs_tokens[1:]
        return lhs_tokens == rhs_tokens


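# Usage sketch for _slot_matching_lambda (hypothetical slot dicts). For
# entities other than "snips/datetime" the comparison is strict string
# equality; for datetimes, a leading token found in SKIPPED_DATE_PREFIXES
# (assumed here to contain prefixes such as "at") is dropped on both sides
# before the token lists are compared:
#
#     lhs = {"text": "at nine pm", "entity": "snips/datetime"}
#     rhs = {"rawValue": "nine pm"}
#     _slot_matching_lambda(lhs, rhs)  # -> True if "at" is a skipped prefix

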
def generate_noise_utterances(augmented_utterances, num_intents,
                              data_augmentation_config, language,
                              random_state):
    if not augmented_utterances or not num_intents:
        return []
    avg_num_utterances = len(augmented_utterances) / float(num_intents)
    if data_augmentation_config.unknown_words_replacement_string is not None:
        noise = generate_smart_noise(
            augmented_utterances,
            data_augmentation_config.unknown_words_replacement_string,
            language)
    else:
        noise = get_noises(language)

    noise_size = min(
        int(data_augmentation_config.noise_factor * avg_num_utterances),
        len(noise))
    utterances_lengths = [
        len(tokenize_light(get_text_from_chunks(u[DATA]), language))
        for u in augmented_utterances]
    mean_utterances_length = np.mean(utterances_lengths)
    std_utterances_length = np.std(utterances_lengths)
    noise_it = get_noise_it(noise, mean_utterances_length,
                            std_utterances_length, random_state)
    # Remove duplicate 'unknownword unknownword'
    return [
        UNKNOWNWORD_REGEX.sub(UNKNOWNWORD, next(noise_it))
        for _ in range(noise_size)]


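# A minimal, self-contained sketch of the sampling contract assumed for
# get_noise_it above (get_noise_it itself is not shown in this file, so
# this is an illustration rather than the actual implementation): noise
# utterance lengths are drawn from a Gaussian fitted on the training
# utterance lengths.
import numpy as np

def _noise_it_sketch(noise_words, mean_length, std_length, random_state):
    while True:
        # Sample a length close to the training mean, never below 1 token
        length = max(1, int(random_state.normal(mean_length, std_length)))
        yield " ".join(random_state.choice(noise_words, size=length))

# Example: next(_noise_it_sketch(["foo", "bar"], 3, 1,
#                                np.random.RandomState(42)))

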
def generate_smart_noise(augmented_utterances, replacement_string, language):
    text_utterances = [get_text_from_chunks(u[DATA])
                       for u in augmented_utterances]
    vocab = set(w for u in text_utterances
                for w in tokenize_light(u, language))
    noise = get_noises(language)
    # Noise words never seen in the training utterances are mapped to the
    # replacement string
    return [w if w in vocab else replacement_string for w in noise]


def fit(self, dataset, utterances, classes):
    utterances_texts = (get_text_from_chunks(u[DATA]) for u in utterances)
    if not any(tokenize_light(q, self.language) for q in utterances_texts):
        return None

    utterances_to_features = _get_utterances_to_features_names(
        dataset, self.language)
    normalized_utterances_to_features = defaultdict(set)
    for k, v in iteritems(utterances_to_features):
        normalized_utterances_to_features[
            _normalize_stem(k, self.language)].update(v)
    if self.unknown_words_replacement_string is not None \
            and self.unknown_words_replacement_string in \
            normalized_utterances_to_features:
        normalized_utterances_to_features.pop(
            self.unknown_words_replacement_string)
    self.entity_utterances_to_feature_names = dict(
        normalized_utterances_to_features)

    preprocessed_utterances = self.preprocess_utterances(utterances)
    # pylint: disable=C0103
    X_train_tfidf = self.tfidf_vectorizer.fit_transform(
        preprocessed_utterances)
    # pylint: enable=C0103
    features_idx = {self.tfidf_vectorizer.vocabulary_[word]: word
                    for word in self.tfidf_vectorizer.vocabulary_}

    stop_words = get_stop_words(self.language)

    _, pval = chi2(X_train_tfidf, classes)
    self.best_features = [i for i, v in enumerate(pval)
                          if v < self.config.pvalue_threshold]
    if not self.best_features:
        self.best_features = [idx for idx, val in enumerate(pval)
                              if val == pval.min()]

    feature_names = {}
    for utterance_index in self.best_features:
        feature_names[utterance_index] = {
            "word": features_idx[utterance_index],
            "pval": pval[utterance_index]}

    for feat in feature_names:
        if feature_names[feat]["word"] in stop_words:
            if feature_names[feat]["pval"] > \
                    self.config.pvalue_threshold / 2.0:
                self.best_features.remove(feat)
    return self


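# A self-contained sketch of the TF-IDF + chi2 feature selection pattern
# used by fit() above, on toy data and with an illustrative p-value
# threshold (the real threshold comes from self.config.pvalue_threshold):
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2

def _chi2_selection_sketch():
    texts = ["turn on the light", "turn off the light", "play some music"]
    classes = np.array([0, 0, 1])
    X = TfidfVectorizer().fit_transform(texts)
    _, pval = chi2(X, classes)
    # Keep features whose p-value beats the threshold...
    best_features = [i for i, v in enumerate(pval) if v < 0.4]
    if not best_features:
        # ...or fall back to the most discriminative ones
        best_features = [i for i, v in enumerate(pval) if v == pval.min()]
    return best_features

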
def _preprocess_utterance(utterance, language,
                          entity_utterances_to_features_names,
                          word_clusters_name):
    utterance_tokens = tokenize_light(utterance, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [_normalize_stem(t, language)
                                 for t in utterance_tokens]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    builtin_entities = get_builtin_entities(utterance, language)
    entities_ranges = (
        e[RES_MATCH_RANGE] for e in
        sorted(builtin_entities, key=lambda e: e[RES_MATCH_RANGE][START]))
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities]

    # We remove builtin entities from the utterance to avoid learning
    # specific examples such as '42'
    filtered_utterance = _remove_ranges(utterance, entities_ranges)
    filtered_utterance_tokens = tokenize_light(filtered_utterance, language)
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(t, language) for t in filtered_utterance_tokens]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))
    return features


def fit(self, dataset, utterances, classes):
    utterances_to_features = _get_utterances_to_features_names(
        dataset, self.language)
    normalized_utterances_to_features = defaultdict(set)
    for k, v in iteritems(utterances_to_features):
        normalized_utterances_to_features[
            _normalize_stem(k, self.language)].update(v)
    if self.unknown_words_replacement_string is not None \
            and self.unknown_words_replacement_string in \
            normalized_utterances_to_features:
        normalized_utterances_to_features.pop(
            self.unknown_words_replacement_string)
    self.entity_utterances_to_feature_names = dict(
        normalized_utterances_to_features)

    if all(not "".join(tokenize_light(q, self.language))
           for q in utterances):
        return None

    preprocessed_utterances = self.preprocess_utterances(utterances)
    # pylint: disable=C0103
    X_train_tfidf = self.tfidf_vectorizer.fit_transform(
        preprocessed_utterances)
    # pylint: enable=C0103
    list_index_words = {self.tfidf_vectorizer.vocabulary_[word]: word
                        for word in self.tfidf_vectorizer.vocabulary_}

    stop_words = get_stop_words(self.language)

    _, pval = chi2(X_train_tfidf, classes)
    self.best_features = [i for i, v in enumerate(pval)
                          if v < self.config.pvalue_threshold]
    if not self.best_features:
        self.best_features = [idx for idx, val in enumerate(pval)
                              if val == pval.min()]

    feature_names = {}
    for utterance_index in self.best_features:
        feature_names[utterance_index] = {
            "word": list_index_words[utterance_index],
            "pval": pval[utterance_index]}

    for feat in feature_names:
        if feature_names[feat]["word"] in stop_words:
            if feature_names[feat]["pval"] > \
                    self.config.pvalue_threshold / 2.0:
                self.best_features.remove(feat)
    return self


def _preprocess_query(query, language, entity_utterances_to_features_names):
    query_tokens = tokenize_light(query, language)
    word_clusters_features = _get_word_cluster_features(query_tokens,
                                                        language)
    normalized_stemmed_tokens = [_normalize_stem(t, language)
                                 for t in query_tokens]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    features = get_default_sep(language).join(normalized_stemmed_tokens)
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))
    return features


def fit(self, dataset, queries, y):
    utterances_to_features = _get_utterances_to_features_names(
        dataset, self.language)
    normalized_utterances_to_features = defaultdict(set)
    for k, v in iteritems(utterances_to_features):
        normalized_utterances_to_features[
            _normalize_stem(k, self.language)].update(v)
    if self.unknown_words_replacement_string is not None \
            and self.unknown_words_replacement_string in \
            normalized_utterances_to_features:
        normalized_utterances_to_features.pop(
            self.unknown_words_replacement_string)
    self.entity_utterances_to_feature_names = dict(
        normalized_utterances_to_features)

    if all(not "".join(tokenize_light(q, self.language)) for q in queries):
        return None

    preprocessed_queries = self.preprocess_queries(queries)
    # pylint: disable=C0103
    X_train_tfidf = self.tfidf_vectorizer.fit_transform(
        preprocessed_queries)
    # pylint: enable=C0103
    list_index_words = {self.tfidf_vectorizer.vocabulary_[x]: x
                        for x in self.tfidf_vectorizer.vocabulary_}

    stop_words = get_stop_words(self.language)

    _, pval = chi2(X_train_tfidf, y)
    self.best_features = [i for i, v in enumerate(pval)
                          if v < self.pvalue_threshold]
    if not self.best_features:
        self.best_features = [idx for idx, val in enumerate(pval)
                              if val == pval.min()]

    feature_names = {}
    for i in self.best_features:
        feature_names[i] = {'word': list_index_words[i], 'pval': pval[i]}

    for feat in feature_names:
        if feature_names[feat]['word'] in stop_words:
            if feature_names[feat]['pval'] > self.pvalue_threshold / 2.0:
                self.best_features.remove(feat)
    return self


def _query_to_pattern(query, joined_entity_utterances,
                      group_names_to_slot_names, language):
    pattern = []
    for chunk in query[DATA]:
        if SLOT_NAME in chunk:
            max_index = _generate_new_index(group_names_to_slot_names)
            slot_name = chunk[SLOT_NAME]
            entity = chunk[ENTITY]
            group_names_to_slot_names[max_index] = slot_name
            pattern.append(
                r"(?P<%s>%s)"
                % (max_index, joined_entity_utterances[entity]))
        else:
            tokens = tokenize_light(chunk[TEXT], language)
            pattern += [regex_escape(t) for t in tokens]

    ignored_char_pattern = get_ignored_characters_pattern(language)
    pattern = r"^%s%s%s$" % (ignored_char_pattern,
                             ignored_char_pattern.join(pattern),
                             ignored_char_pattern)
    return pattern, group_names_to_slot_names


def _query_to_pattern(query, joined_entity_utterances,
                      group_names_to_slot_names, language):
    pattern = []
    for chunk in query[DATA]:
        if SLOT_NAME in chunk:
            max_index = _generate_new_index(group_names_to_slot_names)
            slot_name = chunk[SLOT_NAME]
            entity = chunk[ENTITY]
            group_names_to_slot_names[max_index] = slot_name
            pattern.append(
                r"(?P<%s>%s)"
                % (max_index, joined_entity_utterances[entity]))
        else:
            tokens = tokenize_light(chunk[TEXT], language)
            pattern += [regex_escape(t) for t in tokens]

    pattern = r"^%s%s%s$" % (WHITESPACE_PATTERN,
                             WHITESPACE_PATTERN.join(pattern),
                             WHITESPACE_PATTERN)
    return pattern, group_names_to_slot_names


def _get_joined_entity_utterances(dataset, language):
    joined_entity_utterances = dict()
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        # matches are performed in a case insensitive manner
        utterances = set(u.lower() for u in entity[UTTERANCES])
        patterns = []
        for utterance in utterances:
            tokens = tokenize_light(utterance, language)
            pattern = WHITESPACE_PATTERN.join(
                regex_escape(t) for t in tokens)
            patterns.append(pattern)
        # We also add a placeholder value for builtin entities
        if is_builtin_entity(entity_name):
            placeholder = _get_entity_name_placeholder(entity_name,
                                                       language)
            patterns.append(regex_escape(placeholder))
        patterns = (p for p in patterns if p)
        # Longest utterances first, so the regex engine prefers them
        joined_entity_utterances[entity_name] = r"|".join(
            sorted(patterns, key=len, reverse=True))
    return joined_entity_utterances


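# A self-contained sketch of how the named-group patterns built by
# _query_to_pattern are later consumed: the synthetic group name recovers
# the slot name, and the matched group recovers the slot value (all names
# and the pattern below are hypothetical; re.IGNORECASE mirrors the
# lowercasing of entity utterances above):
import re

def _named_group_sketch():
    group_names_to_slot_names = {"group0": "location"}
    pattern = r"^\s*weather\s*in\s*(?P<group0>paris|london)\s*$"
    match = re.match(pattern, "weather in paris", re.IGNORECASE)
    slot_name = group_names_to_slot_names["group0"]  # -> "location"
    slot_value = match.group("group0")  # -> "paris"
    return slot_name, slot_value

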
def get_string_variations(string, language):
    variations = {string}
    variations.update(flatten(case_variations(v) for v in variations))
    variations.update(
        flatten(normalization_variations(v) for v in variations))
    # We re-generate case variations as normalization can produce new
    # variations
    variations.update(flatten(case_variations(v) for v in variations))
    variations.update(
        flatten(and_variations(v, language) for v in variations))
    variations.update(
        flatten(punctuation_variations(v, language) for v in variations))
    variations.update(
        flatten(numbers_variations(v, language) for v in variations))

    # Add single space variations
    single_space_variations = set(" ".join(v.split()) for v in variations)
    variations.update(single_space_variations)
    # Add tokenized variations
    tokenized_variations = set(
        get_default_sep(language).join(tokenize_light(v, language))
        for v in variations)
    variations.update(tokenized_variations)
    return variations


def _preprocess_utterance(utterance, language,
                          entity_utterances_to_features_names,
                          word_clusters_name):
    utterance_text = get_text_from_chunks(utterance[DATA])
    utterance_tokens = tokenize_light(utterance_text, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [_normalize_stem(t, language)
                                 for t in utterance_tokens]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    builtin_entities = get_builtin_entities(utterance_text, language,
                                            use_cache=True)
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities]

    # We remove values of builtin slots from the utterance to avoid
    # learning specific samples such as '42' or 'tomorrow'
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(chunk[TEXT], language)
        for chunk in utterance[DATA]
        if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))
    return features


def has_any_capitalization(entity_utterances, language):
    for utterance in entity_utterances:
        tokens = tokenize_light(utterance, language)
        if any(t.isupper() or t.istitle() for t in tokens):
            return True
    return False


def _get_entity_name_placeholder(entity_label, language):
    return "%%%s%%" % "".join(
        tokenize_light(entity_label, language)).upper()


def capitalize(text, language):
    tokens = tokenize_light(text, language)
    stop_words = get_stop_words(language)
    return get_default_sep(language).join(
        t.title() if t.lower() not in stop_words else t.lower()
        for t in tokens)


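# Usage sketch for capitalize (illustrative output; whether a token stays
# lowercase depends on the language's stop words list, so "the" remaining
# lowercase here is an assumption):
#
#     capitalize("the empire state building", LANGUAGE_EN)
#     # -> "the Empire State Building"

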
def _get_tfidf_vectorizer(language, extra_args=None):
    if extra_args is None:
        extra_args = dict()
    return TfidfVectorizer(
        tokenizer=lambda x: tokenize_light(x, language), **extra_args)


def stem_function(text, language):
    return get_default_sep(language).join(
        [_stem(t) for t in tokenize_light(text, language)])


def _get_tfidf_vectorizer(language, sublinear_tf=False):
    return TfidfVectorizer(
        tokenizer=lambda x: tokenize_light(x, language),
        sublinear_tf=sublinear_tf)


def _entity_name_to_feature(entity_name, language):
    return "entityfeature%s" % "".join(
        tokenize_light(entity_name, language=language))


def _get_builtin_entity_name(entity_label, language):
    return "%%%s%%" % "".join(
        tokenize_light(entity_label, language)).upper()


def stem(string, language):
    tokens = tokenize_light(string, language)
    stemmed_tokens = [_stem(token, language) for token in tokens]
    return ' '.join(stemmed_tokens)


def _builtin_entity_to_feature(builtin_entity_label, language):
    return "builtinentityfeature%s" % "".join(
        tokenize_light(builtin_entity_label, language=language))