def _slot_matching_lambda(lhs_slot, rhs_slot):
    lhs_value = lhs_slot["text"]
    rhs_value = rhs_slot["rawValue"]
    if lhs_slot["entity"] != "snips/datetime":
        return lhs_value == rhs_value
    else:
        # Allow fuzzy matching when comparing datetimes
        lhs_tokens = tokenize_light(lhs_value, LANGUAGE_EN)
        rhs_tokens = tokenize_light(rhs_value, LANGUAGE_EN)
        if lhs_tokens and lhs_tokens[0].lower() in SKIPPED_DATE_PREFIXES:
            lhs_tokens = lhs_tokens[1:]
        if rhs_tokens and rhs_tokens[0].lower() in SKIPPED_DATE_PREFIXES:
            rhs_tokens = rhs_tokens[1:]
        return lhs_tokens == rhs_tokens
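# A minimal, self-contained sketch of the fuzzy datetime comparison above.
# It uses a naive whitespace split instead of tokenize_light and assumes
# SKIPPED_DATE_PREFIXES contains leading words such as "the"; both are
# assumptions made for illustration only.
ASSUMED_SKIPPED_DATE_PREFIXES = {"the", "at", "on", "in"}


def _datetime_values_fuzzy_match(lhs_value, rhs_value):
    lhs_tokens = lhs_value.split()
    rhs_tokens = rhs_value.split()
    if lhs_tokens and lhs_tokens[0].lower() in ASSUMED_SKIPPED_DATE_PREFIXES:
        lhs_tokens = lhs_tokens[1:]
    if rhs_tokens and rhs_tokens[0].lower() in ASSUMED_SKIPPED_DATE_PREFIXES:
        rhs_tokens = rhs_tokens[1:]
    return lhs_tokens == rhs_tokens


# "the 3rd of may" and "3rd of may" only differ by a skipped prefix
assert _datetime_values_fuzzy_match("the 3rd of may", "3rd of may")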
def generate_noise_utterances(augmented_utterances, noise, num_intents,
                              data_augmentation_config, language,
                              random_state):
    if not augmented_utterances or not num_intents:
        return []
    avg_num_utterances = len(augmented_utterances) / float(num_intents)
    if data_augmentation_config.unknown_words_replacement_string is not None:
        noise = generate_smart_noise(
            noise, augmented_utterances,
            data_augmentation_config.unknown_words_replacement_string,
            language)
    noise_size = min(
        int(data_augmentation_config.noise_factor * avg_num_utterances),
        len(noise))
    utterances_lengths = [
        len(tokenize_light(get_text_from_chunks(u[DATA]), language))
        for u in augmented_utterances
    ]
    mean_utterances_length = np.mean(utterances_lengths)
    std_utterances_length = np.std(utterances_lengths)
    noise_it = get_noise_it(noise, mean_utterances_length,
                            std_utterances_length, random_state)
    # Remove duplicate 'unknownword unknownword'
    return [
        text_to_utterance(UNKNOWNWORD_REGEX.sub(UNKNOWNWORD, next(noise_it)))
        for _ in range(noise_size)
    ]
def _utterance_to_pattern(self, utterance, stop_words, entity_placeholders):
    from snips_nlu_utils import normalize

    slot_names_count = defaultdict(int)
    pattern = []
    for chunk in utterance[DATA]:
        if SLOT_NAME in chunk:
            slot_name = chunk[SLOT_NAME]
            slot_names_count[slot_name] += 1
            group_name = self.slot_names_to_group_names[slot_name]
            count = slot_names_count[slot_name]
            if count > 1:
                group_name = "%s_%s" % (group_name, count)
            placeholder = entity_placeholders[chunk[ENTITY]]
            pattern.append(r"(?P<%s>%s)" % (group_name, placeholder))
        else:
            tokens = tokenize_light(chunk[TEXT], self.language)
            pattern += [
                regex_escape(t.lower()) for t in tokens
                if normalize(t) not in stop_words
            ]
    pattern = r"^%s%s%s$" % (WHITESPACE_PATTERN,
                             WHITESPACE_PATTERN.join(pattern),
                             WHITESPACE_PATTERN)
    return pattern
def _init_vectorizer(self, language):
    from sklearn.feature_extraction.text import (
        TfidfVectorizer as SklearnTfidfVectorizer)

    self._tfidf_vectorizer = SklearnTfidfVectorizer(
        tokenizer=lambda x: tokenize_light(x, language))
    return self
def generate_smart_noise(noise, augmented_utterances, replacement_string,
                         language):
    text_utterances = [get_text_from_chunks(u[DATA])
                       for u in augmented_utterances]
    vocab = [w for u in text_utterances for w in tokenize_light(u, language)]
    vocab = set(vocab)
    return [w if w in vocab else replacement_string for w in noise]
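# Tiny illustration of the smart-noise replacement above, using plain lists
# instead of dataset utterances: any noise word that never occurs in the
# training vocabulary is collapsed onto a single replacement token. The
# vocabulary, noise corpus and replacement string below are made up.
example_vocab = {"turn", "on", "the", "lights"}
example_noise = ["please", "turn", "off", "the", "radio"]
example_replacement = "unknownword"
example_smart_noise = [w if w in example_vocab else example_replacement
                       for w in example_noise]
# -> ["unknownword", "turn", "unknownword", "the", "unknownword"]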
def _preprocess_text(self, txt, intent):
    """Removes stop words and replaces characters that are tokenized out
    with whitespace"""
    stop_words = self._get_intent_stop_words(intent)
    tokens = tokenize_light(txt, self.language)
    cleaned_string = " ".join(
        [tkn for tkn in tokens if normalize(tkn) not in stop_words])
    return cleaned_string.lower()
def fit(self, dataset, utterances, classes):
    utterances_texts = (get_text_from_chunks(u[DATA]) for u in utterances)
    if not any(tokenize_light(q, self.language) for q in utterances_texts):
        return None
    utterances_to_features = _get_utterances_to_features_names(
        dataset, self.language)
    normalized_utterances_to_features = defaultdict(set)
    for k, v in iteritems(utterances_to_features):
        normalized_utterances_to_features[_normalize_stem(
            k, self.language)].update(v)
    if self.unknown_words_replacement_string is not None \
            and self.unknown_words_replacement_string in \
            normalized_utterances_to_features:
        normalized_utterances_to_features.pop(
            self.unknown_words_replacement_string)
    self.entity_utterances_to_feature_names = dict(
        normalized_utterances_to_features)

    preprocessed_utterances = self.preprocess_utterances(utterances)
    # pylint: disable=C0103
    X_train_tfidf = self.tfidf_vectorizer.fit_transform(
        preprocessed_utterances)
    # pylint: enable=C0103
    features_idx = {
        self.tfidf_vectorizer.vocabulary_[word]: word
        for word in self.tfidf_vectorizer.vocabulary_
    }
    stop_words = get_stop_words(self.language)

    _, pval = chi2(X_train_tfidf, classes)
    self.best_features = [
        i for i, v in enumerate(pval) if v < self.config.pvalue_threshold
    ]
    if not self.best_features:
        self.best_features = [
            idx for idx, val in enumerate(pval) if val == pval.min()
        ]

    feature_names = {}
    for utterance_index in self.best_features:
        feature_names[utterance_index] = {
            "word": features_idx[utterance_index],
            "pval": pval[utterance_index]
        }
    for feat in feature_names:
        if feature_names[feat]["word"] in stop_words:
            if feature_names[feat]["pval"] > \
                    self.config.pvalue_threshold / 2.0:
                self.best_features.remove(feat)
    return self
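# Minimal, self-contained sketch of the chi2-based feature selection used in
# fit() above: keep the tf-idf features whose p-value is below a threshold,
# and fall back to the features with the smallest p-value when none pass.
# The toy texts, classes and threshold are made up for illustration.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2

example_texts = ["turn on the lights", "turn off the lights",
                 "what is the weather", "will it rain tomorrow"]
example_classes = np.array([0, 0, 1, 1])
example_tfidf = TfidfVectorizer().fit_transform(example_texts)
_, example_pval = chi2(example_tfidf, example_classes)
example_threshold = 0.4  # hypothetical p-value threshold
example_best = [i for i, v in enumerate(example_pval)
                if v < example_threshold]
if not example_best:
    example_best = [i for i, v in enumerate(example_pval)
                    if v == example_pval.min()]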
def _enrich_utterance(self, x, builtin_ents, custom_ents):
    utterance = get_text_from_chunks(x[DATA])
    all_entities = builtin_ents + custom_ents
    placeholder_fn = self._placeholder_fn
    # Replace entities with placeholders
    enriched_utterance = replace_entities_with_placeholders(
        utterance, all_entities, placeholder_fn)[1]
    # Tokenize
    enriched_utterance = tokenize_light(enriched_utterance, self.language)
    # Remove the unknownword strings if needed
    if self.config.unknown_words_replacement_string:
        enriched_utterance = [
            t for t in enriched_utterance
            if t != self.config.unknown_words_replacement_string
        ]
    return enriched_utterance
def from_path(cls, path, **shared):
    import numpy as np
    import scipy.sparse as sp
    from sklearn.feature_extraction.text import (
        TfidfTransformer, TfidfVectorizer as SklearnTfidfVectorizer)

    path = Path(path)

    model_path = path / "vectorizer.json"
    if not model_path.exists():
        raise LoadingError("Missing vectorizer model file: %s"
                           % model_path.name)
    with model_path.open("r", encoding="utf-8") as f:
        vectorizer_dict = json.load(f)

    vectorizer = cls(vectorizer_dict["config"], **shared)
    vectorizer._language = vectorizer_dict["language_code"]

    builtin_entity_scope = vectorizer_dict["builtin_entity_scope"]
    if builtin_entity_scope is not None:
        builtin_entity_scope = set(builtin_entity_scope)
    vectorizer.builtin_entity_scope = builtin_entity_scope

    vectorizer_ = vectorizer_dict["vectorizer"]
    if vectorizer_:
        # Rebuild the underlying sklearn vectorizer from the persisted
        # vocabulary and idf diagonal
        vocab = vectorizer_["vocab"]
        idf_diag_data = vectorizer_["idf_diag"]
        idf_diag_data = np.array(idf_diag_data)

        idf_diag_shape = (len(idf_diag_data), len(idf_diag_data))
        row = list(range(idf_diag_shape[0]))
        col = list(range(idf_diag_shape[0]))
        idf_diag = sp.csr_matrix(
            (idf_diag_data, (row, col)), shape=idf_diag_shape)

        tfidf_transformer = TfidfTransformer()
        tfidf_transformer._idf_diag = idf_diag

        vectorizer_ = SklearnTfidfVectorizer(
            tokenizer=lambda x: tokenize_light(x, vectorizer._language))
        vectorizer_.vocabulary_ = vocab
        vectorizer_._tfidf = tfidf_transformer

    vectorizer._tfidf_vectorizer = vectorizer_
    return vectorizer
def _create_custom_entity_parser_configuration(
        entities, stopwords_fraction, language):
    """Dynamically creates the gazetteer parser configuration.

    Args:
        entities (dict): entities of the dataset
        stopwords_fraction (float): fraction of the vocabulary of the entity
            values that will be considered as stop words (the top
            n_vocabulary * stopwords_fraction most frequent words will be
            considered stop words)
        language (str): language of the entities

    Returns:
        the parser configuration as dictionary
    """
    if not 0 < stopwords_fraction < 1:
        raise ValueError("stopwords_fraction must be in ]0.0, 1.0[")

    parser_configurations = []
    for entity_name, entity in sorted(iteritems(entities)):
        vocabulary = set(
            t for raw_value in entity[UTTERANCES]
            for t in tokenize_light(raw_value, language)
        )
        num_stopwords = int(stopwords_fraction * len(vocabulary))
        config = {
            "entity_identifier": entity_name,
            "entity_parser": {
                "threshold": entity[MATCHING_STRICTNESS],
                "n_gazetteer_stop_words": num_stopwords,
                "gazetteer": [
                    {
                        "raw_value": k,
                        "resolved_value": v
                    }
                    for k, v in sorted(iteritems(entity[UTTERANCES]))
                ]
            }
        }
        if LICENSE_INFO in entity:
            config["entity_parser"][LICENSE_INFO] = entity[LICENSE_INFO]
        parser_configurations.append(config)

    configuration = {
        "entity_parsers": parser_configurations
    }

    return configuration
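# Hypothetical example of the configuration shape produced above for a single
# toy custom entity, assuming UTTERANCES maps raw values to resolved values,
# MATCHING_STRICTNESS is 1.0 and stopwords_fraction is 0.5 (the vocabulary
# {"iced", "coffee", "tea"} then yields one gazetteer stop word). All values
# below are made up for illustration.
example_parser_configuration = {
    "entity_parsers": [
        {
            "entity_identifier": "beverage",
            "entity_parser": {
                "threshold": 1.0,
                "n_gazetteer_stop_words": 1,
                "gazetteer": [
                    {"raw_value": "iced coffee", "resolved_value": "coffee"},
                    {"raw_value": "tea", "resolved_value": "tea"},
                ]
            }
        }
    ]
}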
def _query_to_pattern(query, joined_entity_utterances,
                      group_names_to_slot_names, language):
    pattern = []
    for chunk in query[DATA]:
        if SLOT_NAME in chunk:
            max_index = _generate_new_index(group_names_to_slot_names)
            slot_name = chunk[SLOT_NAME]
            entity = chunk[ENTITY]
            group_names_to_slot_names[max_index] = slot_name
            pattern.append(
                r"(?P<%s>%s)" % (max_index, joined_entity_utterances[entity]))
        else:
            tokens = tokenize_light(chunk[TEXT], language)
            pattern += [regex_escape(t) for t in tokens]
    pattern = r"^%s%s%s$" % (WHITESPACE_PATTERN,
                             WHITESPACE_PATTERN.join(pattern),
                             WHITESPACE_PATTERN)
    return pattern, group_names_to_slot_names
def fit_transform(self, dataset, utterances, classes, none_class):
    dataset = validate_and_format_dataset(dataset)
    self.language = dataset[LANGUAGE]

    utterances_texts = (get_text_from_chunks(u[DATA]) for u in utterances)
    if not any(tokenize_light(q, self.language) for q in utterances_texts):
        raise _EmptyDatasetUtterancesError("Tokenized utterances are empty")

    x_tfidf = self._fit_transform_tfidf_vectorizer(
        utterances, classes, dataset)
    x = x_tfidf
    if self.config.added_cooccurrence_feature_ratio:
        self._fit_cooccurrence_vectorizer(
            utterances, classes, none_class, dataset)
        x_cooccurrence = self.cooccurrence_vectorizer.transform(utterances)
        x = sp.hstack((x_tfidf, x_cooccurrence))
    return x
def _preprocess_utterance(utterance, language, builtin_entity_parser,
                          custom_entity_parser, word_clusters_name,
                          use_stemming, unknownword_replacement_string):
    utterance_text = get_text_from_chunks(utterance[DATA])
    utterance_tokens = tokenize_light(utterance_text, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [_normalize_stem(t, language, use_stemming)
                                 for t in utterance_tokens]

    custom_entities = custom_entity_parser.parse(
        " ".join(normalized_stemmed_tokens))
    custom_entities = [e for e in custom_entities
                       if e["value"] != unknownword_replacement_string]
    custom_entities_features = [
        _entity_name_to_feature(e[ENTITY_KIND], language)
        for e in custom_entities]

    builtin_entities = builtin_entity_parser.parse(
        utterance_text, use_cache=True)
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove values of builtin slots from the utterance to avoid learning
    # specific samples such as '42' or 'tomorrow'
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(chunk[TEXT], language, use_stemming)
        for chunk in utterance[DATA]
        if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])
    ]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if custom_entities_features:
        features += " " + " ".join(sorted(custom_entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))

    return features
def _get_joined_entity_utterances(dataset, language):
    joined_entity_utterances = dict()
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        # matches are performed in a case insensitive manner
        utterances = set(u.lower() for u in entity[UTTERANCES])
        patterns = []
        if is_builtin_entity(entity_name):
            # We add a placeholder value for builtin entities
            placeholder = _get_entity_name_placeholder(entity_name, language)
            patterns.append(regex_escape(placeholder))
        else:
            for utterance in utterances:
                tokens = tokenize_light(utterance, language)
                pattern = WHITESPACE_PATTERN.join(
                    regex_escape(t) for t in tokens)
                patterns.append(pattern)
        patterns = (p for p in patterns if p)
        joined_entity_utterances[entity_name] = r"|".join(
            sorted(patterns, key=len, reverse=True))
    return joined_entity_utterances
def get_string_variations(string, language):
    variations = {string}
    variations.update(flatten(case_variations(v) for v in variations))
    variations.update(
        flatten(normalization_variations(v) for v in variations))
    # We re-generate case variations as normalization can produce new
    # variations
    variations.update(flatten(case_variations(v) for v in variations))
    variations.update(
        flatten(and_variations(v, language) for v in variations))
    variations.update(
        flatten(punctuation_variations(v, language) for v in variations))
    variations.update(
        flatten(numbers_variations(v, language) for v in variations))

    # Add single space variations
    single_space_variations = set(" ".join(v.split()) for v in variations)
    variations.update(single_space_variations)
    # Add tokenized variations
    tokenized_variations = set(
        get_default_sep(language).join(tokenize_light(v, language))
        for v in variations)
    variations.update(tokenized_variations)
    return variations
def _preprocess(self, utterances, training=False):
    normalized_utterances = deepcopy(utterances)
    for u in normalized_utterances:
        for chunk in u[DATA]:
            chunk[TEXT] = _normalize_stem(
                chunk[TEXT], self.language, self.resources,
                self.config.use_stemming)

    if training:
        builtin_ents, custom_ents = zip(
            *[_entities_from_utterance(u) for u in utterances])
    else:
        # Extract builtin entities on unnormalized utterances
        builtin_ents = [
            self.builtin_entity_parser.parse(
                get_text_from_chunks(u[DATA]),
                self.builtin_entity_scope, use_cache=True)
            for u in utterances
        ]
        # Extract custom entities on normalized utterances
        custom_ents = [
            self.custom_entity_parser.parse(
                get_text_from_chunks(u[DATA]), use_cache=True)
            for u in normalized_utterances
        ]

    if self.config.word_clusters_name:
        # Extract word clusters on unnormalized utterances
        original_utterances_text = [get_text_from_chunks(u[DATA])
                                    for u in utterances]
        w_clusters = [
            _get_word_cluster_features(
                tokenize_light(u.lower(), self.language),
                self.config.word_clusters_name,
                self.resources)
            for u in original_utterances_text
        ]
    else:
        w_clusters = [None for _ in normalized_utterances]

    return normalized_utterances, builtin_ents, custom_ents, w_clusters
def fit(self, dataset, utterances, classes):
    self.fit_builtin_entity_parser_if_needed(dataset)
    self.fit_custom_entity_parser_if_needed(dataset)

    utterances_texts = (get_text_from_chunks(u[DATA]) for u in utterances)
    if not any(tokenize_light(q, self.language) for q in utterances_texts):
        return None

    preprocessed_utterances = self.preprocess_utterances(utterances)
    # pylint: disable=C0103
    X_train_tfidf = self.tfidf_vectorizer.fit_transform(
        preprocessed_utterances)
    # pylint: enable=C0103
    features_idx = {self.tfidf_vectorizer.vocabulary_[word]: word
                    for word in self.tfidf_vectorizer.vocabulary_}

    stop_words = get_stop_words(self.language)

    _, pval = chi2(X_train_tfidf, classes)
    self.best_features = [i for i, v in enumerate(pval)
                          if v < self.config.pvalue_threshold]
    if not self.best_features:
        self.best_features = [idx for idx, val in enumerate(pval)
                              if val == pval.min()]

    feature_names = {}
    for utterance_index in self.best_features:
        feature_names[utterance_index] = {
            "word": features_idx[utterance_index],
            "pval": pval[utterance_index]}

    for feat in feature_names:
        if feature_names[feat]["word"] in stop_words:
            if feature_names[feat]["pval"] > \
                    self.config.pvalue_threshold / 2.0:
                self.best_features.remove(feat)
    return self
def _preprocess_utterance(utterance, language,
                          entity_utterances_to_features_names,
                          word_clusters_name):
    utterance_text = get_text_from_chunks(utterance[DATA])
    utterance_tokens = tokenize_light(utterance_text, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [
        _normalize_stem(t, language) for t in utterance_tokens
    ]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    builtin_entities = get_builtin_entities(
        utterance_text, language, use_cache=True)
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove values of builtin slots from the utterance to avoid learning
    # specific samples such as '42' or 'tomorrow'
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(chunk[TEXT], language)
        for chunk in utterance[DATA]
        if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])
    ]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))

    return features
def _builtin_entity_to_feature(builtin_entity_label, language):
    return "builtinentityfeature%s" % "".join(
        tokenize_light(builtin_entity_label.lower(), language))
def _entity_name_to_feature(entity_name, language):
    return "entityfeature%s" % "".join(
        tokenize_light(entity_name.lower(), language))
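# Illustration of the feature names produced by the two helpers above, using
# a naive split on non-alphanumeric characters as a stand-in for
# tokenize_light (an assumption made for this sketch only).
import re


def _tokenize_light_sketch(text):
    return re.findall(r"\w+", text)


print("builtinentityfeature%s"
      % "".join(_tokenize_light_sketch("snips/number".lower())))
# -> builtinentityfeaturesnipsnumber
print("entityfeature%s"
      % "".join(_tokenize_light_sketch("smart light".lower())))
# -> entityfeaturesmartlight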
def _get_tfidf_vectorizer(language, sublinear_tf=False):
    return TfidfVectorizer(tokenizer=lambda x: tokenize_light(x, language),
                           sublinear_tf=sublinear_tf)
def _get_entity_name_placeholder(entity_label, language):
    return "%%%s%%" % "".join(
        tokenize_light(entity_label, language)).upper()
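# Example of the placeholder format produced above: for the builtin entity
# label "snips/datetime" (assuming tokenize_light splits it into the tokens
# "snips" and "datetime"), the placeholder becomes:
print("%%%s%%" % "snipsdatetime".upper())  # -> %SNIPSDATETIME%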
def _has_any_capitalization(entity_utterances, language):
    for utterance in entity_utterances:
        tokens = tokenize_light(utterance, language)
        if any(t.isupper() or t.istitle() for t in tokens):
            return True
    return False
def stem_function(text, language):
    return get_default_sep(language).join(
        [_stem(t) for t in tokenize_light(text, language)])
def capitalize(text, language, resources):
    tokens = tokenize_light(text, language)
    stop_words = get_stop_words(resources)
    return get_default_sep(language).join(
        t.title() if t.lower() not in stop_words else t.lower()
        for t in tokens)
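# Self-contained sketch of the capitalization rule above: title-case every
# token except stop words. A plain whitespace split and a made-up stop-word
# set stand in for tokenize_light and the language resources.
example_stop_words = {"the", "of"}
example_text = "the new york times"
print(" ".join(
    t.title() if t.lower() not in example_stop_words else t.lower()
    for t in example_text.split()))
# -> "the New York Times"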
def _placeholder_fn(self, entity_name):
    return "".join(
        tokenize_light(str(entity_name), str(self.language))).upper()
def placeholder_fn(x):
    return "%%%s%%" % "".join(tokenize_light(x, "en")).upper()
def _init_vectorizer(self, language):
    self._tfidf_vectorizer = SklearnTfidfVectorizer(
        tokenizer=lambda x: tokenize_light(x, language))
    return self
variations.update(
    flatten(normalization_variations(v) for v in variations))
# We re-generate case variations as normalization can produce new
# variations
if case:
    variations.update(flatten(case_variations(v) for v in variations))
if and_:
    variations.update(
        flatten(and_variations(v, language) for v in variations))
if punctuation:
    variations.update(
        flatten(punctuation_variations(v, language) for v in variations))

# Special case for number variations, which are slow to generate because
# the BuiltinEntityParser runs on each variation
if numbers:
    variations.update(
        flatten(
            numbers_variations(v, language, builtin_entity_parser)
            for v in variations))

# Add single space variations
single_space_variations = set(" ".join(v.split()) for v in variations)
variations.update(single_space_variations)
# Add tokenized variations
tokenized_variations = set(
    get_default_sep(language).join(tokenize_light(v, language))
    for v in variations)
variations.update(tokenized_variations)
return variations