def compute_features(self, tokens, drop_out=False):
    """Compute the feature dict of each token in *tokens*.

    When *drop_out* is true, each feature with a positive drop-out ratio
    may be randomly skipped per token; this is meant for training only.
    Returns a list with one feature mapping per input token.
    """
    # Attach a stem to every token, falling back to the normalized value
    # when no stem resources exist for the language.
    if resource_exists(self.language, STEMS):
        tokens = [
            Token(t.value, t.start, t.end,
                  stem=stem(t.normalized_value, self.language))
            for t in tokens
        ]
    else:
        tokens = [
            Token(t.value, t.start, t.end, stem=t.normalized_value)
            for t in tokens
        ]
    cache = [{TOKEN_NAME: tok} for tok in tokens]
    random_state = check_random_state(self.config.random_seed)
    features = []
    for index in range(len(tokens)):
        token_features = UnupdatableDict()
        for feature in self.features:
            ratio = feature.drop_out
            # Randomly drop the feature during training; note rand() is
            # drawn once per (token, feature) pair whenever drop_out is on,
            # keeping the random stream identical to the reference code.
            if drop_out and random_state.rand() < ratio:
                continue
            value = feature.compute(index, cache)
            if value is not None:
                token_features[feature.name] = value
        features.append(token_features)
    return features
def _normalize_stem(text, language):
    """Normalize *text*, then stem it when stem resources are available.

    Falls back to the plain normalized text when the language has no
    stemming resource.
    """
    result = normalize(text)
    try:
        result = stem(result, language)
    except MissingResource:
        # No stems for this language: keep the normalized form.
        pass
    return result
def _normalize_stem(text, language):
    """Normalize *text*, then stem it when stem resources are available.

    Falls back to the plain normalized text when the language has no
    stemming resource.
    """
    result = normalize(text)
    try:
        result = stem(result, language)
    except UnknownResource:
        # No stems for this language: keep the normalized form.
        pass
    return result
def _stem_entity_utterances(entity_utterances, language, resources):
    """Map each stemmed raw value to a resolved value.

    Utterances are processed in sorted order of resolved value so that,
    when two raw values collide on the same stem, the winner is
    deterministic (the smallest resolved value is kept).
    """
    stemmed = {}
    ordered = sorted(iteritems(entity_utterances),
                     key=operator.itemgetter(1))
    for raw_value, resolved_value in ordered:
        key = stem(raw_value, language, resources)
        # setdefault keeps the first (smallest) resolved value on collision
        stemmed.setdefault(key, resolved_value)
    return stemmed
def language(self, value):
    """Set the language and refresh language-dependent state.

    Mirrors the language code into the serialized ``args`` and, when a
    common-words gazetteer is configured, reloads it — stemming each
    entry if stemming is enabled.
    """
    if value is not None:
        self._language = value
        self.args["language_code"] = self.language
        if self.common_words_gazetteer_name is not None:
            gazetteer = get_gazetteer(self.language,
                                      self.common_words_gazetteer_name)
            if self.use_stemming:
                # Set comprehension instead of set(generator) (C401):
                # clearer and avoids an intermediate generator object.
                gazetteer = {stem(w, self.language) for w in gazetteer}
            self.gazetteer = gazetteer
def _normalize_stem(text, language, use_stemming):
    """Return the stemmed form of *text* when *use_stemming* is true,
    otherwise its normalized form."""
    return stem(text, language) if use_stemming else normalize(text)
def _normalize_stem(text, language, resources, use_stemming):
    """Return the stemmed form of *text* when *use_stemming* is true,
    otherwise its normalized form."""
    return (stem(text, language, resources) if use_stemming
            else normalize(text))
def preprocess(string):
    """Normalize *string*, stemming it when stemming is enabled and
    stem resources exist for the current language."""
    normalized = normalize(string)
    # Guard clause; evaluation order of the two checks is preserved.
    if not (resource_exists(self.language, STEMS) and self.use_stemming):
        return normalized
    return stem(normalized, self.language)
def preprocess(string):
    """Normalize *string* and stem it when stemming is enabled."""
    normalized = normalize(string)
    if self.use_stemming:
        return stem(normalized, self.language)
    return normalized
def _stem_entity_utterances(entity_utterances, language):
    """Return a mapping from stemmed raw values to resolved values.

    NOTE(review): if two raw values share the same stem, the value kept
    depends on iteration order of *entity_utterances* — confirm callers
    do not rely on a specific winner.
    """
    stemmed = {}
    for raw_value, resolved_value in iteritems(entity_utterances):
        stemmed[stem(raw_value, language)] = resolved_value
    return stemmed
def _normalize_stem(text, language, resources, use_stemming):
    """Return the stemmed form of *text* when *use_stemming* is true,
    otherwise its normalized form."""
    # Local import kept as in the original to avoid a module-level
    # dependency on snips_nlu_utils at import time.
    from snips_nlu_utils import normalize
    if not use_stemming:
        return normalize(text)
    return stem(text, language, resources)
def preprocess(string):
    """Normalize *string*, stemming it when stemming is enabled."""
    normalized = normalize(string)
    return (stem(normalized, self.language) if self.use_stemming
            else normalized)