def compute_feature(self, tokens, token_index):
    """Return the word-cluster id of the token at *token_index*, or None.

    The token is stemmed or normalized depending on ``self.use_stemming``
    before being looked up in the ``self.cluster_name`` cluster mapping
    for ``self.language``.
    """
    token = tokens[token_index]
    if self.use_stemming:
        key = stem_token(token, self.language)
    else:
        key = normalize_token(token)
    clusters = get_word_clusters(self.language)[self.cluster_name]
    # dict.get returns None for unknown keys, i.e. out-of-cluster tokens
    return clusters.get(key)
def compute_feature(self, tokens, token_index):
    """Return the n-gram of length ``self.n`` starting at *token_index*.

    Tokens are stemmed or normalized depending on ``self.use_stemming``,
    then joined with the language's default separator. When a gazetteer
    is set, any value absent from it is replaced by "rare_word". Returns
    None when the window does not fit inside *tokens*.
    """
    end = token_index + self.n
    # Guard clause: the whole window must lie within the token sequence
    if not (0 <= token_index < len(tokens) and end <= len(tokens)):
        return None
    window = tokens[token_index:end]
    if self.use_stemming:
        values = [stem_token(t, self.language) for t in window]
    else:
        values = [normalize_token(t) for t in window]
    if self.gazetteer is not None:
        # Collapse out-of-gazetteer tokens to a shared "rare_word" marker
        values = [v if v in self.gazetteer else "rare_word" for v in values]
    return get_default_sep(self.language).join(values)
def _transform(self, tokens):
    """Rebuild *tokens* with stemmed/normalized values and fresh offsets.

    Each output ``Token`` carries the stem (or normalized form) of the
    corresponding input token; start/end offsets are recomputed as if the
    light values were laid out left to right with one character between
    consecutive tokens.
    """
    if self.use_stemming:
        values = [stem_token(t, self.language) for t in tokens]
    else:
        values = [normalize_token(t) for t in tokens]
    rebuilt = []
    offset = 0
    for value in values:
        token = Token(value=value, start=offset, end=offset + len(value))
        rebuilt.append(token)
        # +1 accounts for the single-character separator between tokens
        offset = token.end + 1
    return rebuilt
def compute_feature(self, tokens, token_index):
    """Return the cluster id of the token at *token_index*, or None.

    The token is stemmed or normalized depending on ``self.use_stemming``
    before being looked up in ``self.cluster``.
    """
    token = tokens[token_index]
    if self.use_stemming:
        # NOTE(review): the original passed self.resources here, while every
        # sibling in this file calls stem_token(token, self.language) —
        # aligned to self.language for consistency. Confirm which signature
        # of stem_token this class is built against.
        value = stem_token(token, self.language)
    else:
        value = normalize_token(token)
    # dict.get returns None for values not present in the cluster
    return self.cluster.get(value)
def _transform(self, token):
    """Return the stemmed form of *token* when ``self.use_stemming`` is set,
    otherwise its normalized form."""
    if not self.use_stemming:
        return normalize_token(token)
    return stem_token(token, self.language)