Exemplo n.º 1
0
 def compute_feature(self, tokens, token_index):
     """Return the word-cluster id of the token at *token_index*, or None.

     The token is stemmed when ``self.use_stemming`` is set, otherwise
     normalized, and the resulting string is looked up in the cluster
     mapping named ``self.cluster_name`` for ``self.language``.
     """
     token = tokens[token_index]
     key = (stem_token(token, self.language) if self.use_stemming
            else normalize_token(token))
     clusters = get_word_clusters(self.language)
     # dict.get returns None by default when the key is absent
     return clusters[self.cluster_name].get(key)
Exemplo n.º 2
0
 def compute_feature(self, tokens, token_index):
     """Return the n-gram of length ``self.n`` starting at *token_index*.

     Tokens in the window are stemmed (when ``self.use_stemming``) or
     normalized, then joined with the language's default separator.
     When ``self.gazetteer`` is set, any value not in the gazetteer is
     replaced by the literal ``"rare_word"``.  Returns None when the
     window does not fit inside *tokens*.
     """
     num_tokens = len(tokens)
     window_end = token_index + self.n
     # Guard clause: reject out-of-range start or a window past the end.
     if not (0 <= token_index < num_tokens) or window_end > num_tokens:
         return None
     window = tokens[token_index:window_end]
     if self.use_stemming:
         values = [stem_token(t, self.language) for t in window]
     else:
         values = [normalize_token(t) for t in window]
     if self.gazetteer is not None:
         values = [v if v in self.gazetteer else "rare_word" for v in values]
     return get_default_sep(self.language).join(values)
Exemplo n.º 3
0
 def _transform(self, tokens):
     """Return new Token objects carrying stemmed/normalized values.

     Each input token's value is stemmed (when ``self.use_stemming``) or
     normalized, and the tokens are re-laid-out contiguously: each new
     token starts right after the previous one plus a one-character gap.
     """
     if self.use_stemming:
         values = [stem_token(t, self.language) for t in tokens]
     else:
         values = [normalize_token(t) for t in tokens]
     result = []
     offset = 0
     for value in values:
         end = offset + len(value)
         result.append(Token(value=value, start=offset, end=end))
         # +1 leaves room for a single separator between tokens
         offset = end + 1
     return result
Exemplo n.º 4
0
 def compute_feature(self, tokens, token_index):
     """Return the cluster id for the token at *token_index*, or None.

     Looks the stemmed/normalized token value up in ``self.cluster``.
     """
     token = tokens[token_index]
     if self.use_stemming:
         # NOTE(review): this variant passes self.resources where the other
         # examples pass self.language — presumably a different stem_token
         # signature in this version of the library; verify against its API.
         key = stem_token(token, self.resources)
     else:
         key = normalize_token(token)
     # dict.get returns None by default when the key is absent
     return self.cluster.get(key)
Exemplo n.º 5
0
 def _transform(self, token):
     """Return the stem of *token* when stemming is enabled, else its
     normalized form."""
     return (stem_token(token, self.language) if self.use_stemming
             else normalize_token(token))