def compute_features(self, tokens, drop_out=False):
    """Compute the features for every token in *tokens*.

    When *drop_out* is True, each feature whose drop out ratio is
    positive may be randomly skipped; this should only be used during
    training.
    """
    # Stem the tokens when a stems resource is available for the
    # language, otherwise fall back to the normalized value.
    use_stemming = resource_exists(self.language, STEMS)
    stemmed_tokens = []
    for tok in tokens:
        tok_stem = (stem(tok.normalized_value, self.language)
                    if use_stemming else tok.normalized_value)
        stemmed_tokens.append(
            Token(tok.value, tok.start, tok.end, stem=tok_stem))

    token_cache = [{TOKEN_NAME: tok} for tok in stemmed_tokens]
    rng = check_random_state(self.config.random_seed)

    all_features = []
    for index, _ in enumerate(stemmed_tokens):
        current_features = UnupdatableDict()
        for feature in self.features:
            # Short-circuit keeps rng untouched when drop out is off,
            # preserving the random stream.
            skipped = drop_out and rng.rand() < feature.drop_out
            if skipped:
                continue
            result = feature.compute(index, token_cache)
            if result is not None:
                current_features[feature.name] = result
        all_features.append(current_features)
    return all_features
def test_spans_to_tokens_indexes(self):
    """Spans overlapping several tokens should map to all of them."""
    # Given
    spans = [
        {START: 0, END: 1},
        {START: 2, END: 6},
        {START: 5, END: 6},
        {START: 9, END: 15},
    ]
    tokens = [
        Token(value="abc", start=0, end=3, stem="abc"),
        Token(value="def", start=4, end=7, stem="def"),
        Token(value="ghi", start=10, end=13, stem="ghi"),
    ]

    # When
    indexes = _spans_to_tokens_indexes(spans, tokens)

    # Then
    self.assertListEqual(indexes, [[0], [0, 1], [1], [2]])
def test_utterance_to_sample(self, mocked_positive_tagging):
    """A chunked utterance should yield shifted tokens and IO tags."""
    # Given
    language = LANGUAGE_EN

    def fake_positive_tagging(_, slot, slot_size):
        return [INSIDE_PREFIX + slot] * slot_size

    mocked_positive_tagging.side_effect = fake_positive_tagging
    slot_name = "animal"
    query_data = [
        {"text": "i am a "},
        {"text": "beautiful bird", "slot_name": slot_name},
    ]
    slot_tag = INSIDE_PREFIX + slot_name
    expected_tagging = [OUTSIDE, OUTSIDE, OUTSIDE, slot_tag, slot_tag]
    expected_tokens = [
        Token(value='i', start=0, end=1),
        Token(value='am', start=2, end=4),
        Token(value='a', start=5, end=6),
        Token(value='beautiful', start=7, end=16),
        Token(value='bird', start=17, end=21),
    ]

    # When
    sample = utterance_to_sample(query_data, TaggingScheme.IO, language)

    # Then
    self.assertEqual(
        sample, {"tokens": expected_tokens, "tags": expected_tagging})
def test_should_tokenize_symbols(self):
    """Runs of symbols should be kept as whole tokens."""
    # Given
    language = LANGUAGE_EN
    text = "$$ % !!"

    # When
    tokens = tokenize(text, language)

    # Then
    expected = [
        Token(value='$$', start=0, end=2, stem=None),
        Token(value='%', start=3, end=4, stem=None),
        Token(value='!!', start=5, end=7, stem=None),
    ]
    self.assertListEqual(tokens, expected)
def test_should_tokenize_literals(self):
    """Whitespace-separated words should become individual tokens."""
    # Given
    language = LANGUAGE_EN
    text = "Hello Beautiful World"

    # When
    tokens = tokenize(text, language)

    # Then
    expected = [
        Token(value=word, start=begin, end=stop, stem=None)
        for (word, begin, stop) in [
            ('Hello', 0, 5),
            ('Beautiful', 6, 15),
            ('World', 16, 21),
        ]
    ]
    self.assertListEqual(tokens, expected)
def utterance_to_sample(query_data, tagging_scheme, language):
    """Turn an annotated utterance into a {tokens, tags} sample.

    Each chunk is tokenized independently; token offsets are shifted by
    the cumulative length of the preceding chunks so they index into the
    full utterance text. Chunks carrying a slot name get positive tags
    for *tagging_scheme*, the rest get negative (outside) tags.
    """
    all_tokens = []
    all_tags = []
    offset = 0
    for chunk in query_data:
        chunk_text = chunk[TEXT]
        chunk_tokens = tokenize(chunk_text, language)
        for tok in chunk_tokens:
            all_tokens.append(
                Token(tok.value, offset + tok.start, offset + tok.end))
        # Offsets advance by raw chunk length, not token count, so that
        # positions stay aligned with the concatenated utterance text.
        offset += len(chunk_text)
        if SLOT_NAME in chunk:
            all_tags += positive_tagging(
                tagging_scheme, chunk[SLOT_NAME], len(chunk_tokens))
        else:
            all_tags += negative_tagging(len(chunk_tokens))
    return {TOKENS: all_tokens, TAGS: all_tags}