def test_get_all_ngrams(self):
        """Check that get_all_ngrams returns every contiguous n-gram of the
        tokens, each paired with the indexes of the tokens it spans."""
        # Given
        tokens = ["this", "is", "a", "simple", "sentence"]

        # When
        ngrams = get_all_ngrams(tokens)

        # Then: n-grams are ordered by start position, then by length
        expected_ngrams = [
            {NGRAM: " ".join(tokens[start:end]),
             TOKEN_INDEXES: list(range(start, end))}
            for start in range(len(tokens))
            for end in range(start + 1, len(tokens) + 1)
        ]
        self.assertListEqual(expected_ngrams, ngrams)
Example #2
0
def _get_dataset_entities_features(normalized_stemmed_tokens,
                                   entity_utterances_to_entity_names):
    """Collect the entity names whose utterances match any n-gram of the
    normalized stemmed tokens, in n-gram order."""
    features = []
    for gram in get_all_ngrams(normalized_stemmed_tokens):
        features.extend(
            entity_utterances_to_entity_names.get(gram[NGRAM], []))
    return features
Example #3
0
def _get_dataset_entities_features(normalized_stemmed_tokens,
                                   entity_utterances_to_entity_names):
    """Return the entity names associated with every n-gram of the tokens
    that appears in the utterance-to-names mapping."""
    return [
        entity_name
        for gram in get_all_ngrams(normalized_stemmed_tokens)
        for entity_name in entity_utterances_to_entity_names.get(
            gram[NGRAM], [])
    ]
Example #4
0
    def test_get_all_ngrams(self):
        """get_all_ngrams should enumerate all contiguous n-grams together
        with the indexes of the tokens they cover."""
        # Given
        tokens = ["this", "is", "a", "simple", "sentence"]

        # When
        ngrams = get_all_ngrams(tokens)

        # Then: every slice tokens[i:j], ordered by start index then length
        expected_ngrams = [
            {NGRAM: " ".join(tokens[i:j]), TOKEN_INDEXES: list(range(i, j))}
            for i in range(len(tokens))
            for j in range(i + 1, len(tokens) + 1)
        ]
        self.assertListEqual(expected_ngrams, ngrams)
Example #5
0
def _get_word_cluster_features(query_tokens, clusters_name, language):
    """Return the word-cluster labels matching the n-grams of *query_tokens*.

    Args:
        query_tokens: tokens of the query.
        clusters_name: name of the word-cluster resource; a falsy value
            disables the feature and yields an empty list.
        language: language used to select the cluster mapping.

    Returns:
        list: one cluster label per n-gram found in the cluster mapping,
        in n-gram order.
    """
    if not clusters_name:
        return []
    # The cluster mapping does not depend on the current n-gram, so fetch
    # it once instead of once per n-gram (loop-invariant hoisting).
    word_clusters = get_word_cluster(language, clusters_name)
    cluster_features = []
    for ngram in get_all_ngrams(query_tokens):
        cluster = word_clusters.get(ngram[NGRAM].lower(), None)
        if cluster is not None:
            cluster_features.append(cluster)
    return cluster_features
Example #6
0
def _get_word_cluster_features(query_tokens, clusters_name, language):
    """Map each n-gram of the query tokens to its word-cluster label,
    skipping n-grams absent from the cluster mapping."""
    if not clusters_name:
        return []
    # Lazily look up each lowercased n-gram in the named cluster mapping
    labels = (
        get_word_clusters(language)[clusters_name].get(gram[NGRAM].lower(),
                                                       None)
        for gram in get_all_ngrams(query_tokens)
    )
    return [label for label in labels if label is not None]
Example #7
0
def _get_word_cluster_features(query_tokens, language):
    """Return the word-cluster labels matching the n-grams of *query_tokens*.

    Args:
        query_tokens: tokens of the query.
        language: language of the query; languages with no configured
            cluster in CLUSTER_USED_PER_LANGUAGES yield an empty list.

    Returns:
        list: one cluster label per n-gram found in the cluster mapping,
        in n-gram order.
    """
    cluster_name = CLUSTER_USED_PER_LANGUAGES.get(language, False)
    if not cluster_name:
        return []
    # Loop-invariant hoisting: resolve the cluster mapping once rather
    # than re-evaluating get_word_clusters(language)[cluster_name] for
    # every n-gram.
    word_clusters = get_word_clusters(language)[cluster_name]
    cluster_features = []
    for ngram in get_all_ngrams(query_tokens):
        cluster = word_clusters.get(ngram[NGRAM].lower(), None)
        if cluster is not None:
            cluster_features.append(cluster)
    return cluster_features
Example #8
0
def _get_word_cluster_features(query_tokens, language):
    """Collect word-cluster labels for all n-grams of the query tokens.

    Args:
        query_tokens: tokens of the query.
        language: language of the query, used both to pick the cluster
            name (via CLUSTER_USED_PER_LANGUAGES) and the cluster data.

    Returns:
        list: cluster labels for the n-grams present in the mapping,
        preserving n-gram order.
    """
    cluster_name = CLUSTER_USED_PER_LANGUAGES.get(language, False)
    if not cluster_name:
        return []
    # Fetch the cluster mapping once: the dict lookup does not depend on
    # the loop variable, so calling it per n-gram was wasted work.
    word_clusters = get_word_clusters(language)[cluster_name]
    cluster_features = []
    for ngram in get_all_ngrams(query_tokens):
        cluster = word_clusters.get(ngram[NGRAM].lower(), None)
        if cluster is not None:
            cluster_features.append(cluster)
    return cluster_features
Example #9
0
 def collection_match(tokens, token_index):
     # Normalize the tokens, then find the longest n-gram covering
     # token_index that belongs to the collection; return its tagging
     # scheme prefix, or None if no such n-gram exists.
     normalized = [self._transform(token) for token in tokens]
     covering = [gram for gram in get_all_ngrams(normalized)
                 if token_index in gram[TOKEN_INDEXES]]
     # Longest n-grams first (stable sort, like sorted(reverse=True))
     covering.sort(key=lambda gram: len(gram[TOKEN_INDEXES]), reverse=True)
     for candidate in covering:
         if candidate[NGRAM] in collection_set:
             return get_scheme_prefix(token_index,
                                      sorted(candidate[TOKEN_INDEXES]),
                                      self.tagging_scheme)
     return None
Example #10
0
 def collection_match(tokens, token_index):
     # Return the tagging-scheme prefix of the longest collection n-gram
     # that contains token_index, or None when none matches.
     normalized_tokens = list(map(self._transform, tokens))
     candidates = sorted(
         (gram for gram in get_all_ngrams(normalized_tokens)
          if token_index in gram[TOKEN_INDEXES]),
         key=lambda gram: len(gram[TOKEN_INDEXES]),
         reverse=True)
     matched = next(
         (gram for gram in candidates if gram[NGRAM] in collection_set),
         None)
     if matched is None:
         return None
     return get_scheme_prefix(token_index, sorted(matched[TOKEN_INDEXES]),
                              self.tagging_scheme)