def test_get_all_ngrams(self):
    """Check that get_all_ngrams yields every contiguous ngram with its
    token indexes, longest-first per starting position."""
    # Given
    tokens = ["this", "is", "a", "simple", "sentence"]

    # When
    ngrams = get_all_ngrams(tokens)

    # Then: every contiguous span [start, end) of the tokens, ordered by
    # starting position then by increasing length
    expected_ngrams = []
    for start in range(len(tokens)):
        for end in range(start + 1, len(tokens) + 1):
            expected_ngrams.append({
                NGRAM: " ".join(tokens[start:end]),
                TOKEN_INDEXES: list(range(start, end)),
            })
    self.assertListEqual(expected_ngrams, ngrams)
def _get_dataset_entities_features(normalized_stemmed_tokens,
                                   entity_utterances_to_entity_names):
    """Collect the entity names whose utterances match any ngram of the
    normalized stemmed tokens.

    Every ngram of the token sequence is looked up in the
    entity_utterances_to_entity_names mapping; all matches are
    concatenated, in ngram order, into a single flat list.
    """
    return [
        entity_name
        for ngram in get_all_ngrams(normalized_stemmed_tokens)
        for entity_name in entity_utterances_to_entity_names.get(
            ngram[NGRAM], [])
    ]
def test_get_all_ngrams(self):
    """get_all_ngrams should return every contiguous ngram together with
    the indexes of the tokens it spans."""
    # Given
    tokens = ["this", "is", "a", "simple", "sentence"]

    # When
    ngrams = get_all_ngrams(tokens)

    # Then: all spans tokens[i:j], grouped by start index i, each group
    # ordered by increasing length
    expected_ngrams = [
        {NGRAM: " ".join(tokens[i:j]), TOKEN_INDEXES: list(range(i, j))}
        for i in range(len(tokens))
        for j in range(i + 1, len(tokens) + 1)
    ]
    self.assertListEqual(expected_ngrams, ngrams)
def _get_word_cluster_features(query_tokens, clusters_name, language):
    """Return the word-cluster feature for each ngram of the query tokens.

    Args:
        query_tokens: sequence of token strings to extract ngrams from.
        clusters_name: name of the word-cluster resource to use; when
            falsy, no features are extracted.
        language: language identifier forwarded to the cluster lookup.

    Returns:
        List of cluster values, one per ngram (lowercased) found in the
        cluster mapping; ngrams with no cluster entry are skipped.
    """
    if not clusters_name:
        return []
    # Fetch the cluster mapping once: it does not depend on the loop
    # variable, and the original re-fetched it for every ngram.
    word_clusters = get_word_cluster(language, clusters_name)
    cluster_features = []
    for ngram in get_all_ngrams(query_tokens):
        cluster = word_clusters.get(ngram[NGRAM].lower())
        if cluster is not None:
            cluster_features.append(cluster)
    return cluster_features
def _get_word_cluster_features(query_tokens, clusters_name, language):
    """Return the word-cluster feature for each ngram of the query tokens.

    Args:
        query_tokens: sequence of token strings to extract ngrams from.
        clusters_name: key of the cluster mapping inside the language's
            word clusters; when falsy, no features are extracted.
        language: language whose word clusters are looked up.

    Returns:
        List of cluster values, one per ngram (lowercased) found in the
        cluster mapping; ngrams with no cluster entry are skipped.
    """
    if not clusters_name:
        return []
    # Resolve the cluster mapping once instead of re-resolving
    # get_word_clusters(language)[clusters_name] for every ngram.
    word_clusters = get_word_clusters(language)[clusters_name]
    cluster_features = []
    for ngram in get_all_ngrams(query_tokens):
        cluster = word_clusters.get(ngram[NGRAM].lower())
        if cluster is not None:
            cluster_features.append(cluster)
    return cluster_features
def _get_word_cluster_features(query_tokens, language):
    """Return the word-cluster feature for each ngram of the query tokens.

    The cluster resource to use is chosen per language via
    CLUSTER_USED_PER_LANGUAGES; languages with no configured cluster
    yield no features.

    Args:
        query_tokens: sequence of token strings to extract ngrams from.
        language: language whose configured word clusters are used.

    Returns:
        List of cluster values, one per ngram (lowercased) found in the
        cluster mapping; ngrams with no cluster entry are skipped.
    """
    cluster_name = CLUSTER_USED_PER_LANGUAGES.get(language, False)
    if not cluster_name:
        return []
    # Resolve the cluster mapping once instead of re-resolving
    # get_word_clusters(language)[cluster_name] for every ngram.
    word_clusters = get_word_clusters(language)[cluster_name]
    cluster_features = []
    for ngram in get_all_ngrams(query_tokens):
        cluster = word_clusters.get(ngram[NGRAM].lower())
        if cluster is not None:
            cluster_features.append(cluster)
    return cluster_features
def collection_match(tokens, token_index):
    """Return the tagging-scheme prefix for the longest normalized ngram
    covering token_index that is present in collection_set, or None when
    no covering ngram matches.

    NOTE(review): relies on `self`, `collection_set` and
    `get_scheme_prefix` from the enclosing scope.
    """
    normalized = [self._transform(token) for token in tokens]
    # Keep only the ngrams that cover the requested token, longest first
    # (sort is stable, so ties keep their original relative order).
    covering = [
        candidate for candidate in get_all_ngrams(normalized)
        if token_index in candidate[TOKEN_INDEXES]
    ]
    covering.sort(key=lambda candidate: len(candidate[TOKEN_INDEXES]),
                  reverse=True)
    for candidate in covering:
        if candidate[NGRAM] in collection_set:
            return get_scheme_prefix(token_index,
                                     sorted(candidate[TOKEN_INDEXES]),
                                     self.tagging_scheme)
    return None