Example #1
0
def compute_feature(self, tokens, token_index):
    max_len = len(tokens)
    end = token_index + self.n
    if 0 <= token_index < max_len and end <= max_len:
        return get_default_sep(self.language).join(
            get_shape(t.value) for t in tokens[token_index:end])
    return None
Example #2
0
def compute_feature(self, tokens, token_index):
    max_len = len(tokens)
    end = token_index + self.n
    if 0 <= token_index < max_len and end <= max_len:
        return get_default_sep(self.language).join(
            get_shape(t.value) for t in tokens[token_index:end])
    return None
Example #3
0
    def _enrich_utterance(self, utterance, builtin_entities, custom_entities,
                          word_clusters):
        custom_entities_features = [
            _entity_name_to_feature(e[ENTITY_KIND], self.language)
            for e in custom_entities
        ]

        builtin_entities_features = [
            _builtin_entity_to_feature(ent[ENTITY_KIND], self.language)
            for ent in builtin_entities
        ]

        # We remove values of builtin slots from the utterance to avoid
        # learning specific samples such as '42' or 'tomorrow'
        filtered_tokens = [
            chunk[TEXT] for chunk in utterance[DATA]
            if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])
        ]

        features = get_default_sep(self.language).join(filtered_tokens)

        if builtin_entities_features:
            features += " " + " ".join(sorted(builtin_entities_features))
        if custom_entities_features:
            features += " " + " ".join(sorted(custom_entities_features))
        if word_clusters:
            features += " " + " ".join(sorted(word_clusters))

        return features
Example #4
0
def compute_feature(self, tokens, token_index):
    max_len = len(tokens)
    end = token_index + self.n
    if 0 <= token_index < max_len and end <= max_len:
        if self.gazetteer is None:
            if self.use_stemming:
                return get_default_sep(self.language).join(
                    t.stem for t in tokens[token_index:end])
            return get_default_sep(self.language).join(
                t.normalized_value for t in tokens[token_index:end])
        words = []
        for t in tokens[token_index:end]:
            normalized = t.stem if self.use_stemming else \
                t.normalized_value
            words.append(normalized if normalized in self.gazetteer
                         else "rare_word")
        return get_default_sep(self.language).join(words)
    return None
Example #5
0
def compute_feature(self, tokens, token_index):
    max_len = len(tokens)
    end = token_index + self.n
    if 0 <= token_index < max_len and end <= max_len:
        if self.gazetteer is None:
            if self.use_stemming:
                return get_default_sep(self.language).join(
                    t.stem for t in tokens[token_index:end])
            return get_default_sep(self.language).join(
                t.normalized_value for t in tokens[token_index:end])
        words = []
        for t in tokens[token_index:end]:
            normalized = t.stem if self.use_stemming else \
                t.normalized_value
            words.append(normalized if normalized in
                         self.gazetteer else "rare_word")
        return get_default_sep(self.language).join(words)
    return None
Example #6
0
def compute_feature(self, tokens, token_index):
    max_len = len(tokens)
    end = token_index + self.n
    if 0 <= token_index < max_len and end <= max_len:
        if self.gazetteer is None:
            if self.use_stemming:
                stems = (stem_token(t, self.language)
                         for t in tokens[token_index:end])
                return get_default_sep(self.language).join(stems)
            normalized_values = (normalize_token(t)
                                 for t in tokens[token_index:end])
            return get_default_sep(self.language).join(normalized_values)
        words = []
        for t in tokens[token_index:end]:
            if self.use_stemming:
                value = stem_token(t, self.language)
            else:
                value = normalize_token(t)
            words.append(value if value in self.gazetteer else "rare_word")
        return get_default_sep(self.language).join(words)
    return None
Example #7
0
def _preprocess_query(query, language, entity_utterances_to_features_names):
    query_tokens = tokenize_light(query, language)
    word_clusters_features = _get_word_cluster_features(query_tokens, language)
    normalized_stemmed_tokens = [_normalize_stem(t, language)
                                 for t in query_tokens]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    features = get_default_sep(language).join(normalized_stemmed_tokens)
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))
    return features
Example #8
0
def _preprocess_query(query, language, entity_utterances_to_features_names):
    query_tokens = tokenize_light(query, language)
    word_clusters_features = _get_word_cluster_features(query_tokens, language)
    normalized_stemmed_tokens = [
        _normalize_stem(t, language) for t in query_tokens
    ]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    features = get_default_sep(language).join(normalized_stemmed_tokens)
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))
    return features
Example #9
0
def _load_gazetteers(gazetteers_path, language):
    if not gazetteers_path.is_dir():
        return dict()

    gazetteers = dict()
    for filepath in gazetteers_path.iterdir():
        gazetteer_name = filepath.stem
        with filepath.open(encoding="utf8") as f:
            gazetteers[gazetteer_name] = set()
            for line in f:
                normalized = normalize(line.strip())
                if normalized:
                    token_values = (t.value
                                    for t in tokenize(normalized, language))
                    normalized = get_default_sep(language).join(token_values)
                    gazetteers[gazetteer_name].add(normalized)
    return gazetteers
Example #10
0
def _load_gazetteers(language):
    gazetteers_paths = {
        os.path.splitext(name)[0]: os.path.join(get_resources_path(language),
                                                name)
        for name in RESOURCE_INDEX[language].get(GAZETTEERS, [])
    }
    gazetteers = dict()
    for name, path in iteritems(gazetteers_paths):
        with io.open(path, encoding="utf8") as f:
            gazetteers[name] = set()
            for line in f:
                normalized = normalize(line.strip())
                if normalized:
                    token_values = (t.value
                                    for t in tokenize(normalized, language))
                    normalized = get_default_sep(language).join(token_values)
                    gazetteers[name].add(normalized)
    return gazetteers
Example #11
0
def _preprocess_utterance(utterance, language, builtin_entity_parser,
                          custom_entity_parser, word_clusters_name,
                          use_stemming, unknownword_replacement_string):
    utterance_text = get_text_from_chunks(utterance[DATA])
    utterance_tokens = tokenize_light(utterance_text, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [_normalize_stem(t, language, use_stemming)
                                 for t in utterance_tokens]

    custom_entities = custom_entity_parser.parse(
        " ".join(normalized_stemmed_tokens))
    custom_entities = [e for e in custom_entities
                       if e["value"] != unknownword_replacement_string]
    custom_entities_features = [
        _entity_name_to_feature(e[ENTITY_KIND], language)
        for e in custom_entities]

    builtin_entities = builtin_entity_parser.parse(
        utterance_text, use_cache=True)
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove values of builtin slots from the utterance to avoid learning
    # specific samples such as '42' or 'tomorrow'
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(chunk[TEXT], language, use_stemming)
        for chunk in utterance[DATA]
        if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])
    ]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if custom_entities_features:
        features += " " + " ".join(sorted(custom_entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))

    return features
Example #12
0
def get_string_variations(string, language):
    variations = {string}
    variations.update(flatten(case_variations(v) for v in variations))
    variations.update(flatten(normalization_variations(v) for v in variations))
    # We re-generate case variations as normalization can produce new
    # variations
    variations.update(flatten(case_variations(v) for v in variations))
    variations.update(flatten(and_variations(v, language) for v in variations))
    variations.update(
        flatten(punctuation_variations(v, language) for v in variations))
    variations.update(
        flatten(numbers_variations(v, language) for v in variations))
    # Add single space variations
    single_space_variations = set(" ".join(v.split()) for v in variations)
    variations.update(single_space_variations)
    # Add tokenized variations
    tokenized_variations = set(
        get_default_sep(language).join(tokenize_light(v, language)) for v in
        variations)
    variations.update(tokenized_variations)
    return variations
Example #13
0
def get_string_variations(string, language):
    variations = {string}
    variations.update(flatten(case_variations(v) for v in variations))
    variations.update(flatten(normalization_variations(v) for v in variations))
    # We re-generate case variations as normalization can produce new
    # variations
    variations.update(flatten(case_variations(v) for v in variations))
    variations.update(flatten(and_variations(v, language) for v in variations))
    variations.update(
        flatten(punctuation_variations(v, language) for v in variations))
    variations.update(
        flatten(numbers_variations(v, language) for v in variations))
    # Add single space variations
    single_space_variations = set(" ".join(v.split()) for v in variations)
    variations.update(single_space_variations)
    # Add tokenized variations
    tokenized_variations = set(
        get_default_sep(language).join(tokenize_light(v, language))
        for v in variations)
    variations.update(tokenized_variations)
    return variations
Example #14
0
def _preprocess_utterance(utterance, language,
                          entity_utterances_to_features_names,
                          word_clusters_name):
    utterance_tokens = tokenize_light(utterance, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [_normalize_stem(t, language)
                                 for t in utterance_tokens]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    builtin_entities = get_builtin_entities(utterance, language)
    entities_ranges = (
        e[RES_MATCH_RANGE] for e in
        sorted(builtin_entities, key=lambda e: e[RES_MATCH_RANGE][START])
    )
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove builtin entities from the utterance to avoid learning specific
    # examples such as '42'
    filtered_utterance = _remove_ranges(utterance, entities_ranges)
    filtered_utterance_tokens = tokenize_light(filtered_utterance, language)
    filtered_normalized_stemmed_tokens = [_normalize_stem(t, language)
                                          for t in filtered_utterance_tokens]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))

    return features
Example #15
0
def _preprocess_utterance(utterance, language,
                          entity_utterances_to_features_names,
                          word_clusters_name):
    utterance_tokens = tokenize_light(utterance, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [
        _normalize_stem(t, language) for t in utterance_tokens
    ]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    builtin_entities = get_builtin_entities(utterance, language)
    entities_ranges = (e[RES_MATCH_RANGE] for e in sorted(
        builtin_entities, key=lambda e: e[RES_MATCH_RANGE][START]))
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove builtin entities from the utterance to avoid learning specific
    # examples such as '42'
    filtered_utterance = _remove_ranges(utterance, entities_ranges)
    filtered_utterance_tokens = tokenize_light(filtered_utterance, language)
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(t, language) for t in filtered_utterance_tokens
    ]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))

    return features
Example #16
0
def _preprocess_utterance(utterance, language,
                          entity_utterances_to_features_names,
                          word_clusters_name):
    utterance_text = get_text_from_chunks(utterance[DATA])
    utterance_tokens = tokenize_light(utterance_text, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [
        _normalize_stem(t, language) for t in utterance_tokens
    ]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    builtin_entities = get_builtin_entities(utterance_text,
                                            language,
                                            use_cache=True)
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove values of builtin slots from the utterance to avoid learning
    # specific samples such as '42' or 'tomorrow'
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(chunk[TEXT], language) for chunk in utterance[DATA]
        if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])
    ]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))

    return features
Example #17
0
def stem_function(text, language):
    return get_default_sep(language).join(
        [_stem(t) for t in tokenize_light(text, language)])
Example #18
0
def capitalize(text, language):
    tokens = tokenize_light(text, language)
    stop_words = get_stop_words(language)
    return get_default_sep(language).join(
        t.title() if t.lower() not in stop_words
        else t.lower() for t in tokens)
Example #19
0
def get_string_variations(string, language, builtin_entity_parser,
                          numbers=True, case=True, and_=True,
                          punctuation=True):
    variations = {string}
    if case:
        variations.update(flatten(case_variations(v) for v in variations))
    variations.update(flatten(normalization_variations(v) for v in variations))
    # We re-generate case variations as normalization can produce new
    # variations
    if case:
        variations.update(flatten(case_variations(v) for v in variations))
    if and_:
        variations.update(
            flatten(and_variations(v, language) for v in variations))
    if punctuation:
        variations.update(
            flatten(punctuation_variations(v, language) for v in variations))

    # Special case of the number variations, which are slow to generate
    # because the BuiltinEntityParser runs on each variation
    if numbers:
        variations.update(
            flatten(
                numbers_variations(v, language, builtin_entity_parser)
                for v in variations))

    # Add single space variations
    single_space_variations = set(" ".join(v.split()) for v in variations)
    variations.update(single_space_variations)
    # Add tokenized variations
    tokenized_variations = set(
        get_default_sep(language).join(tokenize_light(v, language))
        for v in variations)
    variations.update(tokenized_variations)
    return variations
Example #20
0
def capitalize(text, language, resources):
    tokens = tokenize_light(text, language)
    stop_words = get_stop_words(resources)
    return get_default_sep(language).join(
        t.title() if t.lower() not in stop_words else t.lower()
        for t in tokens)