Example #1
def generate_feature_extractor(docs: Iterable[Document],
                               props: dict,
                               char_padding_size: int = 0):
    token_feature_extractor, token_features_meta = generate_token_feature_extractor(
        docs, props, char_padding_size)
    types_mapping = _get_ne_type_to_label_mapping(
        docs, set(props.get("restricted_ne_types", set())))
    valid_ne_types = types_mapping.keys()
    valid_ent_types = set(chain.from_iterable(types_mapping.values()))
    types_converter = create_categorical_converter(valid_ent_types,
                                                   zero_padding=False)

    token_position_fe, tp_meta = generate_token_position_feature_extractor(
        props)

    attention_ent_meta, attention_converters = get_categorical_meta_converters(
        _get_features(props, valid_ne_types, "attention"))

    attention_meta = AttentionFeaturesMeta(tp_meta, attention_ent_meta)

    classifier_meta, classifier_converters = get_categorical_meta_converters(
        _get_features(props, valid_ne_types))

    feature_extractor = NETFeatureExtractor(token_feature_extractor,
                                            token_position_fe,
                                            attention_converters,
                                            classifier_converters,
                                            types_converter, types_mapping)
    # no encoder meta in this task
    metas = Metas(get_empty_basic_meta(), attention_meta, classifier_meta)

    return feature_extractor, metas, token_features_meta
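
The only props key this body reads directly is "restricted_ne_types"; the rest is consumed by the helpers it delegates to. A hypothetical call site (corpus loading omitted, values illustrative):

props = {"restricted_ne_types": []}  # passed as a set to _get_ne_type_to_label_mapping
# docs: Iterable[Document] from the corpus readers (loading omitted):
# feature_extractor, metas, token_features_meta = \
#     generate_feature_extractor(docs, props, char_padding_size=1)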
Example #2
File: factory.py  Project: wayne9qiu/derek
def generate_feature_extractor(
        docs: Iterable[Document], props: dict,
        shared_feature_extractor: SpansCommonFeatureExtractor):

    strategy = _get_sampling_strategy(props)
    arc_converter = create_categorical_converter(
        strategy.get_possible_arc_types(docs), zero_padding=False)

    token_position_fe, tp_meta = generate_token_position_feature_extractor(
        props)
    attention_token_meta = (tp_meta.namespaced('head') +
                            tp_meta.namespaced('dep'))

    attention_arc_meta, attention_converters = get_categorical_meta_converters(
        _get_arc_level_features(props, "attention"))

    attention_meta = AttentionFeaturesMeta(attention_token_meta,
                                           attention_arc_meta)

    classifier_meta, classifier_converters = get_categorical_meta_converters(
        _get_arc_level_features(props))

    feature_extractor = ParserFeatureExtractor(shared_feature_extractor,
                                               arc_converter,
                                               token_position_fe,
                                               attention_converters,
                                               classifier_converters, strategy)

    return feature_extractor, Metas(get_empty_basic_meta(), attention_meta,
                                    classifier_meta)
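
One pattern worth noting: the token-position meta is duplicated per arc endpoint via namespacing and the copies are concatenated with +. A minimal sketch of what namespacing plausibly means here; prefixing feature names is an assumption, not the project's verified behavior:

def namespaced(features: dict, prefix: str) -> dict:
    # Assumed behavior: qualify each feature name with the endpoint prefix.
    return {f"{prefix}_{name}": spec for name, spec in features.items()}

tp_features = {"token_position": {"embedding_size": 4}}
attention_token_features = {**namespaced(tp_features, "head"),
                            **namespaced(tp_features, "dep")}
# {'head_token_position': {'embedding_size': 4},
#  'dep_token_position': {'embedding_size': 4}}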
Example #3
def generate_feature_extractor(
        docs: Iterable[Document], props: dict,
        shared_feature_extractor: SpansCommonFeatureExtractor):

    entities_types = collect_entities_types(docs)

    pair_filters = [
        DifferentEntitiesCandidateFilter(),
        InSameSentenceCandidateFilter(),
        MaxTokenDistanceCandidateFilter(props['max_candidate_distance'])
    ]

    if props.get("filter_intersecting", False):
        pair_filters.append(IntersectingCandidateFilter())

    rels = [_filter_doc_rels(doc, AndFilter(pair_filters)) for doc in docs]
    rel_arg_types = create_rel_arg_types(rels)
    pair_filters.append(RelArgTypesCandidateFilter(rel_arg_types))
    candidate_extractor = DefaultCandidateExtractionStrategy(
        DefaultPairExtractionStrategy(AndFilter(pair_filters)))

    rel_dict = create_rel_dict(rels)
    valid_ent_rel_types = collect_valid_rel_types(rels)

    entities_encoder_meta, entities_encoder_converters = get_categorical_meta_converters(
        _get_entities_encoder_features(props, entities_types))

    token_position_fe, tp_meta = generate_token_position_feature_extractor(
        props)
    attention_token_meta = tp_meta.namespaced('e1') + tp_meta.namespaced('e2')

    attention_rel_meta, attention_converters = get_categorical_meta_converters(
        _get_relation_level_features(props, rel_arg_types, entities_types,
                                     "attention"))

    attention_meta = AttentionFeaturesMeta(attention_token_meta,
                                           attention_rel_meta)

    classifier_meta, classifier_converters = get_categorical_meta_converters(
        _get_relation_level_features(props, rel_arg_types, entities_types))

    feature_extractor = NegativeSamplesFilteringFeatureExtractor(
        shared_feature_extractor,
        rel_dict,
        entities_encoder_converters,
        token_position_fe,
        attention_converters,
        classifier_converters,
        candidate_extractor,
        valid_ent_rel_types,
        negative_ratio=props.get("negative_samples_ratio", float("inf")))

    return feature_extractor, Metas(entities_encoder_meta, attention_meta,
                                    classifier_meta)
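
Three props keys are read directly in this body. The sketch below records their semantics as implemented above; the values themselves are illustrative:

props = {
    "max_candidate_distance": 20,   # required: cap on token distance in a pair
    "filter_intersecting": True,    # optional, default False
    "negative_samples_ratio": 3.0,  # optional, default float("inf") (keep all)
}
# extractor, metas = generate_feature_extractor(docs, props, shared_feature_extractor)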
Example #4
def generate_token_feature_extractor(docs: Iterable[Document],
                                     props: dict,
                                     char_padding_size: int = 0):
    we_converters_preprocessors, we_meta = _init_we_features(docs, props)

    morph_features = props.get('morph_feats_list', DEFAULT_FEATS_LIST)
    word_meta, word_converters = get_categorical_meta_converters(
        _init_word_level_features(docs, props, morph_features))

    gazetteer_meta, gazetteer_fes = generate_gazetteers_feature_extractors(
        props)
    word_meta += gazetteer_meta

    vectors_keys = props.get('vectors_keys', [])
    if len(vectors_keys) != len(set(vectors_keys)):
        raise ValueError('"vectors_keys" should not contain duplicates')
    vectorized_features = _init_vectorized_features(docs, vectors_keys)

    if not vectorized_features and not we_converters_preprocessors:
        warn("Neither word embeddings nor vectorized features were specified")

    word_meta += BasicFeaturesMeta([], [], vectorized_features)

    char_level_features = _init_char_level_features(docs, props,
                                                    char_padding_size)
    # We assume that char features are only embedded features
    char_features, _ = init_categorical_features(char_level_features)

    return (TokenFeatureExtractor(we_converters_preprocessors, word_converters,
                                  char_level_features, vectors_keys,
                                  gazetteer_fes),
            TokenFeaturesMeta(we_meta, word_meta,
                              CharsFeaturesMeta(char_features)))
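
The keys read directly here are 'morph_feats_list' and 'vectors_keys' (plus 'gazetteers' via generate_gazetteers_feature_extractors). A hypothetical props dict; the feature and vector names are illustrative:

props = {
    "morph_feats_list": ["Case", "Number"],  # optional, default DEFAULT_FEATS_LIST
    "vectors_keys": ["elmo"],                # keys must be unique (checked above)
}
# extractor, meta = generate_token_feature_extractor(docs, props, char_padding_size=1)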
Example #5
def generate_ne_feature_extractor(docs: Iterable[Document], props: dict):
    features = {}
    labelling_strategy = None

    if props.get("ne_emb_size", -1) >= 0:
        types = collect_entities_types(docs, extras=True)
        labelling_strategy = get_labelling_strategy(
            props.get("ne_labelling_strategy", "IO"))
        features['ne'] = {
            'converter':
            create_categorical_converter(
                labelling_strategy.get_possible_categories(types),
                has_oov=True)
        }
        if props["ne_emb_size"] != 0:
            features["ne"]['embedding_size'] = props["ne_emb_size"]

    meta, converters = get_categorical_meta_converters(features)
    return NEFeatureExtractor(converters, labelling_strategy), meta
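
"ne_emb_size" acts as a three-way switch in the body above; only the example values below are illustrative:

# ne_emb_size absent or < 0 -> NE features disabled (labelling_strategy stays None)
# ne_emb_size == 0          -> converter registered without an 'embedding_size'
# ne_emb_size > 0           -> embedded NE feature of that size
props = {"ne_emb_size": 10, "ne_labelling_strategy": "IO"}  # "IO" is the default
# ne_fe, meta = generate_ne_feature_extractor(docs, props)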
def generate_gazetteers_feature_extractors(props: dict):
    features = {}
    gazetteer_feature_extractors = {}
    # Shared boolean converter: every gazetteer feature encodes whether the
    # (processed) token occurs in the corresponding gazetteer.
    converter = create_categorical_converter({True, False},
                                             zero_padding=True,
                                             has_oov=False)

    for index, config in enumerate(props.get('gazetteers', [])):
        gazetteer_name = f"gazetteer_{index}"
        features[gazetteer_name] = {'converter': converter}
        if config.get('emb_size', -1) > 0:
            features[gazetteer_name]['embedding_size'] = config['emb_size']

        gazetteer = _read_gazetteer(config["path"])
        processor = StandardTokenProcessor.from_props(config)
        gazetteer = set(map(processor, gazetteer))

        gazetteer_feature_extractors[gazetteer_name] = GazetteerFeatureExtractor(
            gazetteer, processor, converter, config.get("lemmatize", False))

    meta, _ = get_categorical_meta_converters(features)
    return meta, gazetteer_feature_extractors
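
Per the body above, each entry of props['gazetteers'] needs a 'path'; 'emb_size' > 0 adds an embedding, 'lemmatize' defaults to False, and remaining keys go to StandardTokenProcessor.from_props. An illustrative configuration (paths hypothetical):

props = {
    "gazetteers": [
        {"path": "data/countries.txt", "emb_size": 5, "lemmatize": True},
        {"path": "data/cities.txt"},  # emb_size absent -> no embedding registered
    ]
}
# meta, extractors = generate_gazetteers_feature_extractors(props)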
def generate_token_position_feature_extractor(props):
    meta, converters = get_categorical_meta_converters(_get_features(props))
    return TokenPositionFeatureExtractor(converters), meta
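
All five examples share one contract: a features dict mapping a feature name to a spec with a 'converter' and an optional 'embedding_size' is split by get_categorical_meta_converters into a meta object (for the model builder) and a converter mapping (for the feature extractor). A sketch of that dict shape; the converter call is copied from Example #5 while the feature names are illustrative:

converter = create_categorical_converter({True, False},
                                         zero_padding=True, has_oov=False)
features = {
    "flag_embedded": {"converter": converter, "embedding_size": 4},
    "flag_plain": {"converter": converter},  # meta without an embedding size
}
# meta, converters = get_categorical_meta_converters(features)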