Example #1
def generate_token_feature_extractor(docs: Iterable[Document],
                                     props: dict,
                                     char_padding_size: int = 0):
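    """Build a TokenFeatureExtractor and the matching TokenFeaturesMeta.

    Token features are assembled from word embeddings, categorical
    word-level / morphological features, gazetteers, precomputed vectors
    selected by "vectors_keys" and char-level features, all configured
    via props.
    """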
    we_converters_preprocessors, we_meta = _init_we_features(docs, props)

    morph_features = props.get('morph_feats_list', DEFAULT_FEATS_LIST)
    word_meta, word_converters = get_categorical_meta_converters(
        _init_word_level_features(docs, props, morph_features))

    gazetteer_meta, gazetteer_fes = generate_gazetteers_feature_extractors(
        props)
    word_meta += gazetteer_meta

    vectors_keys = props.get('vectors_keys', [])
    if len(vectors_keys) != len(set(vectors_keys)):
        raise ValueError('"vectors_keys" should not contain duplicates')
    vectorized_features = _init_vectorized_features(docs, vectors_keys)

    if not vectorized_features and not we_converters_preprocessors:
        warn("Neither word embeddings nor vectorized features were specified")

    word_meta += BasicFeaturesMeta([], [], vectorized_features)

    char_level_features = _init_char_level_features(docs, props,
                                                    char_padding_size)
    # We assume that char features are only embedded features
    char_features, _ = init_categorical_features(char_level_features)

    return (
        TokenFeatureExtractor(we_converters_preprocessors, word_converters,
                              char_level_features, vectors_keys, gazetteer_fes),
        TokenFeaturesMeta(we_meta, word_meta, CharsFeaturesMeta(char_features)))
Example #2
def _collect_categorical_features(features_meta: BasicFeaturesMeta, placeholders, input_dims):
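    """Create an int32 placeholder per categorical feature, wire it into an
    embedding lookup or a one-hot encoding, and register every placeholder
    in the placeholders dict under the feature name."""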
    embedded_features = []

    for feature in features_meta.get_embedded_features():
        placeholder = tf.placeholder(tf.int32, shape=[None]*input_dims, name=feature['name'] + '_placeholder')
        emb = create_embedding_lookup(placeholder, feature)
        embedded_features.append(emb)
        placeholders[feature['name']] = placeholder

    one_hot_features = []

    for feature in features_meta.get_one_hot_features():
        placeholder = tf.placeholder(tf.int32, shape=[None]*input_dims, name=feature['name'] + '_placeholder')
        one_hot = tf.one_hot(placeholder, feature['size'])
        one_hot_features.append(one_hot)
        placeholders[feature['name']] = placeholder

    return embedded_features, one_hot_features
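
The create_embedding_lookup helper used above is not shown in these
examples. A minimal sketch of one plausible implementation, assuming each
feature config carries its vocabulary size under 'size' and an embedding
width under 'emb_dim' (the latter key is an assumption, not confirmed by
the snippets):

import tensorflow as tf

def create_embedding_lookup(placeholder, feature):
    # Hypothetical sketch: one trainable [vocab_size, emb_dim] matrix per
    # feature, indexed by the int32 placeholder built by the caller.
    emb_matrix = tf.get_variable(
        feature['name'] + '_embedding',
        shape=[feature['size'], feature['emb_dim']],
        initializer=tf.random_uniform_initializer(-0.1, 0.1))
    return tf.nn.embedding_lookup(emb_matrix, placeholder)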
Example #3
def _collect_vectorized_features(features_meta: BasicFeaturesMeta, placeholders, input_dims):
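    """Create a float32 placeholder of shape [None]*input_dims + [size] per
    vectorized (real-valued) feature and register it under the feature
    name."""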
    vectorized_features = []

    for feature in features_meta.get_vectorized_features():
        placeholder = tf.placeholder(tf.float32, shape=[None]*input_dims + [feature['size']],
                                     name=feature['name'] + '_placeholder')
        vectorized_features.append(placeholder)
        placeholders[feature['name']] = placeholder

    return vectorized_features
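
These helpers only register placeholders at graph-build time; values are
supplied later through feed_dict. A self-contained sketch with a stub
metadata object (the stub and the 'pos_vectors' feature are hypothetical
stand-ins for a real BasicFeaturesMeta):

import numpy as np
import tensorflow as tf

class _StubMeta:
    # Hypothetical stand-in for BasicFeaturesMeta with one
    # three-dimensional vectorized feature.
    def get_vectorized_features(self):
        return [{'name': 'pos_vectors', 'size': 3}]

placeholders = {}
features = _collect_vectorized_features(_StubMeta(), placeholders, input_dims=2)

with tf.Session() as sess:
    batch = np.zeros((2, 5, 3), dtype=np.float32)  # [batch, tokens, size]
    print(sess.run(features[0],
                   feed_dict={placeholders['pos_vectors']: batch}).shape)
    # -> (2, 5, 3)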
Example #4
def create_entity_feature_extractor(docs: Iterable[Document], props: dict,
                                    shared_feature_extractor):
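    """Build an EntityFeatureExtractor on top of the shared token-level
    extractor and return it with the metadata of its encoder features."""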
    continuous_converters = _init_continuous_converters(props, docs)
    vectorized_features = _get_vectorized_features(continuous_converters)

    encoder_config = _get_encoder_features(props, docs)
    encoder_embedded_features, encoder_one_hot_features = init_categorical_features(
        encoder_config)
    encoder_features_meta = BasicFeaturesMeta(encoder_embedded_features,
                                              encoder_one_hot_features,
                                              vectorized_features)
    encoder_features_converters = get_converters_from_features_config(
        encoder_config)

    return EntityFeatureExtractor(shared_feature_extractor,
                                  encoder_features_converters,
                                  continuous_converters,
                                  props.get('morph_feats_list', []),
                                  props.get('speech_types', []),
                                  props.get('identity_features',
                                            [])), encoder_features_meta
Example #5
def create_coref_feature_extractor(docs: Iterable[Document], props: dict,
                                   shared_feature_extractor):
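    """Build a CorefFeatureExtractor (reusing the entity feature extractor)
    and the Metas bundle for its encoder, attention and classifier
    features."""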
    rel_dict = create_rel_dict([doc.relations for doc in docs])
    sampling_strategy = _get_coref_sampling_startegy(props)

    classifier_config = _get_classifier_features(props, docs)
    classifier_features_meta = BasicFeaturesMeta(
        *init_categorical_features(classifier_config))
    classifier_features_converters = get_converters_from_features_config(
        classifier_config)

    entity_fe, encoder_features_meta = create_entity_feature_extractor(
        docs, props, shared_feature_extractor)

    coref_feature_extractor = CorefFeatureExtractor(
        entity_fe, sampling_strategy, rel_dict, classifier_features_converters,
        props.get('agreement_types', []))

    return coref_feature_extractor, Metas(encoder_features_meta,
                                          get_empty_attention_meta(),
                                          classifier_features_meta)
Example #6
def get_categorical_meta_converters(features_config):
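    """Return (BasicFeaturesMeta, converters) for a categorical features
    config."""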
    return (BasicFeaturesMeta(*init_categorical_features(features_config)),
            get_converters_from_features_config(features_config))
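
Example #1 shows the intended call pattern: the returned pair is unpacked
as word_meta, word_converters inside generate_token_feature_extractor.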