def generate_token_feature_extractor(docs: Iterable[Document], props: dict, char_padding_size: int = 0):
    """Build a token-level feature extractor and its accompanying metadata.

    Combines word-embedding features, categorical word-level features
    (including morphology), gazetteer features, externally vectorized
    features and char-level features into a single extractor.

    :param docs: documents used to initialise converters / vocabularies
    :param props: configuration dict (keys used here: 'morph_feats_list',
        'vectors_keys'; other keys are consumed by the helpers)
    :param char_padding_size: padding applied to char-level features
    :return: (TokenFeatureExtractor, TokenFeaturesMeta) pair
    :raises ValueError: if props['vectors_keys'] contains duplicates
    """
    we_converters_preprocessors, we_meta = _init_we_features(docs, props)

    morph_features = props.get('morph_feats_list', DEFAULT_FEATS_LIST)
    word_meta, word_converters = get_categorical_meta_converters(
        _init_word_level_features(docs, props, morph_features))

    gazetteer_meta, gazetteer_fes = generate_gazetteers_feature_extractors(props)
    word_meta += gazetteer_meta

    vectors_keys = props.get('vectors_keys', [])
    # Duplicate keys would silently collide downstream, so fail fast.
    if len(vectors_keys) != len(set(vectors_keys)):
        raise ValueError('"vectors_keys" should not contain duplicates')

    vectorized_features = _init_vectorized_features(docs, vectors_keys)
    if not vectorized_features and not we_converters_preprocessors:
        warn("Neither word embeddings nor vectorized features were specified")
    word_meta += BasicFeaturesMeta([], [], vectorized_features)

    char_level_features = _init_char_level_features(docs, props, char_padding_size)
    # We assume that char features are only embedded features
    char_features, _ = init_categorical_features(char_level_features)

    return TokenFeatureExtractor(
        we_converters_preprocessors, word_converters, char_level_features,
        vectors_keys, gazetteer_fes), \
        TokenFeaturesMeta(we_meta, word_meta, CharsFeaturesMeta(char_features))
def _collect_categorical_features(features_meta: BasicFeaturesMeta, placeholders, input_dims):
    """Create int32 placeholders for each categorical feature and turn them
    into tensors.

    Embedded features go through an embedding lookup; one-hot features are
    expanded with ``tf.one_hot``. Every created placeholder is registered in
    the ``placeholders`` dict under the feature name.

    :param features_meta: metadata describing embedded / one-hot features
    :param placeholders: dict to be filled with name -> placeholder (mutated)
    :param input_dims: rank of each placeholder (all dimensions dynamic)
    :return: (embedded feature tensors, one-hot feature tensors)
    """
    dynamic_shape = [None] * input_dims

    embedded = []
    for feat in features_meta.get_embedded_features():
        ph = tf.placeholder(tf.int32, shape=dynamic_shape, name=feat['name'] + '_placeholder')
        placeholders[feat['name']] = ph
        embedded.append(create_embedding_lookup(ph, feat))

    one_hot = []
    for feat in features_meta.get_one_hot_features():
        ph = tf.placeholder(tf.int32, shape=dynamic_shape, name=feat['name'] + '_placeholder')
        placeholders[feat['name']] = ph
        one_hot.append(tf.one_hot(ph, feat['size']))

    return embedded, one_hot
def _collect_vectorized_features(features_meta: BasicFeaturesMeta, placeholders, input_dims):
    """Create float32 placeholders for pre-vectorized features.

    Each placeholder has ``input_dims`` dynamic dimensions plus a trailing
    fixed dimension of the feature's size. Placeholders are registered in
    the ``placeholders`` dict (mutated) and also returned as the feature
    tensors themselves — no further transformation is applied.

    :param features_meta: metadata describing vectorized features
    :param placeholders: dict to be filled with name -> placeholder (mutated)
    :param input_dims: number of leading dynamic dimensions
    :return: list of the created placeholders
    """
    collected = []
    for feat in features_meta.get_vectorized_features():
        shape = [None] * input_dims + [feat['size']]
        ph = tf.placeholder(tf.float32, shape=shape, name=feat['name'] + '_placeholder')
        placeholders[feat['name']] = ph
        collected.append(ph)
    return collected
def create_entity_feature_extractor(docs: Iterable[Document], props: dict, shared_feature_extractor):
    """Construct an EntityFeatureExtractor together with its encoder metadata.

    Continuous (vectorized) converters and categorical encoder features are
    initialised from the documents and merged into a single
    ``BasicFeaturesMeta`` for the encoder.

    :param docs: documents used to initialise feature converters
    :param props: configuration dict ('morph_feats_list', 'speech_types',
        'identity_features' consumed here; others by helpers)
    :param shared_feature_extractor: token-level extractor shared with it
    :return: (EntityFeatureExtractor, BasicFeaturesMeta) pair
    """
    continuous_converters = _init_continuous_converters(props, docs)
    vectorized = _get_vectorized_features(continuous_converters)

    encoder_config = _get_encoder_features(props, docs)
    embedded, one_hot = init_categorical_features(encoder_config)
    encoder_meta = BasicFeaturesMeta(embedded, one_hot, vectorized)
    encoder_converters = get_converters_from_features_config(encoder_config)

    extractor = EntityFeatureExtractor(
        shared_feature_extractor, encoder_converters, continuous_converters,
        props.get('morph_feats_list', []),
        props.get('speech_types', []),
        props.get('identity_features', []))

    return extractor, encoder_meta
def create_coref_feature_extractor(docs: Iterable[Document], props: dict, shared_feature_extractor):
    """Construct a CorefFeatureExtractor and the metadata bundle for the model.

    Builds a relation dictionary from the documents' relations, a pair
    sampling strategy, classifier-level categorical features, and reuses
    :func:`create_entity_feature_extractor` for the underlying entity encoder.

    :param docs: documents providing relations and feature vocabularies
    :param props: configuration dict ('agreement_types' consumed here;
        others by helpers)
    :param shared_feature_extractor: token-level extractor shared across tasks
    :return: (CorefFeatureExtractor, Metas) pair; the attention meta is empty
    """
    rel_dict = create_rel_dict([doc.relations for doc in docs])
    sampling_strategy = _get_coref_sampling_startegy(props)

    classifier_config = _get_classifier_features(props, docs)
    classifier_meta = BasicFeaturesMeta(*init_categorical_features(classifier_config))
    classifier_converters = get_converters_from_features_config(classifier_config)

    entity_fe, encoder_meta = create_entity_feature_extractor(
        docs, props, shared_feature_extractor)

    extractor = CorefFeatureExtractor(
        entity_fe, sampling_strategy, rel_dict, classifier_converters,
        props.get('agreement_types', []))

    return extractor, Metas(encoder_meta, get_empty_attention_meta(), classifier_meta)
def get_categorical_meta_converters(features_config):
    """Build metadata and converters for a categorical feature configuration.

    :param features_config: feature configuration understood by
        ``init_categorical_features`` / ``get_converters_from_features_config``
    :return: (BasicFeaturesMeta, converters) pair
    """
    meta = BasicFeaturesMeta(*init_categorical_features(features_config))
    converters = get_converters_from_features_config(features_config)
    return meta, converters