from itertools import chain
from typing import Iterable
from warnings import warn

# NOTE: the generate_feature_extractor variants below are excerpts from
# different task modules (NET, parser, relation extraction). Project-internal
# names (Document, NETFeatureExtractor, get_categorical_meta_converters, ...)
# are assumed to be imported from the surrounding package.


def generate_feature_extractor(docs: Iterable[Document], props: dict, char_padding_size: int = 0):
    token_feature_extractor, token_features_meta = generate_token_feature_extractor(
        docs, props, char_padding_size)

    # map raw NE types to labels, optionally restricted via props
    types_mapping = _get_ne_type_to_label_mapping(
        docs, set(props.get("restricted_ne_types", set())))
    valid_ne_types = types_mapping.keys()
    valid_ent_types = set(chain.from_iterable(types_mapping.values()))
    types_converter = create_categorical_converter(valid_ent_types, zero_padding=False)

    token_position_fe, tp_meta = generate_token_position_feature_extractor(props)

    # attention combines token-position features with entity-level categorical features
    attention_ent_meta, attention_converters = get_categorical_meta_converters(
        _get_features(props, valid_ne_types, "attention"))
    attention_meta = AttentionFeaturesMeta(tp_meta, attention_ent_meta)

    classifier_meta, classifier_converters = get_categorical_meta_converters(
        _get_features(props, valid_ne_types))

    feature_extractor = NETFeatureExtractor(
        token_feature_extractor, token_position_fe, attention_converters,
        classifier_converters, types_converter, types_mapping)

    # no encoder meta in this task
    metas = Metas(get_empty_basic_meta(), attention_meta, classifier_meta)

    return feature_extractor, metas, token_features_meta
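
# --- usage sketch (not part of the original module) ---------------------------
# A minimal, hypothetical wiring of the NET feature extractor. Of the keys this
# function reads directly, only "restricted_ne_types" is shown; token-level keys
# are consumed by generate_token_feature_extractor, and the NE types here are
# placeholders that must match the corpus annotations.
def _example_build_net_extractor(docs: Iterable[Document]):
    props = {
        "restricted_ne_types": {"ORG", "PER"},  # hypothetical NE types to keep
    }
    return generate_feature_extractor(docs, props, char_padding_size=1)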
def generate_feature_extractor(
        docs: Iterable[Document], props: dict,
        shared_feature_extractor: SpansCommonFeatureExtractor):
    # parser task variant: builds arc-level features on top of a shared span extractor
    strategy = _get_sampling_strategy(props)
    arc_converter = create_categorical_converter(
        strategy.get_possible_arc_types(docs), zero_padding=False)

    token_position_fe, tp_meta = generate_token_position_feature_extractor(props)
    # attention sees positional features of both the head and the dependent token
    attention_token_meta = tp_meta.namespaced('head') + tp_meta.namespaced('dep')

    attention_arc_meta, attention_converters = get_categorical_meta_converters(
        _get_arc_level_features(props, "attention"))
    attention_meta = AttentionFeaturesMeta(attention_token_meta, attention_arc_meta)

    classifier_meta, classifier_converters = get_categorical_meta_converters(
        _get_arc_level_features(props))

    feature_extractor = ParserFeatureExtractor(
        shared_feature_extractor, arc_converter, token_position_fe,
        attention_converters, classifier_converters, strategy)

    return feature_extractor, Metas(get_empty_basic_meta(), attention_meta, classifier_meta)
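
# --- usage sketch (not part of the original module) ---------------------------
# Both this parser factory and the relation-extraction one below accept the
# same SpansCommonFeatureExtractor, so a single shared extractor can back
# several task heads. The empty props dict is a placeholder: its sampling and
# arc-level keys are defined by _get_sampling_strategy / _get_arc_level_features
# elsewhere in the package.
def _example_build_parser_extractor(docs: Iterable[Document],
                                    shared_fe: SpansCommonFeatureExtractor):
    props = {}  # placeholder: fill with sampling-strategy and arc-level feature settings
    return generate_feature_extractor(docs, props, shared_fe)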
def generate_feature_extractor(
        docs: Iterable[Document], props: dict,
        shared_feature_extractor: SpansCommonFeatureExtractor):
    # relation-extraction task variant: classifies candidate entity pairs
    entities_types = collect_entities_types(docs)

    # candidate pairs must consist of distinct entities in the same sentence,
    # at most max_candidate_distance tokens apart
    pair_filters = [
        DifferentEntitiesCandidateFilter(),
        InSameSentenceCandidateFilter(),
        MaxTokenDistanceCandidateFilter(props['max_candidate_distance'])
    ]
    if props.get("filter_intersecting", False):
        pair_filters.append(IntersectingCandidateFilter())

    rels = [_filter_doc_rels(doc, AndFilter(pair_filters)) for doc in docs]
    rel_arg_types = create_rel_arg_types(rels)
    # additionally restrict candidates to argument-type pairs seen in training relations
    pair_filters.append(RelArgTypesCandidateFilter(rel_arg_types))
    candidate_extractor = DefaultCandidateExtractionStrategy(
        DefaultPairExtractionStrategy(AndFilter(pair_filters)))

    rel_dict = create_rel_dict(rels)
    valid_ent_rel_types = collect_valid_rel_types(rels)

    entities_encoder_meta, entities_encoder_converters = get_categorical_meta_converters(
        _get_entities_encoder_features(props, entities_types))

    token_position_fe, tp_meta = generate_token_position_feature_extractor(props)
    # attention sees positional features relative to both entities of a pair
    attention_token_meta = tp_meta.namespaced('e1') + tp_meta.namespaced('e2')

    attention_rel_meta, attention_converters = get_categorical_meta_converters(
        _get_relation_level_features(props, rel_arg_types, entities_types, "attention"))
    attention_meta = AttentionFeaturesMeta(attention_token_meta, attention_rel_meta)

    classifier_meta, classifier_converters = get_categorical_meta_converters(
        _get_relation_level_features(props, rel_arg_types, entities_types))

    feature_extractor = NegativeSamplesFilteringFeatureExtractor(
        shared_feature_extractor, rel_dict, entities_encoder_converters,
        token_position_fe, attention_converters, classifier_converters,
        candidate_extractor, valid_ent_rel_types,
        negative_ratio=props.get("negative_samples_ratio", float("inf")))

    return feature_extractor, Metas(entities_encoder_meta, attention_meta, classifier_meta)
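
# --- usage sketch (not part of the original module) ---------------------------
# Hypothetical props for the relation-extraction factory, limited to the keys
# read above: pairs further apart than "max_candidate_distance" tokens are
# dropped, "filter_intersecting" removes overlapping entity pairs, and
# "negative_samples_ratio" is forwarded as negative_ratio (default inf,
# presumably meaning "keep all negative candidates").
def _example_build_rel_extractor(docs: Iterable[Document],
                                 shared_fe: SpansCommonFeatureExtractor):
    props = {
        "max_candidate_distance": 20,   # hypothetical token-distance cutoff
        "filter_intersecting": True,
        "negative_samples_ratio": 5.0,  # hypothetical downsampling ratio
    }
    return generate_feature_extractor(docs, props, shared_fe)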
def generate_token_feature_extractor(docs: Iterable[Document], props: dict, char_padding_size: int = 0):
    we_converters_preprocessors, we_meta = _init_we_features(docs, props)

    morph_features = props.get('morph_feats_list', DEFAULT_FEATS_LIST)
    word_meta, word_converters = get_categorical_meta_converters(
        _init_word_level_features(docs, props, morph_features))

    gazetteer_meta, gazetteer_fes = generate_gazetteers_feature_extractors(props)
    word_meta += gazetteer_meta

    vectors_keys = props.get('vectors_keys', [])
    if len(vectors_keys) != len(set(vectors_keys)):
        raise ValueError('"vectors_keys" should not contain duplicates')
    vectorized_features = _init_vectorized_features(docs, vectors_keys)

    if not vectorized_features and not we_converters_preprocessors:
        warn("Neither word embeddings nor vectorized features were specified")

    word_meta += BasicFeaturesMeta([], [], vectorized_features)

    char_level_features = _init_char_level_features(docs, props, char_padding_size)
    # We assume that char features are only embedded features
    char_features, _ = init_categorical_features(char_level_features)

    return TokenFeatureExtractor(
        we_converters_preprocessors, word_converters, char_level_features,
        vectors_keys, gazetteer_fes), \
        TokenFeaturesMeta(we_meta, word_meta, CharsFeaturesMeta(char_features))
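
# --- usage sketch (not part of the original module) ---------------------------
# Hypothetical props for the token-level extractor, restricted to the keys read
# above: "morph_feats_list" overrides DEFAULT_FEATS_LIST and "vectors_keys"
# names precomputed per-token vectors attached to the documents (duplicates are
# rejected). Word-embedding and char-level settings are resolved by
# _init_we_features / _init_char_level_features and are not shown.
def _example_build_token_extractor(docs: Iterable[Document]):
    props = {
        "morph_feats_list": ["Gender", "Number"],  # hypothetical morphological feats
        "vectors_keys": ["elmo"],                  # hypothetical vector name
    }
    return generate_token_feature_extractor(docs, props, char_padding_size=1)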
def generate_ne_feature_extractor(docs: Iterable[Document], props: dict):
    features = {}
    labelling_strategy = None

    # a negative ne_emb_size (or a missing key) disables NE features entirely
    if props.get("ne_emb_size", -1) >= 0:
        types = collect_entities_types(docs, extras=True)
        labelling_strategy = get_labelling_strategy(
            props.get("ne_labelling_strategy", "IO"))
        features['ne'] = {
            'converter': create_categorical_converter(
                labelling_strategy.get_possible_categories(types), has_oov=True)
        }
        # ne_emb_size == 0 keeps the feature without a configured embedding size
        if props["ne_emb_size"] != 0:
            features["ne"]['embedding_size'] = props["ne_emb_size"]

    meta, converters = get_categorical_meta_converters(features)
    return NEFeatureExtractor(converters, labelling_strategy), meta
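
# --- usage sketch (not part of the original module) ---------------------------
# Hypothetical props for NE features: any non-negative "ne_emb_size" enables the
# feature (0 leaves the embedding size unset), and "ne_labelling_strategy"
# selects the span-labelling scheme ("IO" is the default; other scheme names
# depend on what get_labelling_strategy supports).
def _example_build_ne_extractor(docs: Iterable[Document]):
    props = {"ne_emb_size": 16, "ne_labelling_strategy": "IO"}  # hypothetical size
    return generate_ne_feature_extractor(docs, props)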
def generate_gazetteers_feature_extractors(props: dict):
    features = {}
    gazetteer_feature_extractors = {}
    # binary feature: whether a token matches the gazetteer; shared by all gazetteers
    converter = create_categorical_converter({True, False}, zero_padding=True, has_oov=False)

    for index, config in enumerate(props.get('gazetteers', [])):
        gazetteer_name = f"gazetteer_{index}"
        features[gazetteer_name] = {'converter': converter}
        if config.get('emb_size', -1) > 0:
            features[gazetteer_name]['embedding_size'] = config['emb_size']

        # normalise gazetteer entries the same way tokens will be normalised at match time
        gazetteer = _read_gazetteer(config["path"])
        processor = StandardTokenProcessor.from_props(config)
        gazetteer = set(map(processor, gazetteer))

        gazetteer_feature_extractors[gazetteer_name] = GazetteerFeatureExtractor(
            gazetteer, processor, converter, config.get("lemmatize", False))

    meta, _ = get_categorical_meta_converters(features)
    return meta, gazetteer_feature_extractors
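
# --- usage sketch (not part of the original module) ---------------------------
# Hypothetical gazetteer config: each entry needs a "path" readable by
# _read_gazetteer, an optional positive "emb_size" to embed the binary match
# feature, and "lemmatize" to match on lemmas; any token-normalisation keys
# understood by StandardTokenProcessor.from_props may also appear (omitted here,
# since their names are defined elsewhere).
def _example_build_gazetteer_extractors():
    props = {
        "gazetteers": [
            {"path": "data/cities.txt", "emb_size": 8, "lemmatize": True},  # hypothetical entry
        ]
    }
    return generate_gazetteers_feature_extractors(props)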
def generate_token_position_feature_extractor(props):
    meta, converters = get_categorical_meta_converters(_get_features(props))
    return TokenPositionFeatureExtractor(converters), meta