def _get_classifier_features(props, docs):
    """Build converter/embedding configs for the classifier-level features.

    Args:
        props: configuration dict; ``*_size`` keys enable features and
            ``max_*`` keys bound the integer converters.
        docs: corpus used to collect the observed entity/NE type sets.

    Returns:
        Merged feature config as produced by ``duplicate_features_config``
        (dual features are presumably replicated per pair argument —
        TODO confirm against ``duplicate_features_config``).
    """
    classifier_features = {}
    dual_features = {}
    classifier_agreement_size = props.get('classifier_agreement_size', -1)
    if classifier_agreement_size >= 0:
        agreement_types = props.get('agreement_types', [])
        for agreement_type in agreement_types:
            # a fresh 3-way converter per agreement feature
            converter = create_categorical_converter(
                {"agreement", "disagreement", "unknown"})
            classifier_features.update(
                create_feature(agreement_type + "_agreement", props, converter,
                               'classifier_agreement'))
    classifier_features.update(
        create_feature(
            'mention_distance', props,
            create_unsigned_integers_converter(props["max_mention_distance"])))
    classifier_features.update(
        create_feature(
            'mention_interrelation', props,
            create_categorical_converter(
                {"CONTAINS", "CONTAINED", "INTERSECTS", "SEPARATED"})))
    classifier_features.update(
        create_feature(
            'classifier_entity_distance', props,
            create_unsigned_integers_converter(props["max_entity_distance"])))
    classifier_features.update(
        create_feature(
            'entities_token_distance_in_classifier', props,
            create_unsigned_integers_converter(
                props["max_token_entities_distance"])))
    classifier_features.update(
        create_feature(
            'entities_sent_distance_in_classifier', props,
            create_unsigned_integers_converter(
                props["max_sent_entities_distance"])))
    dual_features.update(
        create_feature(
            'entities_types_in_classifier', props,
            create_categorical_converter(collect_entities_types(docs))))
    dual_features.update(
        create_feature(
            'head_ne_types', props,
            # NOTE(review): union('O') adds the single character 'O' because
            # union iterates its argument; this works only while the tag is
            # one char — union({'O'}) would be more robust.
            create_categorical_converter(collect_entities_types(
                docs, extras=True).union('O'),
                                         has_oov=True)))
    classifier_features.update(_get_binary_features(props))
    return duplicate_features_config(dual_features, classifier_features)
def _init_we_features(docs: Iterable[Document], props: dict):
    """Initialise word-embedding features from pretrained models and/or an
    internal (randomly initialised) vocabulary embedding.

    Returns:
        Tuple of (feature name -> (token converter, preprocessor or None),
        ``WordEmbeddingsMeta`` describing the precomputed vector matrices).
    """
    we_converters_preprocessors = {}
    precomputed_features = []
    i = 0
    for model_config in props.get('models', []):
        reader_type = model_config.get("type", "w2v")
        ignore_errors = model_config.get("ignore_utf_errors", False)
        # reader class is looked up by configured type; the errors flag is
        # presumably forwarded to the underlying file decoder — confirm in
        # the reader implementations
        reader = embedding_readers[reader_type](
            errors='ignore' if ignore_errors else 'strict')
        logger.info(f"Loading {reader_type} model...")
        name = f'words_{i}'
        we_model = reader.read(model_config["path"])
        trainable = model_config.get("trainable", False)
        preprocessor = StandardTokenProcessor.from_props(model_config)
        tokens_set = extract_tokens(docs, preprocessor, we_model, trainable)
        converter = create_categorical_converter(tokens_set, has_oov=True)
        vectors = init_vectors(converter, we_model.vector_size, we_model,
                               trainable)
        we_converters_preprocessors[name] = (converter, preprocessor)
        precomputed_features.append({
            'name': name,
            'vectors': vectors,
            'trainable': trainable
        })
        logger.info("Initialised embeddings ({}, {})".format(
            vectors.shape[0], vectors.shape[1]))
        i += 1
    if props.get("internal_emb_size", 0) != 0:
        # extra trainable embedding over the raw corpus vocabulary:
        # no preprocessing (converter paired with None) and no pretrained
        # vectors — init_vectors is called without a model
        name = f'words_{i}'
        doc_tokens = extract_tokens(docs, trainable=True)
        converter = create_categorical_converter(doc_tokens, has_oov=True)
        vectors = init_vectors(converter, props["internal_emb_size"],
                               trainable=True)
        we_converters_preprocessors[name] = (converter, None)
        precomputed_features.append({
            'name': name,
            'vectors': vectors,
            'trainable': True
        })
    return we_converters_preprocessors, WordEmbeddingsMeta(
        precomputed_features)
def _init_we_features(docs: Iterable[Document], props: dict):
    """Initialise word-embedding features from word2vec models plus an
    optional internal trainable embedding.

    NOTE(review): this has the same name as another ``_init_we_features``
    variant in this file dump; if both live in one module the later
    definition silently wins — verify they belong to separate modules.
    """
    we_converters_preprocessors = {}
    precomputed_features = []
    i = 0
    for model in props.get('models', []):
        logger.info("Loading w2v model...")
        name = f'words_{i}'
        # gensim-style loader; "binary" defaults to True here
        we_model = KeyedVectors.load_word2vec_format(model["path"],
                                                     binary=model.get(
                                                         "binary", True),
                                                     datatype=float)
        trainable = model.get("trainable", False)
        preprocessor = StandardTokenProcessor.from_props(model)
        tokens_set = extract_tokens(docs, preprocessor, we_model, trainable)
        converter = create_categorical_converter(tokens_set, has_oov=True)
        vectors = init_vectors(converter, we_model.vector_size, we_model,
                               trainable)
        we_converters_preprocessors[name] = (converter, preprocessor)
        precomputed_features.append({
            'name': name,
            'vectors': vectors,
            'trainable': trainable
        })
        logger.info("Initialised embeddings ({}, {})".format(
            vectors.shape[0], vectors.shape[1]))
        i += 1
    if props.get("internal_emb_size", 0) != 0:
        # extra trainable embedding over the corpus vocabulary, without a
        # preprocessor (None) or pretrained vectors
        name = f'words_{i}'
        doc_tokens = extract_tokens(docs, trainable=True)
        converter = create_categorical_converter(doc_tokens, has_oov=True)
        vectors = init_vectors(converter, props["internal_emb_size"],
                               trainable=True)
        we_converters_preprocessors[name] = (converter, None)
        precomputed_features.append({
            'name': name,
            'vectors': vectors,
            'trainable': True
        })
    return we_converters_preprocessors, WordEmbeddingsMeta(
        precomputed_features)
def _get_entities_encoder_features(props, entities_types):
    """Build encoder feature configs for entity types and entity depths.

    A feature is enabled when its ``*_emb_size`` prop is >= 0; a nonzero
    size additionally requests an embedding of that size.
    """
    oov_marker = None
    all_types = entities_types.union({oov_marker})
    features = {}

    types_size = props.get("entities_types_emb_size", -1)
    if types_size >= 0:
        types_feature = {
            'converter':
            create_categorical_converter(all_types,
                                         has_oov=True,
                                         oov_object=oov_marker)
        }
        if types_size != 0:
            types_feature['embedding_size'] = types_size
        features['entities_types'] = types_feature

    depth_size = props.get("entities_depth_emb_size", -1)
    if depth_size >= 0:
        depth_feature = {
            'converter':
            create_unsigned_integers_converter(props["max_entities_depth"])
        }
        if depth_size != 0:
            depth_feature['embedding_size'] = depth_size
        features['entities_depths'] = depth_feature

    return features
def create_rel_dict(rels: list):
    """Create a categorical converter over every relation type seen in
    ``rels``, plus ``None`` as the 'no relation' category."""
    relation_types = set()
    relation_types.add(None)
    for doc_rels in rels:
        for rel in doc_rels:
            relation_types.add(rel.type)
    return create_categorical_converter(relation_types, zero_padding=False)
def generate_feature_extractor(docs: Iterable[Document],
                               props: dict,
                               char_padding_size: int = 0):
    """Build the NET (named-entity typing) feature extractor and its metas.

    Returns:
        (NETFeatureExtractor, Metas, token features meta) — Metas carries an
        empty basic meta since this task has no encoder features.
    """
    token_feature_extractor, token_features_meta = generate_token_feature_extractor(
        docs, props, char_padding_size)
    # restricted NE types are excluded from the NE-type -> label mapping
    types_mapping = _get_ne_type_to_label_mapping(
        docs, set(props.get("restricted_ne_types", set())))
    valid_ne_types = types_mapping.keys()
    valid_ent_types = set(chain.from_iterable(types_mapping.values()))
    types_converter = create_categorical_converter(valid_ent_types,
                                                   zero_padding=False)
    token_position_fe, tp_meta = generate_token_position_feature_extractor(
        props)
    attention_ent_meta, attention_converters = get_categorical_meta_converters(
        _get_features(props, valid_ne_types, "attention"))
    attention_meta = AttentionFeaturesMeta(tp_meta, attention_ent_meta)
    classifier_meta, classifier_converters = get_categorical_meta_converters(
        _get_features(props, valid_ne_types))
    feature_extractor = NETFeatureExtractor(token_feature_extractor,
                                            token_position_fe,
                                            attention_converters,
                                            classifier_converters,
                                            types_converter, types_mapping)
    # no encoder meta in this task
    metas = Metas(get_empty_basic_meta(), attention_meta, classifier_meta)
    return feature_extractor, metas, token_features_meta
def generate_feature_extractor(
        docs: Iterable[Document], props: dict,
        shared_feature_extractor: SpansCommonFeatureExtractor):
    """Build the parser (arc classification) feature extractor and its metas.

    Args:
        docs: training corpus, used to enumerate possible arc types.
        props: configuration dict.
        shared_feature_extractor: span-level extractor shared across tasks.
    """
    strategy = _get_sampling_strategy(props)
    arc_converter = create_categorical_converter(
        strategy.get_possible_arc_types(docs), zero_padding=False)
    token_position_fe, tp_meta = generate_token_position_feature_extractor(
        props)
    # the same token-position meta is included twice, namespaced per arc end
    attention_token_meta = tp_meta.namespaced('head') + tp_meta.namespaced(
        'dep')
    attention_arc_meta, attention_converters = get_categorical_meta_converters(
        _get_arc_level_features(props, "attention"))
    attention_meta = AttentionFeaturesMeta(attention_token_meta,
                                           attention_arc_meta)
    classifier_meta, classifier_converters = get_categorical_meta_converters(
        _get_arc_level_features(props))
    feature_extractor = ParserFeatureExtractor(shared_feature_extractor,
                                               arc_converter,
                                               token_position_fe,
                                               attention_converters,
                                               classifier_converters, strategy)
    return feature_extractor, Metas(get_empty_basic_meta(), attention_meta,
                                    classifier_meta)
def generate_feature_extractor(docs: Iterable[Document], props: dict,
                               char_padding_size: int = 0):
    """Build the NER feature extractor and the token features meta.

    When unquote augmentation is configured, the factories below see both
    the original and the augmented docs so that every feature value that can
    occur at train time is registered in the converters.
    """
    types_to_unquote = props.get("types_to_unquote", [])
    unquote_prob = props.get("prob_to_unquote", 0.0)
    if types_to_unquote and unquote_prob:
        # concat augmented docs with original ones to be sure all possible
        # features are processed by FE factories
        augmentor = EntitiesUnquoteAugmentor(1.0, types_to_unquote)
        prev_docs = docs
        # FuncIterable re-evaluates the lambda on each iteration, so the
        # chained iterable can be consumed more than once
        docs = FuncIterable(
            lambda: chain(prev_docs, map(augmentor.transform, prev_docs)))
    token_feature_extractor, token_features_meta = generate_token_feature_extractor(
        docs, props, char_padding_size)
    ne_feature_extractor, ne_meta = generate_ne_feature_extractor(docs, props)
    token_features_meta.basic_meta += ne_meta
    ent_types = collect_entities_types(docs)
    labelling_strategy = get_labelling_strategy(
        props.get("labelling_strategy", "BIO"))
    labels_converter = create_categorical_converter(
        labelling_strategy.get_possible_categories(ent_types),
        zero_padding=False)
    # the runtime augmentor uses the configured probability, unlike the
    # always-on one used above for vocabulary collection
    prob_augmentor = EntitiesUnquoteAugmentor(unquote_prob, types_to_unquote)
    feature_extractor = NERFeatureExtractor(token_feature_extractor,
                                            ne_feature_extractor,
                                            labelling_strategy,
                                            labels_converter, prob_augmentor)
    return feature_extractor, token_features_meta
def _init_char_level_features(docs: Iterable[Document], props: dict,
                              char_padding_size: int):
    """Configure the char-embedding feature when ``char_embedding_size`` > 0;
    otherwise return an empty config."""
    if props.get("char_embedding_size", -1) <= 0:
        return {}
    char_vocab = collect_chars_set(docs)
    return {
        'chars': {
            'converter': create_categorical_converter(char_vocab,
                                                      has_oov=True),
            'embedding_size': props["char_embedding_size"],
            'padding_size': char_padding_size,
        }
    }
def _get_encoder_features(props, docs):
    """Build encoder-level feature configs: entity types/NE tags plus
    optional per-type speech and morphological-feature indicators."""
    encoder_features = {}
    encoder_features.update(
        create_feature(
            'encoder_entity_types', props,
            create_categorical_converter(collect_entities_types(docs),
                                         zero_padding=True)))
    encoder_features.update(
        create_feature(
            'encoder_entity_ne', props,
            # NOTE(review): union('O') works only because 'O' is a single
            # character (union iterates its argument); union({'O'}) would be
            # safer. 'O' doubles as the OOV object here.
            create_categorical_converter(collect_entities_types(
                docs, extras=True).union('O'),
                                         zero_padding=True,
                                         has_oov=True,
                                         oov_object='O')))
    speech_types = props.get('speech_types', [])
    speech_size = props.get('speech_size', -1)
    if speech_size >= 0:
        # one boolean feature per configured speech type
        for speech_type in speech_types:
            encoder_features.update(
                create_feature(
                    'encoder_' + speech_type, props,
                    create_categorical_converter({True, False},
                                                 zero_padding=True),
                    'speech'))
    feats_types = props.get('morph_feats_list', [])
    feats_size = props.get('morph_feats_size', -1)
    if feats_size >= 0:
        # one categorical feature per configured morphological feature,
        # with the label set collected from the corpus
        for feat_name in feats_types:
            encoder_features.update(
                create_feature(
                    'encoder_' + feat_name, props,
                    create_categorical_converter(collect_feature_labels(
                        docs, feat_name),
                                                 zero_padding=True),
                    'morph_feats'))
    return encoder_features
def _get_features(props):
    """Build token-position feature configs (positions relative to an entity).

    Each feature is controlled by a ``"<name>_size"`` prop: absent or < 0
    disables it, 0 enables it without an embedding, > 0 additionally
    requests a trainable embedding of that size.

    Returns:
        dict mapping feature name to its {"converter", ["embedding_size"]}
        config.
    """
    features = {}

    def _add(name, size, converter_factory):
        # Register feature `name` when enabled; the factory is only invoked
        # for enabled features so disabled ones never touch their props keys
        # (mirrors the original lazy behavior).
        if size < 0:
            return
        feature = {"converter": converter_factory()}
        if size > 0:
            feature["embedding_size"] = size
        features[name] = feature

    _add("token_position", props.get("token_position_size", -1),
         lambda: create_signed_integers_converter(props["max_word_distance"]))
    _add(
        "token_log_position", props.get("token_log_position_size", -1),
        lambda: create_signed_log_integers_converter(props["max_word_distance"]
                                                     ))
    _add("sent_position", props.get("sent_position_size", -1),
         lambda: create_signed_integers_converter(props["max_sent_distance"]))
    _add("at_root_dt_path", props.get("at_root_dt_path_size", -1),
         lambda: create_categorical_converter({False, True}))
    _add(
        "root_dt_path_position", props.get("root_dt_path_position_size", -1),
        lambda: create_unsigned_integers_converter(props["max_dt_depth"],
                                                   additional_labels={False}))

    return features
def _get_features(props, ne_types, name_postfix="classifier"):
    """Build the NE-type feature config for the given scope
    (classifier by default, or e.g. attention)."""
    feature_name = "ne_type_in_{}".format(name_postfix)
    size = props.get(feature_name + "_size", -1)
    if size < 0:
        return {}
    config = {"converter": create_categorical_converter(ne_types)}
    if size > 0:
        config["embedding_size"] = size
    return {feature_name: config}
def test_custom_oov_in_set(self):
    """When oov_object is an existing category, unknown keys map to its index."""
    source = {"$HABITAT$", "$BACTERIA$", "hello", "laboratory"}
    conv = create_categorical_converter(source,
                                        zero_padding=False,
                                        has_oov=True,
                                        oov_object="hello")
    expected = {
        "$BACTERIA$": 0,
        "$HABITAT$": 1,
        "hello": 2,
        "laboratory": 3,
        # unknown keys of any type collapse onto the "hello" index
        '1': 2,
        1: 2,
        'privet': 2,
    }
    actual = {key: conv[key] for key in expected}
    self.assertEqual(expected, actual)
def test_categorical_converter_1(self):
    """Without padding/OOV the converter indexes categories 0..n-1 and
    raises KeyError for anything else."""
    source = {"$HABITAT$", "$BACTERIA$", "hello", "laboratory"}
    conv = create_categorical_converter(source,
                                        zero_padding=False,
                                        has_oov=False)
    expected = {
        "$BACTERIA$": 0,
        "$HABITAT$": 1,
        "hello": 2,
        "laboratory": 3
    }
    actual = {key: conv[key] for key in expected}
    self.assertEqual(expected, actual)
    for unknown in ("$OOV$", "$PADDING$", 0, 1, "privet"):
        self.assertRaises(KeyError, getitem, conv, unknown)
def test_categorical_converter_4(self):
    """With OOV enabled and no padding, unknown keys collapse onto the
    trailing $OOV$ index while $PADDING$ stays unknown."""
    source = {"$HABITAT$", "$BACTERIA$", "hello", "laboratory"}
    conv = create_categorical_converter(source,
                                        zero_padding=False,
                                        has_oov=True)
    expected = {
        "$BACTERIA$": 0,
        "$HABITAT$": 1,
        "hello": 2,
        "laboratory": 3,
        "$OOV$": 4,
        '1': 4,
        1: 4,
        'privet': 4,
    }
    actual = {key: conv[key] for key in expected}
    self.assertEqual(expected, actual)
    self.assertRaises(KeyError, getitem, conv, "$PADDING$")
def test_categorical_converter_3(self):
    """With padding and OOV, $PADDING$ takes index 0, categories follow,
    and $OOV$ absorbs every unknown key at the last index."""
    source = {"$HABITAT$", "$BACTERIA$", "hello", "laboratory"}
    conv = create_categorical_converter(source,
                                        zero_padding=True,
                                        has_oov=True)
    expected = {
        "$PADDING$": 0,
        "$BACTERIA$": 1,
        "$HABITAT$": 2,
        "hello": 3,
        "laboratory": 4,
        "$OOV$": 5,
        '1': 5,
        1: 5,
        'privet': 5,
    }
    actual = {key: conv[key] for key in expected}
    self.assertEqual(expected, actual)
def generate_ne_feature_extractor(docs: Iterable[Document], props: dict):
    """Create the NE feature extractor and its meta.

    The 'ne' feature is enabled when ``ne_emb_size`` >= 0; a nonzero size
    additionally requests an embedding. When disabled, the extractor is
    built with ``labelling_strategy`` left as None.
    """
    features = {}
    labelling_strategy = None
    ne_size = props.get("ne_emb_size", -1)
    if ne_size >= 0:
        entity_types = collect_entities_types(docs, extras=True)
        labelling_strategy = get_labelling_strategy(
            props.get("ne_labelling_strategy", "IO"))
        ne_config = {
            'converter':
            create_categorical_converter(
                labelling_strategy.get_possible_categories(entity_types),
                has_oov=True)
        }
        if ne_size != 0:
            ne_config['embedding_size'] = ne_size
        features['ne'] = ne_config
    meta, converters = get_categorical_meta_converters(features)
    return NEFeatureExtractor(converters, labelling_strategy), meta
def generate_gazetteers_feature_extractors(props: dict):
    """Build one boolean membership feature extractor per configured
    gazetteer file.

    Returns:
        (meta for the gazetteer features, dict of name -> extractor).
    """
    features = {}
    gazetteer_feature_extractors = {}
    # a single {True, False} converter instance is shared by all gazetteers
    converter = create_categorical_converter({True, False},
                                             zero_padding=True,
                                             has_oov=False)
    for index, config in enumerate(props.get('gazetteers', [])):
        gazetteer_name = f"gazetteer_{index}"
        features[gazetteer_name] = {'converter': converter}
        if config.get('emb_size', -1) > 0:
            features[gazetteer_name]['embedding_size'] = config['emb_size']
        # entries are normalised with the same processor that will be
        # applied to document tokens at extraction time
        gazetteer = _read_gazetteer(config["path"])
        processor = StandardTokenProcessor.from_props(config)
        gazetteer = set(map(processor, gazetteer))
        gazetteer_feature_extractors[
            gazetteer_name] = GazetteerFeatureExtractor(
                gazetteer, processor, converter,
                config.get("lemmatize", False))
    meta, _ = get_categorical_meta_converters(features)
    return meta, gazetteer_feature_extractors
def _get_relation_level_features(props,
                                 rel_arg_types,
                                 entities_types,
                                 name_postfix="classifier"):
    """Build relation-level feature configs for the given scope.

    Each feature is controlled by ``"<feature>_in_<name_postfix>_size"``:
    absent or < 0 disables it, 0 enables it without an embedding, > 0
    additionally requests an embedding of that size.

    Returns:
        Result of ``duplicate_features_config`` over the dual (per-argument)
        and single (pair-level) feature configs.
    """
    dual_features = {}
    single_features = {}

    def _add(features, name, converter_factory):
        # Register feature `name` into `features` when enabled; the
        # converter factory is only called for enabled features, so disabled
        # features never read their bounding props keys (as before).
        size = props.get(name + "_size", -1)
        if size < 0:
            return
        feature = {"converter": converter_factory()}
        if size != 0:
            feature["embedding_size"] = size
        features[name] = feature

    _add(single_features, "rel_args_in_{}".format(name_postfix),
         lambda: create_categorical_converter(rel_arg_types))
    _add(dual_features, "entities_types_in_{}".format(name_postfix),
         lambda: create_categorical_converter(entities_types))
    _add(
        single_features, "entities_token_distance_in_{}".format(name_postfix),
        lambda: create_unsigned_integers_converter(props[
            "max_token_entities_distance"]))
    _add(
        single_features,
        "entities_token_log_distance_in_{}".format(name_postfix),
        lambda: create_unsigned_log_integers_converter(props[
            "max_token_entities_distance"]))
    _add(
        single_features, "entities_sent_distance_in_{}".format(name_postfix),
        lambda: create_unsigned_integers_converter(props[
            "max_sent_entities_distance"]))
    # direction of the relation between the two argument entities
    _add(
        single_features, "rel_dir_in_{}".format(name_postfix),
        lambda: create_categorical_converter(
            {"e1_e2", "e2_e1", "e1_in_e2", "e2_in_e1"}))

    return duplicate_features_config(dual_features, single_features)
def _init_word_level_features(docs: Iterable[Document], props: dict,
                              morph_features: List[str]):
    """Build word-level feature configs: POS, borders, dependency-tree
    labels/distances/depths/deltas/breakups and morphological features.

    Each feature is gated by its ``*_size``/``*_emb_size`` prop (>= 0
    enables, nonzero requests an embedding of that size).
    """
    features = {}
    if props.get("pos_emb_size", -1) >= 0:
        pos_types = collect_feature_labels(docs, "pos")
        features['pos'] = {
            'converter': create_categorical_converter(pos_types, has_oov=True)
        }
        if props["pos_emb_size"] != 0:
            features['pos']['embedding_size'] = props["pos_emb_size"]
    if props.get("borders_size", -1) >= 0:
        features['borders'] = {
            'converter': create_categorical_converter({'start', 'in', 'end'})
        }
        if props["borders_size"] != 0:
            features["borders"]['embedding_size'] = props["borders_size"]
    if props.get("dt_label_emb_size", -1) >= 0:
        dt_label_types = collect_feature_labels(docs, "dt_labels")
        features['dt_labels'] = {
            'converter':
            create_categorical_converter(dt_label_types, has_oov=True)
        }
        if props["dt_label_emb_size"] != 0:
            features["dt_labels"]['embedding_size'] = props[
                "dt_label_emb_size"]
    if props.get("dt_distance_emb_size", -1) >= 0:
        features['dt_head_distances'] = {
            'converter':
            create_signed_integers_converter(props["max_dt_distance"])
        }
        if props["dt_distance_emb_size"] != 0:
            features["dt_head_distances"]['embedding_size'] = props[
                "dt_distance_emb_size"]
    if props.get("dt_depth_emb_size", -1) >= 0:
        features['dt_depths'] = {
            'converter':
            create_unsigned_integers_converter(props["max_dt_depth"])
        }
        if props["dt_depth_emb_size"] != 0:
            features["dt_depths"]['embedding_size'] = props[
                "dt_depth_emb_size"]
    max_dt_delta = props.get("max_dt_delta", 0)
    if max_dt_delta:
        # NOTE(review): source formatting was lost; both direction loops are
        # reconstructed inside this `if` — confirm the breakups loop was not
        # at function level in the original.
        for direction in [Direction.FORWARD, Direction.BACKWARD]:
            key = "dt_deltas_" + direction.value
            emb_size = props.get(key + "_emb_size", -1)
            if emb_size >= 0:
                features[key] = {
                    'converter':
                    create_signed_integers_converter(
                        max_dt_delta, additional_labels={"$START$"})
                }
                if emb_size != 0:
                    features[key]['embedding_size'] = emb_size
        for direction in [Direction.FORWARD, Direction.BACKWARD]:
            key = "dt_breakups_" + direction.value
            emb_size = props.get(key + "_emb_size", -1)
            if emb_size >= 0:
                features[key] = {
                    'converter':
                    create_categorical_converter(
                        collect_feature_labels(docs, key))
                }
                if emb_size != 0:
                    features[key]['embedding_size'] = emb_size
    if props.get('morph_feats_emb_size', -1) >= 0:
        for feat in morph_features:
            feat_types = collect_feature_labels(docs, feat)
            # skip features absent from the corpus
            if not feat_types:
                continue
            features[feat] = {
                'converter':
                create_categorical_converter(feat_types, has_oov=True)
            }
            if props["morph_feats_emb_size"] != 0:
                features[feat]['embedding_size'] = props[
                    "morph_feats_emb_size"]
    return features
def _create_binary_feature(size):
    """Return a boolean feature config; a nonzero ``size`` attaches an
    embedding of that size."""
    config = {"converter": create_categorical_converter({True, False})}
    if size != 0:
        config["embedding_size"] = size
    return config