def test_entities_with_nesting_collapse(self):
    # Collapses only the "Bacteria" type on the fixture document (self.doc)
    # and compares the whole resulting Document with a hand-written one.
    # In the expected data every collapsed span is a single "$Bacteria$"
    # token, while entities of other types keep their own (shifted) spans.
    tokens = [
        "Recurrence", "of", "$Bacteria$", "in", "Sardinia", ".",
        "From", "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10",
        "people", "in", "the", "Sardinian", "province", "of", "Cagliari",
        "had", "onset", "of", "bacteriologically", "confirmed",
        "$Bacteria$", ".",
    ]
    sentences = [Sentence(0, 6), Sentence(6, 30)]
    entities = [
        Entity("T1", 2, 3, "Habitat"),
        Entity("T2", 2, 3, "Bacteria"),
        Entity("T3", 2, 3, "Bacteria"),
        Entity("T4", 4, 5, "Geographical"),
        Entity("T5", 16, 17, "Habitat"),
        Entity("T6", 16, 23, "Habitat"),
        Entity("T7", 19, 21, "Geographical"),
        Entity("T8", 22, 23, "Geographical"),
        Entity("T9", 28, 29, "Bacteria"),
    ]
    paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    # relations are given as index pairs into the expected entity list
    relations = [
        Relation(entities[first], entities[second], "Lives_in")
        for first, second in ((0, 1), (8, 6))
    ]
    expected_doc = Document(
        "_", tokens, sentences, paragraphs, entities, relations)

    collapser = EntitiesCollapser({"Bacteria"})
    actual_doc = collapser.transform(self.doc)

    self.assertEqual(expected_doc, actual_doc)
def test_collapsement_of_same_spans(self):
    # Two pairs of entities share identical spans: (0, 2) carries ELON and
    # MUSK, (5, 6) carries COMP and ORG. Only ELON and COMP are requested
    # for collapsing; the expected document has "$ELON$" / "$COMP$" tokens
    # and all four entities remapped to the new single-token spans.
    input_doc = Document(
        "_",
        ["Elon", "Musk", "is", "CEO", "of", "Tesla", "."],
        [Sentence(0, 7)],
        [],
        [
            Entity("_", 0, 2, "ELON"),
            Entity("_", 0, 2, "MUSK"),
            Entity("_", 5, 6, "COMP"),
            Entity("_", 5, 6, "ORG"),
        ],
    )

    expected_doc = Document(
        "_",
        ["$ELON$", "is", "CEO", "of", "$COMP$", "."],
        [Sentence(0, 6)],
        [],
        [
            Entity("_", 0, 1, "ELON"),
            Entity("_", 0, 1, "MUSK"),
            Entity("_", 4, 5, "COMP"),
            Entity("_", 4, 5, "ORG"),
        ],
    )

    collapser = EntitiesCollapser({"ELON", "COMP"})
    actual_doc = collapser.transform(input_doc)

    self.assertEqual(expected_doc, actual_doc)
def test_inner_entities_collapse(self):
    # Collapses only the "Geographical" type on the fixture document
    # (self.doc). In the expected data each Geographical entity becomes a
    # single "$Geographical$" token, including the two that lie inside the
    # span of the wider Habitat entity T6.
    tokens = [
        "Recurrence", "of", "Pelecypod-associated", "cholera", "in",
        "$Geographical$", ".",
        "From", "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10",
        "people", "in", "the", "$Geographical$", "of", "$Geographical$",
        "had", "onset", "of", "bacteriologically", "confirmed", "cholera",
        ".",
    ]
    sentences = [Sentence(0, 7), Sentence(7, 30)]
    entities = [
        Entity("T1", 2, 3, "Habitat"),
        Entity("T2", 2, 4, "Bacteria"),
        Entity("T3", 3, 4, "Bacteria"),
        Entity("T4", 5, 6, "Geographical"),
        Entity("T5", 17, 18, "Habitat"),
        Entity("T6", 17, 23, "Habitat"),
        Entity("T7", 20, 21, "Geographical"),
        Entity("T8", 22, 23, "Geographical"),
        Entity("T9", 28, 29, "Bacteria"),
    ]
    paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    # relations are given as index pairs into the expected entity list
    relations = [
        Relation(entities[first], entities[second], "Lives_in")
        for first, second in ((0, 1), (8, 6))
    ]
    expected_doc = Document(
        "_", tokens, sentences, paragraphs, entities, relations)

    collapser = EntitiesCollapser({"Geographical"})
    actual_doc = collapser.transform(self.doc)

    self.assertEqual(expected_doc, actual_doc)
def test_ne_extras_collapse(self):
    # Attaches an "ne" extras span set to the fixture document, collapses
    # Habitat/Bacteria/Geographical entities and checks that the "ne"
    # spans were remapped to the expected positions. The labels
    # (left/same/include/intersect) name how each span relates to the
    # collapsed entities in this fixture.
    source_nes = SortedSpansSet([
        Entity("_", 0, 1, "left"),
        Entity("_", 2, 4, "same"),
        Entity("_", 3, 4, "include"),
        Entity("_", 5, 6, "same"),
        Entity("_", 15, 19, "intersect"),
        Entity("_", 17, 20, "include"),
        Entity("_", 22, 25, "intersect"),
    ])
    remapped_nes = SortedSpansSet([
        Entity("_", 0, 1, "left"),
        Entity("_", 2, 3, "same"),
        Entity("_", 2, 3, "include"),
        Entity("_", 4, 5, "same"),
        Entity("_", 14, 17, "intersect"),
        Entity("_", 16, 17, "include"),
        Entity("_", 16, 18, "intersect"),
    ])

    doc_with_nes = self.doc.with_additional_extras({"ne": source_nes})
    collapser = EntitiesCollapser({"Habitat", "Bacteria", "Geographical"})
    collapsed_doc = collapser.transform(doc_with_nes)

    self.assertDictEqual(collapsed_doc.extras, {"ne": remapped_nes})
def test_collapsing_with_ne(self):
    # The fixture's own entities are moved into the "ne" extras, and the
    # document entities are replaced by synthetic spans. The collapser is
    # created with True as its second positional argument (presumably the
    # collapse-with-ne switch -- confirm against EntitiesCollapser), and
    # the expected tokens carry "$Type$" placeholders for the "ne" spans.
    synthetic_entities = SortedSpansSet([
        Entity("_", 0, 1, "left"),
        Entity("_", 2, 4, "same"),
        Entity("_", 3, 4, "include"),
        Entity("_", 5, 6, "same"),
        Entity("_", 15, 19, "intersect"),
        Entity("_", 17, 20, "include"),
        Entity("_", 22, 25, "intersect"),
    ])
    input_doc = (
        self.doc
        .with_additional_extras({"ne": self.doc.entities})
        .without_relations()
        .without_entities()
        .with_entities(synthetic_entities)
    )

    tokens = [
        "Recurrence", "of", "$Bacteria$", "in", "$Geographical$", ".",
        "From", "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10",
        "$Habitat$", "had", "onset", "of", "bacteriologically",
        "confirmed", "$Bacteria$", ".",
    ]
    sentences = [Sentence(0, 6), Sentence(6, 24)]
    paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    collapsed_nes = SortedSpansSet([
        Entity("T1", 2, 3, "Habitat"),
        Entity("T2", 2, 3, "Bacteria"),
        Entity("T3", 2, 3, "Bacteria"),
        Entity("T4", 4, 5, "Geographical"),
        Entity("T5", 16, 17, "Habitat"),
        Entity("T6", 16, 17, "Habitat"),
        Entity("T7", 16, 17, "Geographical"),
        Entity("T8", 16, 17, "Geographical"),
        Entity("T9", 22, 23, "Bacteria"),
    ])
    remapped_entities = SortedSpansSet([
        Entity("_", 0, 1, "left"),
        Entity("_", 2, 3, "same"),
        Entity("_", 2, 3, "include"),
        Entity("_", 4, 5, "same"),
        Entity("_", 14, 17, "intersect"),
        Entity("_", 16, 17, "include"),
        Entity("_", 16, 18, "intersect"),
    ])
    expected_doc = Document(
        "_", tokens, sentences, paragraphs, remapped_entities,
        extras={"ne": collapsed_nes})

    collapser = EntitiesCollapser(
        {"Habitat", "Bacteria", "Geographical"}, True)
    actual_doc = collapser.transform(input_doc)

    self.assertEqual(expected_doc, actual_doc)
def __init__(self, props: dict):
    """Store ``props`` and create the feature computers and the collapser.

    :param props: configuration dict; the keys read here are
        ``'morph_feats_list'`` (falls back to ``DEFAULT_FEATS_LIST``) and
        ``"types_to_collapse"`` (falls back to an empty set).
    """
    super().__init__(props)
    self.props = props

    morph_feats = self.props.get('morph_feats_list', DEFAULT_FEATS_LIST)
    self._syntactic_fc = SyntacticFeatureComputer(morph_feats)

    # the syntactic computer is kept on its own and also used as the first
    # member of the composite computer
    entity_fc = EntityBasedFeatureComputer()
    self._feature_computer = CompositeFeatureComputer(
        (self._syntactic_fc, entity_fc))

    types_to_collapse = self.props.get("types_to_collapse", set())
    self._collapser = EntitiesCollapser(types_to_collapse)
def _collapser_from_props(props: dict, ne=False): if props: from derek.data.entities_collapser import EntitiesCollapser collapser = EntitiesCollapser.from_props({ **props, "collapse_with_ne": ne }) print(f"Using {collapser}") return [collapser] else: return []
def test_vectors_features_collapse(self):
    # Adds a "vectors" token feature (7 x [1, 2] followed by 24 x [3, 4])
    # to the fixture document, collapses Habitat/Bacteria/Geographical
    # entities and checks the resulting per-token vectors. In the expected
    # data the positions of collapsed tokens hold [0, 0].
    def vec(first, second):
        return np.array([first, second])

    def as_lists(arrays):
        return [array.tolist() for array in arrays]

    vectors = [vec(1, 2)] * 7 + [vec(3, 4)] * 24
    expected_vectors = (
        [vec(1, 2)] * 2
        + [vec(0, 0), vec(1, 2), vec(0, 0), vec(1, 2)]
        + [vec(3, 4)] * 10
        + [vec(0, 0)]
        + [vec(3, 4)] * 5
        + [vec(0, 0), vec(3, 4)]
    )
    expected_tf = {"vectors": expected_vectors}

    input_doc = self.doc.with_additional_token_features(
        {"vectors": vectors})
    collapser = EntitiesCollapser({"Habitat", "Bacteria", "Geographical"})
    actual_tf = collapser.transform(input_doc).token_features

    self.assertSetEqual(set(actual_tf.keys()), set(expected_tf.keys()))
    # arrays are compared through plain lists
    self.assertSequenceEqual(
        as_lists(actual_tf["vectors"]), as_lists(expected_tf["vectors"]))
def test_feats_features_collapse(self):
    # Adds a dict-valued "feats" token feature to the fixture document,
    # collapses Habitat/Bacteria/Geographical entities and checks the
    # resulting feature list. In the expected data the positions of
    # collapsed tokens hold empty dicts.
    source_feats = [
        {"test_feat": "true"}, {}, {"yet": "1"}, {"another": "false"}, {},
        {"test": "3"}, {"test": 4},
        {}, {"yet": "3"}, {}, {}, {}, {"another": "true"}, {}, {}, {},
        {"test": "4"},
        {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {},
        {"bacteria": "true"}, {"bacteria": "false"},
    ]
    collapsed_feats = [
        {"test_feat": "true"}, {}, {}, {}, {}, {"test": 4},
        {}, {"yet": "3"}, {}, {}, {}, {"another": "true"}, {}, {}, {},
        {"test": "4"},
        {}, {}, {}, {}, {}, {}, {},
        {"bacteria": "false"},
    ]

    input_doc = self.doc.with_additional_token_features(
        {"feats": source_feats})
    collapser = EntitiesCollapser({"Habitat", "Bacteria", "Geographical"})
    actual_tf = collapser.transform(input_doc).token_features

    self.assertDictEqual(actual_tf, {"feats": collapsed_feats})
def test_pos_features_collapse(self):
    # Adds a "pos" token feature to the fixture document, collapses
    # Habitat/Bacteria/Geographical entities and checks the resulting
    # tags. In the expected data the positions of collapsed tokens hold
    # the "$Type$" placeholder instead of a POS tag.
    source_pos = [
        "NNP", "IN", "NNP", "NN", "IN", "NNP", "DOT",
        "IN", "NNP", "OTHER", "OTHER", "NNP", "OTHER", "COMMA", "OTHER",
        "COMMA", "OTHER", "NNS", "IN", "DT", "JJ", "NN", "IN", "NNP",
        "VBD", "RB", "IN", "RB", "VBN", "NN", "DOT",
    ]
    collapsed_pos = [
        "NNP", "IN", "$Bacteria$", "IN", "$Geographical$", "DOT",
        "IN", "NNP", "OTHER", "OTHER", "NNP", "OTHER", "COMMA", "OTHER",
        "COMMA", "OTHER", "$Habitat$",
        "VBD", "RB", "IN", "RB", "VBN", "$Bacteria$", "DOT",
    ]

    input_doc = self.doc.with_additional_token_features(
        {"pos": source_pos})
    collapser = EntitiesCollapser({"Habitat", "Bacteria", "Geographical"})
    actual_tf = collapser.transform(input_doc).token_features

    self.assertDictEqual(actual_tf, {"pos": collapsed_pos})
def test_dt_features_collapse(self):
    # Adds dependency-tree token features ("dt_head_distances" and
    # "dt_labels") to the fixture document, collapses
    # Habitat/Bacteria/Geographical entities and checks both features.
    # In the expected data a collapsed token has head distance 0 and the
    # "$Type$" placeholder as its label.
    source_distances = [
        6, -1, 1, -2, -1, -1, 0,
        2, -1, 0, -1, 1, -2, 11, 10, 9, 1, 7, -1, 2, 1, -3, -1, -1,
        -12, -1, -1, 1, 1, -3, -6,
    ]
    source_labels = [
        "nsubj", "prep", "nn", "pobj", "prep", "pobj", "ROOT",
        "prep", "pobj", "ROOT", "dep", "nn", "dep", "punct", "dep",
        "punct", "amod", "nsubj", "prep", "det", "amod", "pobj", "prep",
        "pobj", "null", "advmod", "dep", "advmod", "amod", "pobj", "punct",
    ]

    collapsed_distances = [
        6, -1, 0, -1, 0, 0,
        2, -1, 0, -1, 1, -2, 11, 10, 9, 1, 0,
        -12, -1, -1, 1, 1, 0, -6,
    ]
    collapsed_labels = [
        "nsubj", "prep", "$Bacteria$", "prep", "$Geographical$", "ROOT",
        "prep", "pobj", "ROOT", "dep", "nn", "dep", "punct", "dep",
        "punct", "amod", "$Habitat$",
        "null", "advmod", "dep", "advmod", "amod", "$Bacteria$", "punct",
    ]
    expected_tf = {
        "dt_labels": collapsed_labels,
        "dt_head_distances": collapsed_distances,
    }

    input_doc = self.doc.with_additional_token_features({
        "dt_head_distances": source_distances,
        "dt_labels": source_labels,
    })
    collapser = EntitiesCollapser({"Habitat", "Bacteria", "Geographical"})
    actual_tf = collapser.transform(input_doc).token_features

    self.assertDictEqual(actual_tf, expected_tf)
def train(self, docs: Iterable[Document], unlabeled_docs: Iterable[Document] = None,
          early_stopping_callback: Callable[[NETClassifier, int], bool] = lambda c, e: False):
    """Build the "NET" task graph, extract samples from ``docs`` and train.

    Steps, in the order they run: pick the entity grouper and the bucketing
    function from ``self.props``; wrap ``docs`` into lazily evaluated
    grouping/collapsing/feature pipelines; generate the feature extractor;
    build the graph; initialise variables in ``self._session``; materialise
    the samples; create the saver, classifier and batcher; call
    ``train_for_samples``.

    :param docs: documents to train on
    :param unlabeled_docs: not referenced anywhere in this method body
    :param early_stopping_callback: forwarded unchanged to ``TaskTrainMeta``;
        the default callback always returns ``False``
    """
    feature_computer = SyntacticFeatureComputer(
        self.props.get('morph_feats_list', DEFAULT_FEATS_LIST))

    # The grouping strategy also decides how samples are bucketed for batching.
    if self.props.get("unify_similar_entities_types", False):
        grouper = chain_similar_entities
        # bucket 1 for samples whose "chain_len" equals 1, bucket 0 otherwise
        get_bucket_for_sample = lambda s: int(s["chain_len"] == 1)
    else:
        grouper = chain_individual_entities
        # bucket index = first "seq_len" value integer-divided by the
        # configured bucket length (looked up on every call)
        get_bucket_for_sample = lambda s: s["seq_len"][0] // self.props[
            "batcher"]["bucket_length"]

    grouper_collapser = _GrouperCollapser(
        CoreferenceChainGrouper(grouper),
        EntitiesCollapser(self.props.get("types_to_collapse", set()),
                          collapse_with_ne=True))

    # Each FuncIterable wraps a zero-argument lambda that builds a fresh
    # `map` pipeline (presumably so the same data can be iterated more than
    # once -- confirm against FuncIterable). Items 0 and 1 of the
    # prepare_doc_with_collapsing result are used as (collapsed doc, groups).
    docs_groups = FuncIterable(lambda: map(
        itemgetter(0, 1),
        map(grouper_collapser.prepare_doc_with_collapsing, docs)))
    collapsed_docs = FuncIterable(lambda: map(itemgetter(0), docs_groups))
    precomputed_docs = FuncIterable(lambda: map(
        feature_computer.create_features_for_doc, collapsed_docs))
    groups = FuncIterable(lambda: map(itemgetter(1), docs_groups))

    char_padding_size = get_char_padding_size(self.props)
    feature_extractor, metas, token_meta = generate_feature_extractor(
        precomputed_docs, self.props, char_padding_size)
    feature_extractor = GroupingFeatureExtractor(
        feature_extractor, group_level_features=["labels_mask"])

    # reuse because this task is kinda unary rel-ext
    task_graph_meta = NETTaskGraphMeta("NET", self.props, metas,
                                       feature_extractor.get_labels_size(), True)
    # we have only one graph
    graph, = build_graphs_with_shared_encoder(
        self.props, token_meta, [build_task_graph_meta(task_graph_meta)], rank=3)

    # The initializer op is created and run only after the graph is built.
    init = tf.global_variables_initializer()
    self._session.run(init)

    # precomputed_docs is consumed a second time here (the first consumer
    # was generate_feature_extractor above).
    samples = list(
        feature_extractor.extract_features_from_docs(
            precomputed_docs, groups))
    saver = tf.train.Saver(save_relative_paths=True)

    classifier = _Classifier(graph, feature_extractor, feature_computer,
                             self._session, saver, grouper_collapser)

    batcher_factory = get_batcher_from_props(
        samples, self.props["batcher"],
        feature_extractor.get_padding_value_and_rank, True, True,
        get_bucket_for_sample)

    train_meta = TaskTrainMeta(
        "NET", graph, batcher_factory,
        {
            "learning_rate": get_decayed_lr(self.props["learning_rate"],
                                            self.props.get("lr_decay", 0)),
            "dropout_rate": get_const_controller(self.props.get("dropout", 1.0))
        },
        classifier, early_stopping_callback)

    train_for_samples(self._session, self.props["epoch"], [train_meta])