def test(self):
    """Run a type-restricted CombineEverything stage over five singletons.

    The input holds one LOC, two GPE, one ORG and one PER chain. After the
    update the test expects three chains whose sizes are 1, 1 and 3
    (presumably the GPE/LOC chains merged, the others left alone).
    """
    doc = unittest.mock.Mock()
    stage = TypeSpecificStage(
        CombineEverything(), EntityType.GPE, EntityType.LOC)
    doc.mention_chains = [
        MentionChain([
            Mention('New York', '_DF_doc34', (141, 149), (22, 23),
                    EntityType.LOC, 'Men1')
        ]),
        MentionChain([
            Mention('New York City', '_DF_doc34', (146, 154), (24, 25),
                    EntityType.GPE, 'Men2')
        ]),
        MentionChain([
            Mention('NY Yankees', '_DF_doc34', (173, 181), (36, 37),
                    EntityType.ORG, 'Men3')
        ]),
        MentionChain([
            Mention('Ed Koch', '_DF_doc34', (186, 194), (51, 52),
                    EntityType.PER, 'Men4')
        ]),
        MentionChain([
            Mention('NYC', '_DF_doc34', (237, 245), (71, 72),
                    EntityType.GPE, 'Men5')
        ]),
    ]

    stage.update(doc)

    self.assertEqual(3, len(doc.mention_chains))
    # sorted() consumes any iterable, so the map object is passed directly;
    # the expected literal is already in ascending order.
    self.assertEqual([1, 1, 3], sorted(map(len, doc.mention_chains)))
def test(self):
    """Write a two-chain WL document and check the first emitted line.

    The first chain is linked to entity '67'; the second chain has no
    entity. Only the first line of the writer output is verified.
    """
    henry = Mention(
        'Henry', '_WL_doc34', (123, 128), (17, 17), EntityType.PER, 'Men1')
    ed_smith = Mention(
        'Ed Smith', '_WL_doc34', (141, 149), (22, 23), EntityType.PER, 'Men2')
    ed = Mention(
        'Ed', '_WL_doc34', (197, 199), (44, 44), EntityType.PER, 'Men3')

    linked_chain = MentionChain([henry])
    unlinked_chain = MentionChain([ed_smith, ed])
    linked_chain.entity = Entity(
        '67', EntityType.PER, 'Henry', EntityOrigin.WLL)
    unlinked_chain.entity = None

    all_mentions = linked_chain.mentions + unlinked_chain.mentions
    doc = Document(all_mentions, DocType.WL, Lang.ENG, [], [])
    doc.mention_chains = [linked_chain, unlinked_chain]

    out = io.StringIO()
    OutputWriter(out, 'test', 0.75).write(doc)
    # Rewind so the freshly written content can be read back.
    out.seek(0)
    written = out.readlines()

    expected_first = "test\tMen1\tHenry\t_WL_doc34:123-128\t67\tPER\tNAM\t0.75"
    self.assertEqual(expected_first, written[0].strip())
def test_update1(self):
    """Score a two-chain prediction against one gold cluster with MUC.

    First example from table 1 in https://www.aclweb.org/anthology/M95-1005
    The gold data puts all four mentions in 'NIL1'; the system output splits
    them into two chains of two. Expected counts: recall 2/3, precision 2/2.
    """
    spans = [(0, 1), (2, 5), (8, 11), (14, 17)]
    gt = {
        'doc1': {
            span: Link(EntityType.PER, LinkType.LINK, 'NIL1', None)
            for span in spans
        },
    }

    first_chain = MentionChain([
        Mention('1', 'doc1', (0, 1), (), EntityType.PER, 'NIL10'),
        Mention('2', 'doc1', (2, 5), (), EntityType.PER, 'NIL10'),
    ])
    second_chain = MentionChain([
        Mention('3', 'doc1', (8, 11), (), EntityType.PER, 'NIL11'),
        Mention('4', 'doc1', (14, 17), (), EntityType.PER, 'NIL11'),
    ])
    doc = unittest.mock.Mock()
    doc.doc_id = 'doc1'
    doc.mention_chains = [first_chain, second_chain]

    scorer = CorefScorer(gt, CorefMetric.MUC)
    scorer.update(doc)

    self.assertEqual(2, scorer.recall_numerator)
    self.assertEqual(3, scorer.recall_denominator)
    self.assertEqual(2, scorer.precision_numerator)
    self.assertEqual(2, scorer.precision_denominator)
def test(self):
    """Check that the wrapped stage only runs for the configured language.

    The same five singleton chains are processed twice: with the document
    language set to AKA the chain count is expected to stay at five, with
    ENG everything is expected to end up in a single five-mention chain.
    """

    def build_chains():
        # Fresh objects on every call so the second run starts clean.
        rows = [
            ('Ed Smith', (141, 149), (22, 23), EntityType.PER, 'Men1'),
            ('Ed Smith', (146, 154), (24, 25), EntityType.PER, 'Men2'),
            ('Ben Smith', (173, 181), (36, 37), EntityType.PER, 'Men3'),
            ('ed Smith', (186, 194), (51, 52), EntityType.PER, 'Men4'),
            ('Ed Smith', (237, 245), (71, 72), EntityType.ORG, 'Men5'),
        ]
        return [
            MentionChain([
                Mention(text, '_DF_doc34', offsets, tokens, etype, mention_id)
            ])
            for text, offsets, tokens, etype, mention_id in rows
        ]

    doc = unittest.mock.Mock()
    stage = LanguageSpecificStage(CombineEverything(), Lang.ENG)

    # Language differs from the one the stage was configured with.
    doc.lang = Lang.AKA
    doc.mention_chains = build_chains()
    stage.update(doc)
    self.assertEqual(5, len(doc.mention_chains))

    # Language matches the configured one.
    doc.lang = Lang.ENG
    doc.mention_chains = build_chains()
    stage.update(doc)
    self.assertEqual(1, len(doc.mention_chains))
    self.assertEqual(5, len(doc.mention_chains[0].mentions))
def testWithNoStages(self):
    """Run a cascade with an empty stage list over three mentions.

    The test expects three mention chains on the document afterwards.
    """
    mentions = [
        Mention('Ed Smith', '_DF_doc34', (141, 149), (22, 23),
                EntityType.PER, 'Men1'),
        Mention('Ed Smith', '_DF_doc34', (146, 154), (24, 25),
                EntityType.PER, 'Men2'),
        Mention('Ben Smith', '_DF_doc34', (173, 181), (36, 37),
                EntityType.PER, 'Men3'),
    ]
    doc = unittest.mock.Mock()
    doc.mentions = mentions

    CascadeCoref([]).coref(doc)

    self.assertEqual(3, len(doc.mention_chains))
def test_across_entity_type(self):
    """Check 'SC' (ORG) is not merged with 'South Carolina' (GPE).

    Both singleton chains are expected to survive the AcronymStage update.
    """
    full_name_chain = MentionChain([
        Mention('South Carolina', '_DF_doc34', (141, 149), (22, 23),
                EntityType.GPE, 'Men1'),
    ])
    acronym_chain = MentionChain([
        Mention('SC', '_DF_doc34', (146, 154), (24, 25),
                EntityType.ORG, 'Men6')
    ])
    doc = unittest.mock.Mock()
    doc.mention_chains = [full_name_chain, acronym_chain]

    AcronymStage(2).update(doc)

    self.assertEqual(2, len(doc.mention_chains))
def test_no_match(self):
    """Expect a falsy first feature for 'New York' vs. 'Nueva York'."""
    features = FeatureVector()
    candidate = Entity('1', EntityType.GPE, 'Nueva York', EntityOrigin.GEO)
    mention = Mention('New York', 'doc1', (), (), EntityType.GPE)
    chain = MentionChain([mention])

    ExactMatchFeature().extract(chain, candidate, None, features)

    self.assertFalse(features.data[0])
def test_multiple_matches(self):
    """Resolve a chain when two candidates share the same Wikipedia URL.

    Candidates '122' and '123' both carry the John_Smith URL. The test
    expects two candidates to remain and no entity to be selected.
    """
    chain = MentionChain(
        [Mention('John Smith', 'doc1', (4, 8), (), EntityType.PER)])
    doc = unittest.mock.Mock()
    doc.doc_id = 'doc1'
    doc.mention_chains = [chain]
    chain.candidates = [
        Entity('122', EntityType.PER, 'John Smith', EntityOrigin.WLL,
               urls=['http://en.wikipedia.org/wiki/John_Smith']),
        Entity('123', EntityType.PER, 'John Smith', EntityOrigin.WLL,
               urls=['http://en.wikipedia.org/wiki/John_Smith']),
        Entity('124', EntityType.PER, 'Jake Smith', EntityOrigin.WLL,
               urls=['http://en.wikipedia.org/wiki/John_P_Smith']),
    ]

    WikipediaResolver().resolve(doc)

    self.assertEqual(1, len(doc.mention_chains))
    resolved_chain = doc.mention_chains[0]
    self.assertEqual(2, len(resolved_chain.candidates))
    self.assertIsNone(resolved_chain.entity)
def test_simple(self):
    """Assign an id to the same mention twice and expect 'M1' then 'M2'."""
    ted = Mention("Ted", "IL9_SM_001", (4, 8), (0, 1), EntityType.PER)
    assigner = InProcessIncremental()

    assigner.assign(ted)
    self.assertEqual('M1', ted.id)

    # A second call on the same mention overwrites the id with the next one.
    assigner.assign(ted)
    self.assertEqual('M2', ted.id)
def test_positive(self):
    """Expect a truthy first feature for 'Jep Smith' vs. 'John Smith'."""
    features = FeatureVector()
    candidate = Entity('1', EntityType.PER, 'John Smith', EntityOrigin.WLL)
    mention = Mention('Jep Smith', 'doc1', (), (), EntityType.PER)
    chain = MentionChain([mention])

    LastNameFeature().extract(chain, candidate, None, features)

    self.assertTrue(features.data[0])
def test_not_multi_token_name(self):
    """Expect a falsy first feature when the mention is just 'Smith'."""
    features = FeatureVector()
    candidate = Entity('1', EntityType.PER, 'John Smith', EntityOrigin.WLL)
    mention = Mention('Smith', 'doc1', (), (), EntityType.PER)
    chain = MentionChain([mention])

    LastNameFeature().extract(chain, candidate, None, features)

    self.assertFalse(features.data[0])
def test(self):
    """Resolve with ExactNameResolver followed by WikipediaResolver.

    Three candidates go in; the test expects two candidates to remain on
    the chain and entity '122' to be selected.
    """
    chain = MentionChain(
        [Mention('John Smith', 'doc1', (4, 8), (), EntityType.PER)])
    doc = unittest.mock.Mock()
    doc.doc_id = 'doc1'
    doc.mention_chains = [chain]
    chain.candidates = [
        Entity('122', EntityType.PER, 'John Smith', EntityOrigin.WLL,
               urls=['http://en.wikipedia.org/wiki/John_Smith']),
        Entity('123', EntityType.PER, 'John Smith', EntityOrigin.WLL,
               urls=['http://en.wikipedia.org/wiki/John_H_Smith']),
        Entity('124', EntityType.PER, 'Jake Smith', EntityOrigin.WLL,
               urls=['http://en.wikipedia.org/wiki/John_Smith']),
    ]

    cascade = CascadeResolver([ExactNameResolver(), WikipediaResolver()])
    cascade.resolve(doc)

    self.assertEqual(1, len(doc.mention_chains))
    resolved_chain = doc.mention_chains[0]
    self.assertEqual(2, len(resolved_chain.candidates))
    self.assertEqual('122', resolved_chain.entity.id)
def test(self):
    """Expect a shared-token score of about 0.5 for 'Nueva York'.

    The entity is 'New York City' with names {'NYC', 'New York'}.
    """
    features = FeatureVector()
    candidate = Entity(
        '1', EntityType.GPE, 'New York City', EntityOrigin.GEO)
    candidate.names = {'NYC', 'New York'}
    mention = Mention('Nueva York', 'doc1', (), (), EntityType.GPE)
    chain = MentionChain([mention])

    SharedTokensFeature().extract(chain, candidate, None, features)

    self.assertAlmostEqual(0.5, features.data[0])
def test_merge(self):
    """Merge the first and third of three chains via a dummy stage.

    Afterwards the document is expected to hold two chains, the last of
    which has three mentions.
    """
    two_mention_chain = MentionChain([
        Mention('1', '_NW_1', (), (), EntityType.PER, 'Men1'),
        Mention('4', '_NW_1', (), (), EntityType.PER, 'Men4')
    ])
    untouched_chain = MentionChain(
        [Mention('2', '_NW_1', (), (), EntityType.PER, 'Men2')])
    single_chain = MentionChain(
        [Mention('3', '_NW_1', (), (), EntityType.PER, 'Men3')])

    doc = unittest.mock.Mock()
    doc.mention_chains = [two_mention_chain, untouched_chain, single_chain]

    stage = CorefStageTest.DummyStage()
    stage.merge(doc, [two_mention_chain, single_chain])

    self.assertEqual(2, len(doc.mention_chains))
    self.assertEqual(3, len(doc.mention_chains[-1].mentions))
def test_match(self):
    """Expect a truthy first feature once a translation is attached.

    The mention text is 'Nueva York' with translate_string 'new york';
    the entity names include 'New York'.
    """
    features = FeatureVector()
    candidate = Entity('1', EntityType.GPE, 'New York', EntityOrigin.GEO)
    candidate.names = {'New York', 'New York City', 'NYC'}
    chain = MentionChain(
        [Mention('Nueva York', 'doc1', (), (), EntityType.GPE)])
    # Set the translation through the chain, after it has been built.
    chain.mentions[0].translate_string = 'new york'

    ExactMatchFeature().extract(chain, candidate, None, features)

    self.assertTrue(features.data[0])
def test_negative(self):
    """Expect a falsy first feature for 'New York City' vs. New_York URL."""
    features = FeatureVector()
    candidate = Entity(
        '1', EntityType.GPE, 'New York', EntityOrigin.GEO,
        urls=['http://en.wikipedia.org/wiki/New_York'])
    mention = Mention('New York City', 'doc1', (), (), EntityType.GPE)
    chain = MentionChain([mention])

    WikipediaFeature().extract(chain, candidate, None, features)

    self.assertFalse(features.data[0])
def test(self):
    """Run SingleTokenMatchStage(index=-1) over six singleton chains.

    After the update the test expects four chains whose sizes are
    1, 1, 1 and 3.
    """
    doc = unittest.mock.Mock()
    doc.mention_chains = [
        MentionChain([
            Mention('Ed Smith', '_DF_doc34', (141, 149), (22, 23),
                    EntityType.PER, 'Men1')
        ]),
        MentionChain([
            Mention('Smith', '_DF_doc34', (146, 154), (24, 25),
                    EntityType.PER, 'Men2')
        ]),
        MentionChain([
            Mention('Ben Smithy', '_DF_doc34', (173, 181), (36, 37),
                    EntityType.PER, 'Men3')
        ]),
        MentionChain([
            Mention('ed smith', '_DF_doc34', (186, 194), (51, 52),
                    EntityType.PER, 'Men4')
        ]),
        MentionChain([
            Mention('Tony Smith', '_DF_doc34', (237, 245), (71, 72),
                    EntityType.ORG, 'Men5')
        ]),
        MentionChain([
            Mention('Smith Jones', '_DF_doc34', (298, 306), (36, 37),
                    EntityType.PER, 'Men6')
        ]),
    ]
    stage = SingleTokenMatchStage(index=-1)

    stage.update(doc)

    self.assertEqual(4, len(doc.mention_chains))
    # sorted() consumes any iterable, so the map object is passed directly;
    # the expected literal is already in ascending order.
    self.assertEqual([1, 1, 1, 3], sorted(map(len, doc.mention_chains)))
def test(self):
    """Run AcronymStage(2) over four chains including two 'SC' chains.

    After the update the test expects three chains whose mention counts
    form the set {1, 2, 3}.
    """
    south_carolina = MentionChain([
        Mention('South Carolina', '_DF_doc34', (141, 149), (22, 23),
                EntityType.GPE, 'Men1'),
        Mention('south carolina', '_DF_doc34', (173, 181), (36, 37),
                EntityType.GPE, 'Men3')
    ])
    sc_gpe = MentionChain([
        Mention('SC', '_DF_doc34', (146, 154), (24, 25),
                EntityType.GPE, 'Men2')
    ])
    sc_org = MentionChain([
        Mention('SC', '_DF_doc34', (146, 154), (24, 25),
                EntityType.ORG, 'Men6')
    ])
    ed_smith = MentionChain([
        Mention('ed Smith', '_DF_doc34', (186, 194), (51, 52),
                EntityType.PER, 'Men4'),
        Mention('Ed Smith', '_DF_doc34', (237, 245), (71, 72),
                EntityType.PER, 'Men5')
    ])
    doc = unittest.mock.Mock()
    doc.mention_chains = [south_carolina, sc_gpe, sc_org, ed_smith]

    AcronymStage(2).update(doc)

    self.assertEqual(3, len(doc.mention_chains))
    chain_sizes = {len(chain.mentions) for chain in doc.mention_chains}
    self.assertEqual({1, 2, 3}, chain_sizes)
def test(self):
    """Run ExactMatchStage over five singleton chains.

    After the update the test expects three chains of sizes 1, 1 and 3.
    For any chain named 'ed smith' (case-insensitive) of type PER, the
    mention ids and offsets are checked against Men1, Men2 and Men4.
    """
    doc = unittest.mock.Mock()
    doc.mention_chains = [
        MentionChain([
            Mention('Ed Smith', '_DF_doc34', (141, 149), (22, 23),
                    EntityType.PER, 'Men1')
        ]),
        MentionChain([
            Mention('Ed Smith', '_DF_doc34', (146, 154), (24, 25),
                    EntityType.PER, 'Men2')
        ]),
        MentionChain([
            Mention('Ben Smith', '_DF_doc34', (173, 181), (36, 37),
                    EntityType.PER, 'Men3')
        ]),
        MentionChain([
            Mention('ed Smith', '_DF_doc34', (186, 194), (51, 52),
                    EntityType.PER, 'Men4')
        ]),
        MentionChain([
            Mention('Ed Smith', '_DF_doc34', (237, 245), (71, 72),
                    EntityType.ORG, 'Men5')
        ]),
    ]
    stage = ExactMatchStage()

    stage.update(doc)

    self.assertEqual(3, len(doc.mention_chains))
    # sorted() consumes any iterable, so the map object is passed directly;
    # the expected literal is already in ascending order.
    self.assertEqual([1, 1, 3], sorted(map(len, doc.mention_chains)))

    # NOTE(review): these assertions only run when a chain passes the
    # guard; if none does, the loop checks nothing. Confirm how
    # chain.name is derived before tightening this.
    for chain in doc.mention_chains:
        if chain.name.lower() == 'ed smith' and chain.type == EntityType.PER:
            self.assertEqual({'Men1', 'Men2', 'Men4'},
                             {m.id for m in chain.mentions})
            self.assertEqual({(141, 149), (146, 154), (186, 194)},
                             {m.offsets for m in chain.mentions})
def test_no_matches(self):
    """Resolve 'John Smith' against a single candidate named 'John'.

    The test expects the candidate list to be unchanged and no entity to
    be selected.
    """
    chain = MentionChain(
        [Mention('John Smith', 'doc1', (4, 8), (), EntityType.PER)])
    doc = unittest.mock.Mock()
    doc.doc_id = 'doc1'
    doc.mention_chains = [chain]
    chain.candidates = [
        Entity('122', EntityType.PER, 'John', EntityOrigin.WLL)
    ]

    ExactNameResolver().resolve(doc)

    self.assertEqual(1, len(doc.mention_chains))
    resolved_chain = doc.mention_chains[0]
    self.assertEqual(1, len(resolved_chain.candidates))
    self.assertIsNone(resolved_chain.entity)
def test_one_match(self):
    """Resolve 'John Smith' when one candidate lists 'john smith' as a name.

    The test expects both candidates to stay on the chain and the first
    candidate to be selected as the entity.
    """
    chain = MentionChain(
        [Mention('John Smith', 'doc1', (4, 8), (), EntityType.PER)])
    doc = unittest.mock.Mock()
    doc.doc_id = 'doc1'
    doc.mention_chains = [chain]

    entity1 = Entity('122', EntityType.PER, 'John', EntityOrigin.WLL)
    entity1.names = {'John', 'J. Smith', 'john smith'}
    other = Entity('123', EntityType.PER, 'Not John', EntityOrigin.WLL)
    chain.candidates = [entity1, other]

    ExactNameResolver().resolve(doc)

    self.assertEqual(1, len(doc.mention_chains))
    resolved_chain = doc.mention_chains[0]
    self.assertEqual(2, len(resolved_chain.candidates))
    self.assertEqual(entity1, resolved_chain.entity)
def test_one_match(self):
    """Resolve 'John Smith' when one candidate has the John_Smith URL.

    The test expects both candidates to stay on the chain and the first
    candidate to be selected as the entity.
    """
    chain = MentionChain(
        [Mention('John Smith', 'doc1', (4, 8), (), EntityType.PER)])
    doc = unittest.mock.Mock()
    doc.doc_id = 'doc1'
    doc.mention_chains = [chain]

    entity1 = Entity(
        '122', EntityType.PER, 'John', EntityOrigin.WLL,
        urls=['http://en.wikipedia.org/wiki/John_Smith'])
    other = Entity('123', EntityType.PER, 'Not John', EntityOrigin.WLL)
    chain.candidates = [entity1, other]

    WikipediaResolver().resolve(doc)

    self.assertEqual(1, len(doc.mention_chains))
    resolved_chain = doc.mention_chains[0]
    self.assertEqual(2, len(resolved_chain.candidates))
    self.assertEqual(entity1, resolved_chain.entity)
def test(self):
    """Resolve one chain with an SVM resolver built from trainClassifier().

    Four candidates are offered; the test expects entity '124' to win.
    """
    resolver = SvmResolver(
        self.trainClassifier(), EntityFeatureExtractor(SimpleFeature()))

    chain = MentionChain(
        [Mention('John Smith', 'doc1', (), (), EntityType.PER)])
    doc = unittest.mock.Mock()
    doc.mention_chains = [chain]
    chain.candidates = [
        Entity('122', EntityType.PER, 'John Smith', EntityOrigin.WLL),
        Entity('123', EntityType.PER, 'John Smith', EntityOrigin.WLL),
        Entity('124', EntityType.PER, 'John P. Smith', EntityOrigin.WLL),
        Entity('125', EntityType.PER, 'Jake Smith', EntityOrigin.WLL),
    ]

    resolver.resolve(doc)

    self.assertEqual('124', doc.mention_chains[0].entity.id)
def test(self):
    """Score four resolved chains against a five-mention ground truth.

    Expected PER counts in the scorer report: two mentions with a correct
    candidate and one mention with the correct entity.
    """

    def per_mention(offsets):
        # Mentions here differ only by their offsets.
        return Mention('', 'doc1', offsets, (), EntityType.PER)

    def wll_entity(entity_id):
        # Candidates here differ only by their id.
        return Entity(entity_id, EntityType.PER, '', EntityOrigin.WLL)

    gt = {
        'doc1': {
            (0, 2): Link(EntityType.PER, LinkType.LINK, ['123', '122'], None),
            (4, 8): Link(EntityType.PER, LinkType.NIL, [], 'NIL999'),
            (10, 12): Link(EntityType.PER, LinkType.LINK, ['222'], None),
            (16, 17): Link(EntityType.PER, LinkType.LINK, ['333'], None),
            (18, 19): Link(EntityType.PER, LinkType.LINK, ['444'], None),
        }
    }

    chain_a = MentionChain([
        per_mention((0, 2)),
        per_mention((3, 7)),
        per_mention((16, 17)),
    ])
    chain_b = MentionChain([per_mention((4, 8))])
    chain_c = MentionChain([per_mention((10, 12))])
    chain_d = MentionChain([per_mention((18, 19))])

    doc = unittest.mock.Mock()
    doc.doc_id = 'doc1'
    doc.mention_chains = [chain_a, chain_b, chain_c, chain_d]

    chain_a.candidates = [wll_entity('122')]
    chain_a.entity = chain_a.candidates[0]

    chain_b.candidates = [wll_entity('147')]
    chain_b.entity = chain_b.candidates[0]

    # This chain has candidates but no entity was picked.
    chain_c.candidates = [wll_entity('198'), wll_entity('222')]
    chain_c.entity = None

    chain_d.candidates = [wll_entity('17')]
    chain_d.entity = chain_d.candidates[0]

    scorer = ResolverScorer(gt)
    scorer.update(doc)

    report = scorer.report
    self.assertEqual(
        2, report.num_mentions_with_correct_candidate[EntityType.PER])
    self.assertEqual(
        1, report.num_mentions_correct_entity[EntityType.PER])
def get_example1_test_data(self):
    """Build the ground truth and mock document for example 1.

    First example from "Algorithms for scoring coreference chains".
    The ground truth groups offsets (0, 1)..(0, 12) into 'NIL1' (1-5),
    'NIL2' (6-7) and 'NIL3' (8-12); the document groups the same twelve
    mentions into two chains (1-5 and 6-12).

    Returns:
        A ``(gt, doc)`` tuple: the ground-truth dict keyed by doc id and a
        mock document with ``doc_id`` and ``mention_chains`` set.
    """
    gold_clusters = (
        ('NIL1', range(1, 6)),
        ('NIL2', range(6, 8)),
        ('NIL3', range(8, 13)),
    )
    gt = {
        'doc1': {
            (0, end): Link(EntityType.PER, LinkType.LINK, cluster_id, None)
            for cluster_id, ends in gold_clusters
            for end in ends
        },
    }

    def predicted_chain(ends, mention_id):
        # Mention text is the end offset rendered as a string ('1'..'12').
        return MentionChain([
            Mention(str(end), 'doc1', (0, end), (), EntityType.PER, mention_id)
            for end in ends
        ])

    doc = unittest.mock.Mock()
    doc.doc_id = 'doc1'
    doc.mention_chains = [
        predicted_chain(range(1, 6), 'NIL10'),
        predicted_chain(range(6, 13), 'NIL11'),
    ]
    return gt, doc