def __init__(self, ner_tokens, corpus_tokens, res_name="unknown"):
    """Compute span-match statistics between NER output and corpus gold tokens.

    Each token is a (text, offset, length, type) record.  Every NER token
    that intersects at least one corpus token is classified into exactly
    one bucket (exact match with/without type error, noisy span, truncated
    span, or a one-to-many grouping); tokens present in only one of the
    two sets are counted separately.

    ner_tokens    -- tokens produced by the NER system (non-empty)
    corpus_tokens -- gold-standard tokens (non-empty)
    res_name      -- label stored as self.res_name

    Raises AssertionError on empty inputs (kept as assert: the project's
    own tests rely on AssertionError for argument validation).
    """
    assert ner_tokens and corpus_tokens and res_name
    self.res_name = res_name
    print("Calculating statistics .... ")

    # Materialize once -- the iterable is traversed several times below.
    corpus_tokens = list(corpus_tokens)
    self.corpus_defined_entities = len(corpus_tokens)

    self.intersections = 0
    # Each counter has a parallel "_"-prefixed list holding the tokens
    # that were counted, for later inspection.
    self.exact_match, self._exact_match = 0, []
    self.single_entity, self._single_entity = 0, []
    self.multiple_entities, self._multiple_entities = 0, []
    self.noise_in_entity, self._noise_in_entity = 0, []
    self.not_completed_entity, self._not_completed_entity = 0, []
    self.exact_match_type_error, self._exact_match_type_error = 0, []
    self.exact_match_no_type_error, self._exact_match_no_type_error = 0, []
    self.wrong_grouped_entities, self._wrong_grouped_entities = 0, []

    ner_tokens_set = TokenSet(ner_tokens)
    for t in ner_tokens_set.tokens(TokenSet.IntersectedTokens(corpus_tokens)):
        # Inclusive character span of the NER token (t[1]=offset, t[2]=length).
        ner_off1, ner_off2 = t[1], t[1] + t[2] - 1
        if len(t.matched) > 1:
            # One NER token overlaps several corpus entities.
            self.multiple_entities += 1
            # Fix: this list was initialized but never populated, unlike
            # every other counter/list pair in this method.
            self._multiple_entities.append(t)
            ss_off1 = t.matched[0][1]
            es_off2 = t.matched[-1][1] + t.matched[-1][2] - 1
            if ss_off1 == ner_off1 and es_off2 == ner_off2:
                # NER span starts at the first corpus entity and ends at
                # the last.  If the entity lengths plus one separator
                # character between each pair add up to the NER length,
                # the system merely glued adjacent entities together.
                total_len = len(t.matched) - 1
                for m in t.matched:
                    total_len += m[2]
                if total_len == t[2]:
                    self.wrong_grouped_entities += 1
                    # Fix: list was never populated (same defect as above).
                    self._wrong_grouped_entities.append(t)
        else:
            self.single_entity += 1
            # Fix: list was never populated (same defect as above).
            self._single_entity.append(t)
            c_off1, c_off2 = t.matched[0][1], t.matched[0][1] + t.matched[0][2] - 1
            if ner_off1 == c_off1 and ner_off2 == c_off2:
                self.exact_match += 1
                self._exact_match.append(t)
                if t.matched[0][3] == t[3]:
                    self.exact_match_no_type_error += 1
                    self._exact_match_no_type_error.append(t)
                else:
                    # Span is right but the entity type differs.
                    self.exact_match_type_error += 1
                    self._exact_match_type_error.append(t)
            elif ner_off1 < c_off1 or ner_off2 > c_off2:
                # NER span sticks out past the corpus entity on some side.
                self.noise_in_entity += 1
                self._noise_in_entity.append(t)
            elif ner_off1 > c_off1 or ner_off2 < c_off2:
                # NER span is a proper sub-span of the corpus entity.
                self.not_completed_entity += 1
                self._not_completed_entity.append(t)
        self.intersections += 1
    print("Step 1 is done")

    # NOTE(review): NotIntersectedSet is referenced unqualified here while
    # every other rule is a TokenSet.* attribute (the test suite uses
    # TokenSet.NOT(TokenSet.IntersectedTokens(...)) for the same idea) --
    # confirm NotIntersectedSet exists at module level.
    self.not_in_corpus = sum(
        1 for _ in ner_tokens_set.tokens(NotIntersectedSet(corpus_tokens)))
    print("Step 2 is done")
    self.not_in_ner = sum(
        1 for _ in TokenSet(corpus_tokens).tokens(NotIntersectedSet(ner_tokens)))
    print("Step 3 is done")
def test_token_set(self):
    """Exercise TokenSet: indexing, membership, iteration and filter rules.

    Tokens are (text, offset, length, type) records.  Spans implied by the
    fixture (inclusive): Amsterdam 0-8, FIFA 30-33, Amstel 73-78;
    "Something" has offset -1, i.e. an undefined position.
    """
    tokens = (["Amsterdam", 0, 9, Token.NE_LOC],
              ["FIFA", 30, 4, Token.NE_ORG],
              ["Something", -1, 4, 0],
              ["Amstel", 73, 6, Token.NE_MISC])
    s = TokenSet(tokens)

    # --- indexing and length ---
    self.assertEqual(s[0], tokens[0])
    self.assertEqual(s[1], tokens[1])
    self.assertEqual(s[2], tokens[2])
    self.assertEqual(s[3], tokens[3])
    self.assertEqual(len(s), len(tokens))

    # Out-of-range read raises IndexError.
    def f(): return s[4]
    self.assertRaises(IndexError, f)

    # TokenSet is read-only: item assignment is not implemented.
    # (Intentionally shadows the previous f -- assertRaises already ran.)
    def f(): s[0] = ("", 1, 1, 0)
    self.assertRaises(NotImplementedError, f)

    # --- membership ---
    self.assertEqual(tokens[0] in s, True)
    self.assertEqual(tokens[1] in s, True)
    self.assertEqual(tokens[2] in s, True)
    self.assertEqual(tokens[3] in s, True)
    self.assertEqual(("", 2, 2, 2) in s, False)

    # --- iteration preserves insertion order ---
    i = 0
    for t in s:
        self.assertEqual(tokens[i], t)
        i += 1
    self.assertEqual(len(tokens), i)

    # tokens() with no rule iterates everything, same order.
    i = 0
    for t in s.tokens():
        self.assertEqual(tokens[i], t)
        i += 1
    self.assertEqual(len(tokens), i)

    # --- type-mask filtering ---
    # NE_BITS appears to select all typed tokens, excluding the type-0
    # "Something" entry -- TODO confirm against Token.NE_BITS definition.
    r = [e for e in s.tokens(Token.NE_BITS)]
    self.assertEqual(len(tokens) - 1, len(r))
    r = [e for e in s.tokens(Token.NE_ORG)]
    self.assertEqual(1, len(r))
    self.assertEqual(r[0], tokens[1])

    # --- UndefPosition: tokens with offset < 0 ---
    rule = TokenSet.UndefPosition()
    r = [e for e in s.tokens(rule)]
    self.assertEqual(1, len(r))
    self.assertEqual(r[0], tokens[2])

    # --- InInterval boundary cases ---
    # NOTE(review): the 53/58/59 cases only make sense if InInterval takes
    # (offset, length) and matches tokens fully inside [offset, offset+length):
    # Amstel occupies 73..78, so only length 59 (-> end 79) captures it.
    rule = TokenSet.InInterval(20, 53)
    r = [e for e in s.tokens(rule)]
    self.assertEqual(1, len(r))
    self.assertEqual(r[0], tokens[1])
    rule = TokenSet.InInterval(20, 58)
    r = [e for e in s.tokens(rule)]
    self.assertEqual(1, len(r))
    self.assertEqual(r[0], tokens[1])
    rule = TokenSet.InInterval(20, 59)
    r = [e for e in s.tokens(rule)]
    self.assertEqual(2, len(r))
    self.assertEqual(r[0], tokens[1])
    self.assertEqual(r[1], tokens[-1])

    # Invalid intervals are rejected via assert.
    def f1(): TokenSet.InInterval(-1, 52)
    def f2(): TokenSet.InInterval(0, 0)
    def f3(): TokenSet.InInterval(0, -1)
    self.assertRaises(AssertionError, f1)
    self.assertRaises(AssertionError, f2)
    self.assertRaises(AssertionError, f3)

    # --- boolean combinators: NOT / OR / AND ---
    rule = TokenSet.NOT(TokenSet.UndefPosition())
    r = [e for e in s.tokens(rule)]
    self.assertEqual(3, len(r))
    self.assertEqual(r[0], tokens[0])
    self.assertEqual(r[1], tokens[1])
    self.assertEqual(r[2], tokens[3])
    rule = TokenSet.OR(TokenSet.Type(Token.NE_ORG), TokenSet.Type(Token.NE_LOC))
    r = [e for e in s.tokens(rule)]
    self.assertEqual(2, len(r))
    self.assertEqual(r[0], tokens[0])
    self.assertEqual(r[1], tokens[1])
    rule = TokenSet.AND(TokenSet.InInterval(0, 35), TokenSet.Type(Token.NE_LOC))
    r = [e for e in s.tokens(rule)]
    self.assertEqual(1, len(r))
    self.assertEqual(r[0], tokens[0])
    # Nested combinators.
    rule = TokenSet.AND(TokenSet.InInterval(0, 35),
                        TokenSet.OR(TokenSet.Type(Token.NE_LOC),
                                    TokenSet.Type(Token.NE_ORG)))
    r = [e for e in s.tokens(rule)]
    self.assertEqual(2, len(r))
    self.assertEqual(r[0], tokens[0])
    self.assertEqual(r[1], tokens[1])
    rule = TokenSet.NOT(TokenSet.OR(TokenSet.Type(Token.NE_ORG),
                                    TokenSet.Type(Token.NE_LOC)))
    r = [e for e in s.tokens(rule)]
    self.assertEqual(2, len(r))
    self.assertEqual(r[0], tokens[2])
    self.assertEqual(r[1], tokens[3])

    # --- EqualTokens: full-record equality against a reference list ---
    tokens_to_compare = (["Amsterdam", 0, 9, Token.NE_LOC],
                         ["FIFA", 30, 4, Token.NE_ORG])
    rule = TokenSet.EqualTokens(tokens_to_compare)
    r = [e for e in s.tokens(rule)]
    self.assertEqual(2, len(r))
    self.assertEqual(r[0], Token(tokens[0]))
    self.assertEqual(r[1], Token(tokens[1]))
    rule = TokenSet.NOT(TokenSet.EqualTokens(tokens_to_compare))
    r = [e for e in s.tokens(rule)]
    self.assertEqual(2, len(r))
    self.assertEqual(r[0], tokens[2])
    self.assertEqual(r[1], tokens[3])
    rule = TokenSet.OR(TokenSet.EqualTokens(tokens_to_compare),
                       TokenSet.UndefPosition())
    r = [e for e in s.tokens(rule)]
    self.assertEqual(3, len(r))
    self.assertEqual(r[0], tokens[0])
    self.assertEqual(r[1], tokens[1])
    self.assertEqual(r[2], tokens[2])
    # Contradictory AND matches nothing.
    rule = TokenSet.AND(TokenSet.EqualTokens(tokens_to_compare),
                        TokenSet.UndefPosition())
    r = [e for e in s.tokens(rule)]
    self.assertEqual(0, len(r))

    # --- EqualByPositionTokens: match on (offset, length) only,
    #     text differs; the offset -1 entry matches nothing ---
    tokens_to_compare = (["nsjdjsh", 0, 9, Token.NE_LOC],
                         ["sdsd", 30, 4, Token.NE_ORG],
                         ['dssd', -1, 4, 0])
    rule = TokenSet.EqualByPositionTokens(tokens_to_compare)
    r = [e for e in s.tokens(rule)]
    self.assertEqual(2, len(r))
    self.assertEqual(r[0], Token(tokens[0]))
    self.assertEqual(r[1], Token(tokens[1]))

    # --- IntersectedTokens: any span overlap with the reference tokens ---
    # [0, 72) overlaps Amsterdam and FIFA but stops just before Amstel (73).
    tokens_to_compare = (["ABC", 0, 72, Token.NE_LOC], )
    rule = TokenSet.IntersectedTokens(tokens_to_compare)
    r = [e for e in s.tokens(rule)]
    self.assertEqual(2, len(r))
    self.assertEqual(r[0], Token(tokens[0]))
    self.assertEqual(r[1], Token(tokens[1]))
    tokens_to_compare = (["ABC", 0, 72, Token.NE_LOC], )
    rule = TokenSet.NOT(TokenSet.IntersectedTokens(tokens_to_compare))
    r = [e for e in s.tokens(rule)]
    self.assertEqual(2, len(r))
    self.assertEqual(r[0], tokens[2])
    self.assertEqual(r[1], tokens[3])
    # [33, 74) clips the tail of FIFA and the head of Amstel.
    tokens_to_compare = (["ABC", 33, 41, Token.NE_LOC], )
    rule = TokenSet.IntersectedTokens(tokens_to_compare)
    r = [e for e in s.tokens(rule)]
    self.assertEqual(2, len(r))
    self.assertEqual(r[0], tokens[1])
    self.assertEqual(r[1], tokens[3])