def save_noise_entities(ner_id, corpus, model=None):
    a = load_all_recognized_tokens(ner_id, corpus.lang, model)
    ct = corpus.ne_tokens()
    if model is None:
        model = 'default'

    f1 = open(ner_id + "." + corpus.lang + "." + model + ".multiple.txt", "w")
    f2 = open(ner_id + "." + corpus.lang + "." + model + ".single.txt", "w")

    for t in TokenSet(a).tokens(TokenSet.MatchIntersectedSet(ct)):
        ner_off1, ner_off2 = t[1], t[1] + t[2] - 1

        if len(t.matched) > 1:
            # multiple intersections: the recognized entity overlaps
            # more than one corpus entity
            f1.write(str(t) + "\n")
            for m in t.matched:
                f1.write(">>> " + str(Token(m)) + "\n")
        else:
            # single intersection
            c_off1, c_off2 = t.matched[0][1], t.matched[0][1] + t.matched[0][2] - 1
            # exactly matched tokens are not of interest here, skip them
            if ner_off1 == c_off1 and ner_off2 == c_off2:
                continue
            f2.write(str(t) + "\n")
            for m in t.matched:
                f2.write(">>> " + str(Token(m)) + "\n")

    f1.close()
    f2.close()
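# A hedged usage sketch for save_noise_entities. Both names below are
# illustrative placeholders, not identifiers defined in this module:
# 'stanford' stands for whatever ner_id the result loaders understand, and
# `corpus` for a corpus object exposing `lang` and `ne_tokens()` as used above.
#
#   corpus = load_corpus('nl')                # hypothetical corpus factory
#   save_noise_entities('stanford', corpus)   # writes <ner_id>.<lang>.<model>.multiple.txt
#                                             # and    <ner_id>.<lang>.<model>.single.txt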
def calc_recognized_types_distribution_for_wrongly_recognized_entities(ner_id, lang='nl', model=None):
    t = load_matched_wrongtyped_tokens(ner_id, lang, model)
    s = TokenSet(t)
    misc = s.tokens(Token.NE_MISC)
    loc = s.tokens(Token.NE_LOC)
    per = s.tokens(Token.NE_PER)
    org = s.tokens(Token.NE_ORG)

    print "======== %s Recognized entities type distribution:" % ner_id
    print "LOCATIONS     : %4d %3d" % (len(loc), (len(loc) * 100) / len(t))
    print "PERSONS       : %4d %3d" % (len(per), (len(per) * 100) / len(t))
    print "ORGANIZATIONS : %4d %3d" % (len(org), (len(org) * 100) / len(t))
    print "MISC          : %4d %3d" % (len(misc), (len(misc) * 100) / len(t))
    print "============================="
    print "AMOUNT        : %4d 100" % len(t)
def calc_types_distribution_for_completly_wrongly_recognized_entities(ner_id, lang='nl', model=None):
    class NotMatchLocationSet(TokenSet.MatchSet):
        # matches tokens whose positions are both defined but do not coincide
        # (note: defined here but not used below)
        def __init__(self, tokens):
            super(self.__class__, self).__init__(tokens, False)

        def match_tokens(self, token1, token2):
            return (token1[1] >= 0 and token2[1] >= 0 and
                    (token1[1] != token2[1] or token1[2] != token2[2]))

    a = load_all_recognized_tokens(ner_id, lang, model)
    r = load_all_matched_tokens(ner_id, lang, model)
    # keep only recognized tokens whose location matches no matched token
    nm = TokenSet.NotMatchSet(TokenSet.MatchLocationSet(r))
    s = TokenSet(TokenSet(a).tokens(nm))
    misc = s.tokens(Token.NE_MISC)
    loc = s.tokens(Token.NE_LOC)
    per = s.tokens(Token.NE_PER)
    org = s.tokens(Token.NE_ORG)

    print "======== %s Recognized entities type distribution:" % ner_id
    print "LOCATIONS     : %4d %3d" % (len(loc), (len(loc) * 100) / len(s))
    print "PERSONS       : %4d %3d" % (len(per), (len(per) * 100) / len(s))
    print "ORGANIZATIONS : %4d %3d" % (len(org), (len(org) * 100) / len(s))
    print "MISC          : %4d %3d" % (len(misc), (len(misc) * 100) / len(s))
    print "============================="
    print "AMOUNT        : %4d 100" % len(s)
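# A hedged usage sketch for the two reports above ('stanford' is an
# illustrative placeholder for a ner_id known to the loader functions):
#
#   calc_recognized_types_distribution_for_wrongly_recognized_entities('stanford', lang='nl')
#   calc_types_distribution_for_completly_wrongly_recognized_entities('stanford', lang='nl')
#
# Each report prints, per entity type, the absolute count and its integer
# percentage of the analyzed token set.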
def __init__(self, ner_tokens, corpus_tokens, res_name="unknown"):
    assert ner_tokens and corpus_tokens and res_name
    self.res_name = res_name

    print "Calculating statistics ..."

    corpus_tokens = [t for t in corpus_tokens]
    self.corpus_defined_entities = len(corpus_tokens)

    self.intersections = 0
    self.exact_match, self._exact_match = 0, []
    self.single_entity, self._single_entity = 0, []
    self.multiple_entities, self._multiple_entities = 0, []
    self.noise_in_entity, self._noise_in_entity = 0, []
    self.not_completed_entity, self._not_completed_entity = 0, []
    self.exact_match_type_error, self._exact_match_type_error = 0, []
    self.exact_match_no_type_error, self._exact_match_no_type_error = 0, []
    self.wrong_grouped_entities, self._wrong_grouped_entities = 0, []

    ner_tokens_set = TokenSet(ner_tokens)
    for t in ner_tokens_set.tokens(TokenSet.IntersectedTokens(corpus_tokens)):
        ner_off1, ner_off2 = t[1], t[1] + t[2] - 1

        if len(t.matched) > 1:
            # the recognized entity overlaps more than one corpus entity
            self.multiple_entities += 1
            ss_off1, ss_off2 = t.matched[0][1], t.matched[0][1] + t.matched[0][2] - 1
            es_off1, es_off2 = t.matched[-1][1], t.matched[-1][1] + t.matched[-1][2] - 1
            if ss_off1 == ner_off1 and es_off2 == ner_off2:
                # matched token lengths plus one separator character
                # between each pair of neighboring tokens
                l = len(t.matched) - 1
                for m in t.matched:
                    l += m[2]
                if l == t[2]:
                    self.wrong_grouped_entities += 1
        else:
            self.single_entity += 1
            c_off1, c_off2 = t.matched[0][1], t.matched[0][1] + t.matched[0][2] - 1
            if ner_off1 == c_off1 and ner_off2 == c_off2:
                self.exact_match += 1
                self._exact_match.append(t)
                if t.matched[0][3] == t[3]:
                    self.exact_match_no_type_error += 1
                    self._exact_match_no_type_error.append(t)
                else:
                    self.exact_match_type_error += 1
                    self._exact_match_type_error.append(t)
            elif ner_off1 < c_off1 or ner_off2 > c_off2:
                self.noise_in_entity += 1
                self._noise_in_entity.append(t)
            elif ner_off1 > c_off1 or ner_off2 < c_off2:
                self.not_completed_entity += 1
                self._not_completed_entity.append(t)

        self.intersections += 1

    print "Step 1 is done"
    self.not_in_corpus = len([e for e in ner_tokens_set.tokens(TokenSet.NOT(TokenSet.IntersectedTokens(corpus_tokens)))])
    print "Step 2 is done"
    self.not_in_ner = len([e for e in TokenSet(corpus_tokens).tokens(TokenSet.NOT(TokenSet.IntersectedTokens(ner_tokens)))])
    print "Step 3 is done"
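# Worked example of the offset convention used throughout this class, grounded
# in the unit tests below: a token is [text, start_offset, length, type], and
# its inclusive end offset is start + length - 1. A minimal self-check:
#
#   t = ["Amsterdam", 0, 9, Token.NE_LOC]
#   off1, off2 = t[1], t[1] + t[2] - 1    # span covers offsets 0 .. 8
#   assert (off1, off2) == (0, 8)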
def test_token_set(self):
    tokens = (["Amsterdam", 0, 9, Token.NE_LOC],
              ["FIFA", 30, 4, Token.NE_ORG],
              ["Something", -1, 4, 0],
              ["Amstel", 73, 6, Token.NE_MISC])

    s = TokenSet(tokens)
    self.assertEqual(s[0], tokens[0])
    self.assertEqual(s[1], tokens[1])
    self.assertEqual(s[2], tokens[2])
    self.assertEqual(s[3], tokens[3])
    self.assertEqual(len(s), len(tokens))

    def f(): return s[4]
    self.assertRaises(IndexError, f)

    def f(): s[0] = ("", 1, 1, 0)
    self.assertRaises(NotImplementedError, f)

    self.assertEqual(tokens[0] in s, True)
    self.assertEqual(tokens[1] in s, True)
    self.assertEqual(tokens[2] in s, True)
    self.assertEqual(tokens[3] in s, True)
    self.assertEqual(("", 2, 2, 2) in s, False)

    i = 0
    for t in s:
        self.assertEqual(tokens[i], t)
        i += 1
    self.assertEqual(len(tokens), i)

    i = 0
    for t in s.tokens():
        self.assertEqual(tokens[i], t)
        i += 1
    self.assertEqual(len(tokens), i)

    r = [e for e in s.tokens(Token.NE_BITS)]
    self.assertEqual(len(tokens) - 1, len(r))

    r = [e for e in s.tokens(Token.NE_ORG)]
    self.assertEqual(1, len(r))
    self.assertEqual(r[0], tokens[1])

    rule = TokenSet.UndefPosition()
    r = [e for e in s.tokens(rule)]
    self.assertEqual(1, len(r))
    self.assertEqual(r[0], tokens[2])

    rule = TokenSet.InInterval(20, 53)
    r = [e for e in s.tokens(rule)]
    self.assertEqual(1, len(r))
    self.assertEqual(r[0], tokens[1])

    rule = TokenSet.InInterval(20, 58)
    r = [e for e in s.tokens(rule)]
    self.assertEqual(1, len(r))
    self.assertEqual(r[0], tokens[1])

    rule = TokenSet.InInterval(20, 59)
    r = [e for e in s.tokens(rule)]
    self.assertEqual(2, len(r))
    self.assertEqual(r[0], tokens[1])
    self.assertEqual(r[1], tokens[-1])

    def f1(): TokenSet.InInterval(-1, 52)
    def f2(): TokenSet.InInterval(0, 0)
    def f3(): TokenSet.InInterval(0, -1)
    self.assertRaises(AssertionError, f1)
    self.assertRaises(AssertionError, f2)
    self.assertRaises(AssertionError, f3)

    rule = TokenSet.NOT(TokenSet.UndefPosition())
    r = [e for e in s.tokens(rule)]
    self.assertEqual(3, len(r))
    self.assertEqual(r[0], tokens[0])
    self.assertEqual(r[1], tokens[1])
    self.assertEqual(r[2], tokens[3])

    rule = TokenSet.OR(TokenSet.Type(Token.NE_ORG), TokenSet.Type(Token.NE_LOC))
    r = [e for e in s.tokens(rule)]
    self.assertEqual(2, len(r))
    self.assertEqual(r[0], tokens[0])
    self.assertEqual(r[1], tokens[1])

    rule = TokenSet.AND(TokenSet.InInterval(0, 35), TokenSet.Type(Token.NE_LOC))
    r = [e for e in s.tokens(rule)]
    self.assertEqual(1, len(r))
    self.assertEqual(r[0], tokens[0])

    rule = TokenSet.AND(TokenSet.InInterval(0, 35),
                        TokenSet.OR(TokenSet.Type(Token.NE_LOC), TokenSet.Type(Token.NE_ORG)))
    r = [e for e in s.tokens(rule)]
    self.assertEqual(2, len(r))
    self.assertEqual(r[0], tokens[0])
    self.assertEqual(r[1], tokens[1])

    rule = TokenSet.NOT(TokenSet.OR(TokenSet.Type(Token.NE_ORG), TokenSet.Type(Token.NE_LOC)))
    r = [e for e in s.tokens(rule)]
    self.assertEqual(2, len(r))
    self.assertEqual(r[0], tokens[2])
    self.assertEqual(r[1], tokens[3])

    tokens_to_compare = (["Amsterdam", 0, 9, Token.NE_LOC], ["FIFA", 30, 4, Token.NE_ORG])
    rule = TokenSet.EqualTokens(tokens_to_compare)
    r = [e for e in s.tokens(rule)]
    self.assertEqual(2, len(r))
    self.assertEqual(r[0], Token(tokens[0]))
    self.assertEqual(r[1], Token(tokens[1]))

    rule = TokenSet.NOT(TokenSet.EqualTokens(tokens_to_compare))
    r = [e for e in s.tokens(rule)]
    self.assertEqual(2, len(r))
    self.assertEqual(r[0], tokens[2])
    self.assertEqual(r[1], tokens[3])

    rule = TokenSet.OR(TokenSet.EqualTokens(tokens_to_compare), TokenSet.UndefPosition())
    r = [e for e in s.tokens(rule)]
    self.assertEqual(3, len(r))
    self.assertEqual(r[0], tokens[0])
    self.assertEqual(r[1], tokens[1])
    self.assertEqual(r[2], tokens[2])

    rule = TokenSet.AND(TokenSet.EqualTokens(tokens_to_compare), TokenSet.UndefPosition())
    r = [e for e in s.tokens(rule)]
    self.assertEqual(0, len(r))

    tokens_to_compare = (["nsjdjsh", 0, 9, Token.NE_LOC],
                         ["sdsd", 30, 4, Token.NE_ORG],
                         ["dssd", -1, 4, 0])
    rule = TokenSet.EqualByPositionTokens(tokens_to_compare)
    r = [e for e in s.tokens(rule)]
    self.assertEqual(2, len(r))
    self.assertEqual(r[0], Token(tokens[0]))
    self.assertEqual(r[1], Token(tokens[1]))

    tokens_to_compare = (["ABC", 0, 72, Token.NE_LOC],)
    rule = TokenSet.IntersectedTokens(tokens_to_compare)
    r = [e for e in s.tokens(rule)]
    self.assertEqual(2, len(r))
    self.assertEqual(r[0], Token(tokens[0]))
    self.assertEqual(r[1], Token(tokens[1]))

    rule = TokenSet.NOT(TokenSet.IntersectedTokens(tokens_to_compare))
    r = [e for e in s.tokens(rule)]
    self.assertEqual(2, len(r))
    self.assertEqual(r[0], tokens[2])
    self.assertEqual(r[1], tokens[3])

    tokens_to_compare = (["ABC", 33, 41, Token.NE_LOC],)
    rule = TokenSet.IntersectedTokens(tokens_to_compare)
    r = [e for e in s.tokens(rule)]
    self.assertEqual(2, len(r))
    self.assertEqual(r[0], tokens[1])
    self.assertEqual(r[1], tokens[3])
def validate_corpus_tokens(self, corpus, tokens):
    assert len(tokens) > 0
    ts = TokenSet(corpus.ne_tokens())
    r = [e for e in ts.tokens(TokenSet.EqualByPositionTokens(tokens))]
    'mustn', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once',
    'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over',
    'own', 're', 's', 'same', 'shan', 'she', 'should', 'shouldn', 'so', 'some',
    'such', 't', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves',
    'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too',
    'under', 'until', 'up', 've', 'very', 'was', 'wasn', 'we', 'were', 'weren',
    'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'with',
    'won', 'would', 'wouldn', 'you', 'your', 'yours', 'yourself', 'yourselves']


class SkipStopWords(TokenSet.Match):
    # filters out English stop words and, by default, single latin letters
    def __init__(self, stop_words=ENGLISH_STOP_WORDS,
                 mask=re.compile(r"^[a-z]$", re.U | re.I)):
        assert stop_words
        self.stop_words, self.mask = stop_words, mask

    def match(self, t):
        r = (t[0].lower() in self.stop_words) or (self.mask and self.mask.match(t[0]))
        return not r


if __name__ == "__main__":
    from gravity.tae.tokenizer import WordTokenizer

    txt = "Andrei cannot drive a car if he has more than 0.5 per mille of alcohol!"
    for t in TokenSet(WordTokenizer()(txt)).tokens(SkipStopWords()):
        print t