def calc_types_distribution_for_completly_wrongly_recognized_entities(ner_id, lang='nl', model=None):
    """Print the entity-type distribution of the non-matching recognized tokens.

    Loads the recognized and the matched tokens for ``ner_id``, keeps the
    recognized tokens selected by
    ``TokenSet.NotMatchSet(TokenSet.MatchLocationSet(matched))`` and prints,
    for each entity type (LOC, PER, ORG, MISC), the number of such tokens and
    their integer percentage of the filtered total.

    NOTE(review): presumably the filter keeps recognized tokens whose location
    matches none of the matched tokens -- confirm against ``TokenSet``.

    Args:
        ner_id: Identifier forwarded to the token loaders; also shown in the
            report header.
        lang: Language code forwarded to the token loaders.
        model: Model argument forwarded to the token loaders.

    Returns:
        None. The report is written to standard output.
    """
    recognized = load_all_recognized_tokens(ner_id, lang, model)
    matched = load_all_matched_tokens(ner_id, lang, model)

    not_matched = TokenSet.NotMatchSet(TokenSet.MatchLocationSet(matched))
    wrong = TokenSet(TokenSet(recognized).tokens(not_matched))

    misc = wrong.tokens(Token.NE_MISC)
    loc = wrong.tokens(Token.NE_LOC)
    per = wrong.tokens(Token.NE_PER)
    org = wrong.tokens(Token.NE_ORG)

    total = len(wrong)

    def percent(count):
        # Floor division keeps the integer result on any interpreter; an
        # empty token set reports 0% instead of raising ZeroDivisionError.
        return (count * 100) // total if total else 0

    rows = (
        ("LOCATIONS : %4d %3d", loc),
        ("PERSONS : %4d %3d", per),
        ("ORGANIZATION : %4d %3d", org),
        ("MISC : %4d %3d", misc),
    )

    # Single-argument print(...) produces the same output as the print
    # statement on Python 2 and is also valid on Python 3.
    print("======== %s Recognized entities type distribution :" % ner_id)
    for row_format, tokens in rows:
        count = len(tokens)
        print(row_format % (count, percent(count)))
    print("=============================")
    print("AMOUNT : %4d 100" % total)