def get_mids_by_surface(): surface = request.args.get('surface').strip() print '[get_mid_by_surface]' mids = DBManager.get_candidate_entities(surface, 0.1) print mids res = { 'candidates': '<br>'.join('%s %s' % (m[0], m[1]) for m in mids) } return json.dumps(res)
def gen_unsolved_sentence(fn_in, fn_out): avg_candidate = 0 num = 0 with open(fn_in) as fin, open(fn_out, 'w') as fout: for line in fin: data = json.loads(line, encoding='utf8') gold_entity = data['entity'] surfaces = data['predict'].split("\t") candidates = dict() for surface in surfaces: surface = surface.lower().replace(' ', '') res = DBManager.get_candidate_entities(surface, 0.1) for e in res: if e[0] not in candidates or e[1] > candidates[e[0]]: candidates[e[0]] = e[1] if len(candidates) == 0: sentence = [w.split('|')[0] for w in data['tag_res'].split()][1:-1] if 'pos' in data: all_pos = data['pos'][1:-1] else: all_pos = None # use ngram of surface for surface in surfaces: surface = surface.lower().split() if len(surface) == 0: continue start = find_word(sentence, surface) if start == -1: continue l = len(surface) found = False for j in range(l, 0, -1): # if found: # break for i in range(l - j + 1): if 'pos' not in data or is_entity_occurrence( all_pos, sentence, start + i, start + i + j): s = ''.join(surface[i:i + j]) res = DBManager.get_candidate_entities(s, 0.1) for e in res: if e[1] < 1.1 and ( e[0] not in candidates or e[1] > candidates[e[0]]): candidates[e[0]] = e[1] found = len(res) > 0 # candidates = sorted(candidates.items(), key=lambda x:x[1], reverse=True)[:20] candidates = candidates.items() correct = False for e, _ in candidates: if e == gold_entity: avg_candidate += len(candidates) num += 1 correct = True break print >> fout, ("%s\t%s" % (gold_entity, ' '.join( [c for c, _ in candidates]))).encode('utf8') if not correct: # print >> fout, line.strip(), candidates print surfaces, data['gold'].split('\t'), gold_entity # else: # print line.strip() # print candidates print "%s find correct topic entity" % num print "average number of candidate entities: %s" % (avg_candidate * 1.0 / num)