def blink_process(set_to_calculate):  # entries: (doc, mention, left context, right context, candidates)
    global feature_list
    try:
        # Build BLINK's expected input format from the raw tuples.
        data_to_link = [{
            "id": i,
            "label": "unknown",
            "label_id": -1,
            "context_left": entry[2].lower(),
            "mention": entry[1].lower(),
            "context_right": entry[3].lower(),
        } for i, entry in enumerate(set_to_calculate)]
        print(len(data_to_link))
        _, _, _, _, _, predictions, scores = main_dense.run(
            args, None, *models, test_data=data_to_link)
        # scores = scipy.special.softmax(scores)
        # predictions = {i: {pred: scores[i][j] for j, pred in enumerate(prediction)}
        #                for i, prediction in enumerate(predictions)}
    except KeyboardInterrupt:
        raise
    except Exception:
        # Log the failing batch so it can be retried later.
        import traceback
        traceback.print_exc()
        with open(logname, 'a') as f:
            f.write(f'{set_to_calculate}\n')
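# A minimal sketch of how the module-level `args` and `models` used above can
# be initialized; it is not part of the snippet itself. The config keys follow
# BLINK's documented load_models() usage, and the paths are assumptions.
import argparse

import blink.main_dense as main_dense

models_path = "models/"  # hypothetical checkpoint directory
config = {
    "test_entities": None,
    "test_mentions": None,
    "interactive": False,
    "top_k": 10,
    "biencoder_model": models_path + "biencoder_wiki_large.bin",
    "biencoder_config": models_path + "biencoder_wiki_large.json",
    "entity_catalogue": models_path + "entity.jsonl",
    "entity_encoding": models_path + "all_entities_large.t7",
    "crossencoder_model": models_path + "crossencoder_wiki_large.bin",
    "crossencoder_config": models_path + "crossencoder_wiki_large.json",
    "fast": False,  # True skips the crossencoder for speed
    "output_path": models_path + "logs/",
}
args = argparse.Namespace(**config)
models = main_dense.load_models(args, logger=None)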
def generate_response(self, text, spans):
    """
    Generates a response for the API. Can be either ED-only or EL,
    meaning end-to-end.

    :return: list of tuples for each entity found.
    """
    if len(text) == 0:
        return []

    if len(spans) > 0:
        # ED: mention spans are given, only disambiguation is needed.
        processed = {API_DOC: [text, spans]}
        mentions_dataset, total_ment = self.mention_detection.format_spans(
            processed)
    else:
        # EL: no spans given, so run mention detection first.
        processed = {API_DOC: [text, spans]}
        mentions_dataset, total_ment = self.mention_detection.find_mentions(
            processed, self.tagger_ner)

    # Create the to-be-linked dataset in BLINK's input format.
    data_to_link = []
    temp_m = mentions_dataset[API_DOC]
    for i, m in enumerate(temp_m):
        # Use "ngram", which is basically the original mention
        # (without the preprocessing done in BLINK's code).
        temp = {
            "id": i,
            "label": "unknown",
            "label_id": -1,
            "context_left": m["context"][0].lower(),
            "mention": m["ngram"].lower(),
            "context_right": m["context"][1].lower(),
        }
        data_to_link.append(temp)

    _, _, _, _, _, predictions, scores = main_dense.run(
        self.argss, self.logger, *self.model, test_data=data_to_link)
    predictions = {
        API_DOC: [{
            "prediction": x[0].replace(" ", "_")
        } for x in predictions]
    }

    # Process the result.
    result = process_results(
        mentions_dataset,
        predictions,
        processed,
        include_offset=not (len(spans) > 0 or self.custom_ner),
    )

    # Singular document.
    if len(result) > 0:
        return [*result.values()][0]
    return []
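# Hypothetical invocation of generate_response(); `handler` stands in for the
# object this method lives on, and the (start, length) span format is an
# assumption. Both are illustrative rather than taken from the snippet above.
text = "Shakespeare wrote Julius Caesar."
el_result = handler.generate_response(text, spans=[])         # EL: detect mentions first
ed_result = handler.generate_response(text, spans=[(0, 11)])  # ED: link the given span only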
def predict_blink(self, winfo_list):
    assert self.models is not None
    # main_dense.run returns (biencoder_accuracy, recall_at,
    # crossencoder_normalized_accuracy, overall_unormalized_accuracy,
    # len(winfo_list), predictions, scores); the last two are lists of
    # length len(winfo_list).
    ret = main_dense.run(self.blink_config, self.blink_logger, *self.models,
                         test_data=winfo_list)
    pred_list, score_list = ret[5], ret[6]

    # Load the returned data back into the wiki info dictionaries.
    assert len(winfo_list) == len(pred_list) == len(score_list)
    for winfo, preds, scores in zip(winfo_list, pred_list, score_list):
        # Drop "List of ..." titles; note that scores[0] still refers to the
        # unfiltered top candidate.
        preds = [p for p in preds if not p.startswith('List of')]
        prediction = '"%s"' % unidecode(preds[0]) if preds else '-'
        prediction = prediction.replace(' ', '_')
        winfo['wiki_val'] = prediction
        winfo['score'] = float(scores[0])
    return winfo_list
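# For reference, each winfo entry is expected to already be in BLINK's
# data_to_link format (see the other snippets); the values and the `linker`
# instance below are made up for illustration.
winfo_list = [{
    "id": 0,
    "label": "unknown",
    "label_id": -1,
    "context_left": "the playwright",
    "mention": "shakespeare",
    "context_right": "was born in stratford-upon-avon",
}]
winfo_list = linker.predict_blink(winfo_list)
print(winfo_list[0]['wiki_val'], winfo_list[0]['score'])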
"crossencoder normalized accuracy", "overall unormalized accuracy", "support", ]) for dataset in DATASETS: logger.info(dataset["name"]) PARAMETERS["test_mentions"] = dataset["filename"] args = argparse.Namespace(**PARAMETERS) ( biencoder_accuracy, recall_at, crossencoder_normalized_accuracy, overall_unormalized_accuracy, num_datapoints, predictions, scores, ) = main_dense.run(args, logger, *models) table.add_row([ dataset["name"], round(biencoder_accuracy, 4), round(recall_at, 4), round(crossencoder_normalized_accuracy, 4), round(overall_unormalized_accuracy, 4), num_datapoints, ]) logger.info("\n{}".format(table))
args = argparse.Namespace(**config)
models = main_dense.load_models(args, logger=None)

data_to_link = [
    {
        "id": 0,
        "label": "unknown",
        "label_id": -1,
        "context_left": "".lower(),
        "mention": "Shakespeare".lower(),
        "context_right": "'s account of the Roman general Julius Caesar's murder by his friend Brutus is a meditation on duty.".lower(),
    },
    # {
    #     "id": 1,
    #     "label": "unknown",
    #     "label_id": -1,
    #     "context_left": "Shakespeare's account of the Roman general".lower(),
    #     "mention": "Julius Caesar".lower(),
    #     "context_right": "'s murder by his friend Brutus is a meditation on duty.".lower(),
    # },
]

_, _, _, _, _, predictions, scores = main_dense.run(
    args, None, *models, test_data=data_to_link)
scores = scipy.special.softmax(scores)
print(predictions, scores)
# Only one example is linked (the second is commented out), so indexing
# predictions[1] would raise an IndexError; report lengths per example instead.
for preds in predictions:
    print(len(preds))
print('Done')
def run(self):
    (
        biencoder_accuracy,
        recall_at,
        crossencoder_normalized_accuracy,
        overall_unormalized_accuracy,
        num_datapoints,
        predictions,
        scores,
    ) = main_dense.run(self.args, self.logger, *self.models,
                       test_data=self.test_data)

    # Aggregate multiple records for the same datapoint.
    print("aggregate multiple records for the same datapoint", flush=True)
    id_2_results = {}
    for r, p, s in zip(self.test_data, predictions, scores):
        if r["id"] not in id_2_results:
            id_2_results[r["id"]] = {"predictions": [], "scores": []}
        id_2_results[r["id"]]["predictions"].extend(p)
        id_2_results[r["id"]]["scores"].extend(s)

    all_doc_id = []
    all_query_id = []
    all_scores = []
    provenance = {}
    for id, results in id_2_results.items():
        element = {"id": str(id), "retrieved": []}

        # Merge predictions when multiple entities are found, keeping only
        # the highest-scoring occurrence of each distinct title.
        sorted_titles = []
        sorted_scores = []
        for y, x in sorted(zip(results["scores"], results["predictions"]),
                           reverse=True):
            if x not in sorted_titles:
                sorted_titles.append(x)
                sorted_scores.append(y)

        local_doc_id = []
        for e_title, score in zip(sorted_titles, sorted_scores):
            if e_title not in self.Wikipedia_title2id:
                print("WARNING: title: {} not recognized".format(e_title),
                      flush=True)
            else:
                """
                if e_title in self.cache_pages:
                    page = self.cache_pages[e_title]
                else:
                    page = self.ks.get_page_by_title(e_title)
                    self.cache_pages[e_title] = page
                wikipedia_id = page["wikipedia_id"]
                """
                wikipedia_id = self.Wikipedia_title2id[e_title]
                local_doc_id.append(wikipedia_id)
                element["retrieved"].append({
                    "score": str(score),
                    # "text": page["text"],
                    "wikipedia_title": str(e_title),
                    "wikipedia_id": str(wikipedia_id),
                })

        all_doc_id.append(local_doc_id)
        all_scores.append(sorted_scores)
        all_query_id.append(id)
        provenance[id] = element["retrieved"]

    return all_doc_id, all_scores, all_query_id, provenance
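# Hypothetical driver code for run(); `retriever` and its attributes are
# assumptions. all_doc_id[i] holds the Wikipedia ids retrieved for query
# all_query_id[i], all_scores[i] the deduplicated scores, and provenance
# maps each query id to its per-title records.
all_doc_id, all_scores, all_query_id, provenance = retriever.run()
for qid, records in provenance.items():
    if records:
        top = records[0]
        print(qid, top["wikipedia_title"], top["wikipedia_id"], top["score"])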
"entity_encoding": models_path + "all_entities_large.t7", "crossencoder_model": models_path + "crossencoder_wiki_large.bin", "crossencoder_config": models_path + "crossencoder_wiki_large.json", "top_k": 10, "show_url": False, "fast": args.fast, # set this to be true if speed is a concern "output_path": models_path + "logs/", # logging directory "faiss_index": None, #"flat", "index_path": models_path + "faiss_flat_index.pkl", } args_blink = argparse.Namespace(**config) models = main_dense.load_models(args_blink, logger=logger) _, _, _, _, _, predictions, scores, = main_dense.run(args_blink, logger, *models, test_data=for_blink, device=args.device) for s, pp in zip(for_blink, predictions): pp = [p for p in pp if not p.startswith('List of')] p = f'"{pp[0]}"' if pp else '-' p = p.replace(' ', '_') graph_n = s['graph_n'] triple_n = s['triple_n'] triples = [g for g in graphs[graph_n].triples] n, rel, w = triples[triple_n] triples[triple_n] = Triple(n, rel, p) g = Graph(triples) g.metadata = graphs[graph_n].metadata graphs[graph_n] = g
"context_left": "".lower(), "mention": "Shakespeare".lower(), "context_right": "'s account of the Roman general Julius Caesar's murder by his friend Brutus is a meditation on duty.".lower(), }, { "id": 1, "label": "unknown", "label_id": -1, "context_left": "Shakespeare's account of the Roman general".lower(), "mention": "Julius Caesar".lower(), "context_right": "'s murder by his friend Brutus is a meditation on duty.".lower(), }, ] # TODO: Add ED ( _, _, _, _, _, predictions, scores, ) = main_dense.run(args, logger, *models, test_data=data_to_link) print("============") print(predictions) print("---") print(scores)