Пример #1
0
def blink_process(set_to_calculate):
    """Run BLINK entity linking over one batch of mention entries.

    Each entry is indexed positionally: entry[1] = mention surface form,
    entry[2] = left context, entry[3] = right context (entry[0] appears to
    be the document id -- confirm against the caller).

    Failures other than KeyboardInterrupt are printed and the offending
    batch is appended to *logname* so it can be retried later.
    """
    try:
        # Build the query records in the shape main_dense.run expects.
        data_to_link = [{
            "id": i,
            "label": "unknown",
            "label_id": -1,
            "context_left": entry[2].lower(),
            "mention": entry[1].lower(),
            "context_right": entry[3].lower(),
        } for i, entry in enumerate(set_to_calculate)]
        print(len(data_to_link))
        _, _, _, _, _, predictions, scores = main_dense.run(
            args, None, *models, test_data=data_to_link)
    except KeyboardInterrupt:
        # Bare raise preserves the original traceback (the old code
        # re-raised a fresh KeyboardInterrupt and lost it).
        raise
    except Exception:
        # Best-effort batch processing: narrow from a bare `except:` so
        # SystemExit and friends are no longer swallowed; log and continue.
        import traceback
        traceback.print_exc()
        with open(logname, 'a') as f:
            f.write(f'{set_to_calculate}\n')
Пример #2
0
        def generate_response(self, text, spans):
            """
            Generates response for API. Can be either ED only or EL, meaning end-to-end.

            If *spans* is non-empty, only entity disambiguation (ED) is run on
            the given spans; otherwise mentions are detected first (EL).

            :param text: raw document text.
            :param spans: list of pre-annotated mention spans (may be empty).
            :return: list of tuples for each entity found.
            """
            if not text:
                return []

            # Both branches operate on the same single-document payload.
            processed = {API_DOC: [text, spans]}
            if spans:
                # ED: spans are supplied, just format them.
                mentions_dataset, total_ment = self.mention_detection.format_spans(
                    processed)
            else:
                # EL: detect mentions with the NER tagger first.
                mentions_dataset, total_ment = self.mention_detection.find_mentions(
                    processed, self.tagger_ner)

            # Create to-be-linked dataset. "ngram" is the original mention,
            # without BLINK-style preprocessing.
            data_to_link = [
                {
                    "id": i,
                    "label": "unknown",
                    "label_id": -1,
                    "context_left": m["context"][0].lower(),
                    "mention": m["ngram"].lower(),
                    "context_right": m["context"][1].lower(),
                }
                for i, m in enumerate(mentions_dataset[API_DOC])
            ]
            _, _, _, _, _, predictions, scores = main_dense.run(
                self.argss, self.logger, *self.model, test_data=data_to_link)

            # Normalize titles to Wikipedia underscore form.
            predictions = {
                API_DOC: [{
                    "prediction": x[0].replace(" ", "_")
                } for x in predictions]
            }
            # Process result. Offsets are only included for end-to-end runs
            # without a custom NER.
            result = process_results(
                mentions_dataset,
                predictions,
                processed,
                include_offset=not (spans or self.custom_ner),
            )

            # Singular document: return the entity list of the only document.
            if result:
                return [*result.values()][0]

            return []
Пример #3
0
 def predict_blink(self, winfo_list):
     """Run BLINK over *winfo_list* and write the top prediction into each dict.

     Each wiki-info dict gains a 'wiki_val' (quoted, underscored title or '-')
     and a 'score' (float score of the top raw candidate). Returns the list.
     """
     assert self.models is not None
     # main_dense.run returns (biencoder_accuracy, recall_at,
     # crossencoder_normalized_accuracy, overall_unormalized_accuracy,
     # support, predictions, scores) -- only the last two are needed here.
     results = main_dense.run(self.blink_config, self.blink_logger, *self.models, test_data=winfo_list)
     pred_list, score_list = results[5], results[6]
     # One prediction list and one score list per input wiki-info dict.
     assert len(winfo_list) == len(pred_list) == len(score_list)
     for winfo, candidates, cand_scores in zip(winfo_list, pred_list, score_list):
         # Drop Wikipedia "List of ..." disambiguation-style pages.
         usable = [c for c in candidates if not c.startswith('List of')]
         if usable:
             chosen = '"%s"' % unidecode(usable[0])
         else:
             chosen = '-'
         winfo['wiki_val'] = chosen.replace(' ', '_')
         # NOTE(review): cand_scores[0] scores the *unfiltered* top candidate,
         # which may not correspond to usable[0] -- confirm this is intended.
         winfo['score'] = float(cand_scores[0])
     return winfo_list
Пример #4
0
    "crossencoder normalized accuracy",
    "overall unormalized accuracy",
    "support",
])

# Evaluate every configured dataset with the loaded BLINK models and
# collect the accuracy metrics into the report table.
for dataset in DATASETS:
    logger.info(dataset["name"])
    PARAMETERS["test_mentions"] = dataset["filename"]

    run_args = argparse.Namespace(**PARAMETERS)
    (biencoder_accuracy,
     recall_at,
     crossencoder_normalized_accuracy,
     overall_unormalized_accuracy,
     num_datapoints,
     _predictions,
     _scores) = main_dense.run(run_args, logger, *models)

    # One row per dataset: name, four metrics rounded to 4 places, support.
    metrics = [
        round(biencoder_accuracy, 4),
        round(recall_at, 4),
        round(crossencoder_normalized_accuracy, 4),
        round(overall_unormalized_accuracy, 4),
        num_datapoints,
    ]
    table.add_row([dataset["name"]] + metrics)

logger.info("\n{}".format(table))
Пример #5
0
args = argparse.Namespace(**config)

models = main_dense.load_models(args, logger=None)

# One demo mention; BLINK expects lower-cased context and mention text.
data_to_link = [
    {
        "id": 0,
        "label": "unknown",
        "label_id": -1,
        "context_left": "".lower(),
        "mention": "Shakespeare".lower(),
        "context_right": "'s account of the Roman general Julius Caesar's murder by his friend Brutus is a meditation on duty.".lower(),
    },
]

_, _, _, _, _, predictions, scores = main_dense.run(args, None, *models, test_data=data_to_link)
# Turn the raw candidate scores into probabilities.
scores = scipy.special.softmax(scores)
print(predictions, scores)
# Print the candidate count per mention. The original hard-coded
# predictions[1], which raises IndexError with a single-entry batch.
for mention_predictions in predictions:
    print(len(mention_predictions))
print('Done')


Пример #6
0
    def run(self):
        """Run BLINK retrieval on self.test_data and aggregate per datapoint.

        Multiple records may share the same "id"; their candidate lists are
        merged, deduplicated by title, and sorted by descending score.

        :return: (all_doc_id, all_scores, all_query_id, provenance) where
            provenance maps each query id to its list of retrieved pages.
        """
        (
            biencoder_accuracy,
            recall_at,
            crossencoder_normalized_accuracy,
            overall_unormalized_accuracy,
            num_datapoints,
            predictions,
            scores,
        ) = main_dense.run(self.args,
                           self.logger,
                           *self.models,
                           test_data=self.test_data)

        # Aggregate multiple records for the same datapoint.
        print("aggregate multiple records for the same datapoint", flush=True)
        id_2_results = {}
        for record, preds, pred_scores in zip(self.test_data, predictions, scores):
            # setdefault replaces the membership-test-then-insert pattern.
            bucket = id_2_results.setdefault(
                record["id"], {"predictions": [], "scores": []})
            bucket["predictions"].extend(preds)
            bucket["scores"].extend(pred_scores)

        all_doc_id = []
        all_query_id = []
        all_scores = []

        provenance = {}

        # `query_id` renamed from `id`, which shadowed the builtin.
        for query_id, results in id_2_results.items():

            element = {"id": str(query_id), "retrieved": []}

            # Merge predictions when multiple entities are found: sort by
            # descending score and keep only the first (best) score per title.
            sorted_titles = []
            sorted_scores = []
            for score, title in sorted(zip(results["scores"],
                                           results["predictions"]),
                                       reverse=True):
                if title not in sorted_titles:
                    sorted_titles.append(title)
                    sorted_scores.append(score)

            local_doc_id = []
            for e_title, score in zip(sorted_titles, sorted_scores):

                if e_title not in self.Wikipedia_title2id:
                    print("WARNING: title: {} not recognized".format(e_title),
                          flush=True)
                else:
                    wikipedia_id = self.Wikipedia_title2id[e_title]

                    local_doc_id.append(wikipedia_id)

                    element["retrieved"].append({
                        "score": str(score),
                        "wikipedia_title": str(e_title),
                        "wikipedia_id": str(wikipedia_id),
                    })
            all_doc_id.append(local_doc_id)
            all_scores.append(sorted_scores)
            all_query_id.append(query_id)
            provenance[query_id] = element["retrieved"]

        return all_doc_id, all_scores, all_query_id, provenance
Пример #7
0
        "entity_encoding": models_path + "all_entities_large.t7",
        "crossencoder_model": models_path + "crossencoder_wiki_large.bin",
        "crossencoder_config": models_path + "crossencoder_wiki_large.json",
        "top_k": 10,
        "show_url": False,
        "fast": args.fast,  # set this to be true if speed is a concern
        "output_path": models_path + "logs/",  # logging directory
        "faiss_index": None,  #"flat",
        "index_path": models_path + "faiss_flat_index.pkl",
    }

    # Wrap the BLINK config and load the full model stack once.
    args_blink = argparse.Namespace(**config)
    models = main_dense.load_models(args_blink, logger=logger)
    _, _, _, _, _, predictions, scores, = main_dense.run(args_blink,
                                                         logger,
                                                         *models,
                                                         test_data=for_blink,
                                                         device=args.device)

    # Write each mention's top surviving prediction back into the triple
    # of the graph it came from.
    for s, pp in zip(for_blink, predictions):
        # Drop Wikipedia "List of ..." pages (presumably never the intended
        # entity -- confirm); fall back to '-' if nothing survives.
        pp = [p for p in pp if not p.startswith('List of')]
        p = f'"{pp[0]}"' if pp else '-'
        p = p.replace(' ', '_')
        graph_n = s['graph_n']
        triple_n = s['triple_n']
        # Graphs appear to be treated as immutable: copy the triple list,
        # swap in the linked entity, and rebuild the whole Graph in place.
        triples = [g for g in graphs[graph_n].triples]
        n, rel, w = triples[triple_n]
        triples[triple_n] = Triple(n, rel, p)
        g = Graph(triples)
        g.metadata = graphs[graph_n].metadata
        graphs[graph_n] = g
Пример #8
0
        "context_left": "".lower(),
        "mention": "Shakespeare".lower(),
        "context_right": "'s account of the Roman general Julius Caesar's murder by his friend Brutus is a meditation on duty.".lower(),
    },
    {
        "id": 1,
        "label": "unknown",
        "label_id": -1,
        "context_left": "Shakespeare's account of the Roman general".lower(),
        "mention": "Julius Caesar".lower(),
        "context_right": "'s murder by his friend Brutus is a meditation on duty.".lower(),
    },
]

# TODO: Add ED
# Run BLINK; of the seven returned values only the candidate titles and
# their scores (positions 5 and 6) matter here.
ret = main_dense.run(args, logger, *models, test_data=data_to_link)
predictions, scores = ret[5], ret[6]


print("============")
print(predictions)
print("---")
print(scores)