Пример #1
0
    def test_merge_subsentences(self):
        """Check that merge_subsentences produces one output row per mention.

        The fixture describes two sentences: sentence 0 has a single
        sub-sentence whose third alias slot is padding, and sentence 1 is split
        across two sub-sentences that both list alias position 2. The expected
        result holds 7 rows; slots whose ``final_loss_true`` is -1 are absent
        from the expected rows.
        """
        num_examples = 3
        total_num_mentions = 7
        M = 3
        K = 2
        hidden_size = 2

        # Record layout of the per-subsentence input file (array fields are flattened).
        storage_type_full = np.dtype([
            ('M', int), ('K', int), ('hidden_size', int),
            ('sent_idx', int), ('subsent_idx', int),
            ('alias_list_pos', int, M), ('entity_emb', float, M * hidden_size),
            ('final_loss_true', int, M), ('final_loss_pred', int, M),
            ('final_loss_prob', float, M), ('final_loss_cand_probs', float, M * K)])

        # Record layout of the per-mention output file.
        storage_type_merged = np.dtype([
            ('hidden_size', int),
            ('sent_idx', int),
            ('alias_list_pos', int),
            ('entity_emb', float, hidden_size),
            ('final_loss_pred', int),
            ('final_loss_prob', float),
            ('final_loss_cand_probs', float, K)])

        # Expected entity embedding of every merged mention, in output order.
        expected_entity_emb = np.array([[0, 1],
                                        [2, 3],
                                        [4, 5],
                                        [6, 7],
                                        [10, 11],
                                        [12, 13],
                                        [14, 15]], dtype=float)

        # data file -- just needs aliases and sentence indices
        data = [{'aliases': ['a', 'b'], 'sent_idx_unq': 0},
                {'aliases': ['c', 'd', 'e', 'f', 'g'], 'sent_idx_unq': 1}]

        # delete=False keeps the path alive after the handle is closed; the
        # handle is released immediately and the file is removed in ``finally``.
        with tempfile.NamedTemporaryFile(delete=False) as data_handle:
            temp_file = data_handle.name

        try:
            with jsonlines.open(temp_file, 'w') as f:
                for row in data:
                    f.write(row)

            # The context managers delete the embedding files even when an
            # assertion below fails.
            with tempfile.NamedTemporaryFile() as test_full_emb_file, \
                    tempfile.NamedTemporaryFile() as test_merged_emb_file:
                full_emb = np.memmap(test_full_emb_file.name, dtype=storage_type_full,
                                     mode='w+', shape=(num_examples,))
                full_emb['hidden_size'] = hidden_size
                full_emb['M'] = M
                full_emb['K'] = K

                # first sentence: one subsentence, last alias is padded
                full_emb[0]['sent_idx'] = 0
                full_emb[0]['subsent_idx'] = 0
                full_emb[0]['alias_list_pos'] = np.array([0, 1, -1])
                full_emb[0]['final_loss_true'] = np.array([0, 1, -1])
                # entity embs are flattened
                full_emb[0]['entity_emb'] = np.array([0, 1, 2, 3, 0, 0])

                # second sentence, first subsentence
                full_emb[1]['sent_idx'] = 1
                full_emb[1]['subsent_idx'] = 0
                full_emb[1]['alias_list_pos'] = np.array([0, 1, 2])
                # last alias goes with next subsentence
                full_emb[1]['final_loss_true'] = np.array([1, 1, -1])
                full_emb[1]['entity_emb'] = np.array([4, 5, 6, 7, 8, 9])

                # second sentence, second subsentence
                full_emb[2]['sent_idx'] = 1
                full_emb[2]['subsent_idx'] = 1
                full_emb[2]['alias_list_pos'] = np.array([2, 3, 4])
                full_emb[2]['final_loss_true'] = np.array([1, 1, 1])
                full_emb[2]['entity_emb'] = np.array([10, 11, 12, 13, 14, 15])

                # Push the fixture to disk before other processes open the file.
                full_emb.flush()

                num_processes = 2
                eval_utils.merge_subsentences(
                    num_processes,
                    temp_file,
                    test_merged_emb_file.name,
                    storage_type_merged,
                    test_full_emb_file.name,
                    storage_type_full,
                    dump_embs=True)

                bootleg_merged_emb = np.memmap(test_merged_emb_file.name,
                                               dtype=storage_type_merged, mode='r')
                assert len(bootleg_merged_emb) == total_num_mentions
                for i, expected_row in enumerate(expected_entity_emb):
                    actual_row = bootleg_merged_emb[i]['entity_emb']
                    assert np.array_equal(actual_row, expected_row), (
                        f"mention {i}: expected {expected_row}, got {actual_row}")
        finally:
            if os.path.exists(temp_file):
                os.remove(temp_file)
Пример #2
0
    def test_merge_subsentences(self):
        """Check merge_subsentences output with one worker and with several.

        The fixture describes two sentences: sentence 0 has a single
        sub-sentence whose third alias slot is padding, and sentence 1 is split
        across two sub-sentences that both list alias position 2. The expected
        result holds 7 rows; slots whose ``final_loss_true`` is -1 are absent
        from the expected rows. The same check runs for ``num_processes`` of 1
        and of 5, reusing the same cache folder and output file.
        """
        num_examples = 3
        total_num_mentions = 7
        M = 3
        K = 2
        hidden_size = 2

        # Record layout of the per-subsentence input file (array fields are flattened).
        storage_type_full = np.dtype([
            ("M", int),
            ("K", int),
            ("hidden_size", int),
            ("sent_idx", int),
            ("subsent_idx", int),
            ("alias_list_pos", int, M),
            ("entity_emb", float, M * hidden_size),
            ("final_loss_true", int, M),
            ("final_loss_pred", int, M),
            ("final_loss_prob", float, M),
            ("final_loss_cand_probs", float, M * K),
        ])

        # Record layout of the per-mention output file.
        storage_type_merged = np.dtype([
            ("hidden_size", int),
            ("sent_idx", int),
            ("alias_list_pos", int),
            ("entity_emb", float, hidden_size),
            ("final_loss_pred", int),
            ("final_loss_prob", float),
            ("final_loss_cand_probs", float, K),
        ])

        # Expected entity embedding of every merged mention, in output order.
        expected_entity_emb = np.array(
            [[0, 1], [2, 3], [4, 5], [6, 7], [10, 11], [12, 13], [14, 15]],
            dtype=float,
        )

        # sentence data -- just needs aliases and sentence indices
        data = [
            {
                "aliases": ["a", "b"],
                "sent_idx_unq": 0
            },
            {
                "aliases": ["c", "d", "e", "f", "g"],
                "sent_idx_unq": 1
            },
        ]
        # Keys are string for trie
        sent_idx2num_mentions = {
            str(row["sent_idx_unq"]): len(row["aliases"])
            for row in data
        }

        # The context managers remove every temporary resource even when an
        # assertion below fails.
        with tempfile.NamedTemporaryFile() as test_full_emb_file, \
                tempfile.NamedTemporaryFile() as test_merged_emb_file, \
                tempfile.TemporaryDirectory() as cache_folder:
            full_emb = np.memmap(
                test_full_emb_file.name,
                dtype=storage_type_full,
                mode="w+",
                shape=(num_examples, ),
            )
            full_emb["hidden_size"] = hidden_size
            full_emb["M"] = M
            full_emb["K"] = K

            # first sentence: one subsentence, last alias is padded
            full_emb[0]["sent_idx"] = 0
            full_emb[0]["subsent_idx"] = 0
            full_emb[0]["alias_list_pos"] = np.array([0, 1, -1])
            full_emb[0]["final_loss_true"] = np.array([0, 1, -1])
            # entity embs are flattened
            full_emb[0]["entity_emb"] = np.array([0, 1, 2, 3, 0, 0])

            # second sentence, first subsentence
            full_emb[1]["sent_idx"] = 1
            full_emb[1]["subsent_idx"] = 0
            full_emb[1]["alias_list_pos"] = np.array([0, 1, 2])
            # last alias goes with next subsentence
            full_emb[1]["final_loss_true"] = np.array([1, 1, -1])
            full_emb[1]["entity_emb"] = np.array([4, 5, 6, 7, 8, 9])

            # second sentence, second subsentence
            full_emb[2]["sent_idx"] = 1
            full_emb[2]["subsent_idx"] = 1
            full_emb[2]["alias_list_pos"] = np.array([2, 3, 4])
            full_emb[2]["final_loss_true"] = np.array([1, 1, 1])
            full_emb[2]["entity_emb"] = np.array([10, 11, 12, 13, 14, 15])

            # Push the fixture to disk before other processes open the file.
            full_emb.flush()

            # Single process first, then with multiprocessing.
            for num_processes in (1, 5):
                eval_utils.merge_subsentences(
                    num_processes,
                    sent_idx2num_mentions,
                    cache_folder,
                    test_merged_emb_file.name,
                    storage_type_merged,
                    test_full_emb_file.name,
                    storage_type_full,
                    dump_embs=True,
                )
                bootleg_merged_emb = np.memmap(test_merged_emb_file.name,
                                               dtype=storage_type_merged,
                                               mode="r")
                assert len(bootleg_merged_emb) == total_num_mentions
                for i, expected_row in enumerate(expected_entity_emb):
                    actual_row = bootleg_merged_emb[i]["entity_emb"]
                    assert np.array_equal(actual_row, expected_row), (
                        f"num_processes={num_processes}, mention {i}: "
                        f"expected {expected_row}, got {actual_row}")