Пример #1
0
    def test_multiprocess_type_data(self):
        """Check ``gold_type_id`` labels when the dataset is built with two threads.

        Reference data the expected values are derived from (presumably the
        contents of the fixtures prepared outside this method -- confirm
        against the test setup).

        ENTITY SYMBOLS
        {
          "multi word alias2":[["Q2",5.0],["Q1",3.0],["Q4",2.0]],
          "alias1":[["Q1",10.0],["Q4",6.0]],
          "alias3":[["Q1",30.0]],
          "alias4":[["Q4",20.0],["Q3",15.0],["Q2",1.0]]
        }
        TYPE LABELS
        {
          "Q1": [0, 1],
          "Q2": [2],
          "Q3": [],
          "Q4": [1]
        }
        """
        seq_len_limit = 7
        alias_limit = 4
        self.args.data_config.max_aliases = alias_limit
        self.args.data_config.max_seq_len = seq_len_limit

        # Two sentences that are identical except for their unique index.
        examples = [
            {
                "aliases": ["alias1", "multi word alias2"],
                "qids": ["Q1", "Q4"],
                "sent_idx_unq": sent_idx,
                "sentence": "alias1 or multi word alias2",
                "spans": [[0, 1], [2, 5]],
                "gold": [True, True],
            }
            for sent_idx in (0, 1)
        ]

        # One label per alias slot; the two real aliases are followed by -1 in
        # the unused slots (row width equals the max_aliases setting above).
        # NOTE(review): the "+ 1" looks like an offset applied to the first
        # type id of each gold QID -- confirm against BootlegDataset.
        per_sentence_labels = [0 + 1, 1 + 1, -1, -1]
        expected_type_ids = torch.tensor(
            [per_sentence_labels, per_sentence_labels]
        )

        utils.write_jsonl(self.temp_file_name, examples)

        built = BootlegDataset(
            self.args,
            name="Bootleg_test",
            dataset=self.temp_file_name,
            use_weak_label=True,
            tokenizer=self.tokenizer,
            entity_symbols=self.entity_symbols,
            dataset_threads=2,
            split="train",
            is_bert=True,
        )

        # Both rows carry the same labels, so the order in which the two
        # sentences end up in the dataset does not affect the comparison.
        actual_type_ids = built.Y_dict["gold_type_id"]
        assert torch.equal(expected_type_ids, actual_type_ids)
Пример #2
0
    def test_subsent_data(self):
        """Check ``gold_type_id`` labels for a sentence split into subsentences.

        Reference data the expected values are derived from (presumably the
        contents of the fixtures prepared outside this method -- confirm
        against the test setup).

        ENTITY SYMBOLS
        {
          "multi word alias2":[["Q2",5.0],["Q1",3.0],["Q4",2.0]],
          "alias1":[["Q1",10.0],["Q4",6.0]],
          "alias3":[["Q1",30.0]],
          "alias4":[["Q4",20.0],["Q3",15.0],["Q2",1.0]]
        }
        TYPE LABELS
        {
          "Q1": [0, 1],
          "Q2": [2],
          "Q3": [],
          "Q4": [1]
        }
        """
        seq_len_limit = 7
        alias_limit = 4
        self.args.data_config.max_aliases = alias_limit
        self.args.data_config.max_seq_len = seq_len_limit

        # The two aliases sit at opposite ends of a sentence longer than the
        # sequence limit, so the sentence is expected to be split into two
        # subsentences with one alias each; the type labels should follow.
        long_sentence = {
            "aliases": ["alias3", "alias4"],
            "qids": ["Q1", "Q2"],
            "sent_idx_unq": 0,
            "sentence": "alias3 cat cat cat cat cat cat alias4",
            "spans": [[0, 1], [7, 8]],
            "gold": [True, True],
        }

        # One row per expected subsentence: a single labelled alias followed
        # by -1 in the remaining slots.
        first_subsentence = [0 + 1, -1, -1, -1]
        second_subsentence = [2 + 1, -1, -1, -1]
        expected_type_ids = torch.tensor(
            [first_subsentence, second_subsentence]
        )

        utils.write_jsonl(self.temp_file_name, [long_sentence])

        built = BootlegDataset(
            self.args,
            name="Bootleg_test",
            dataset=self.temp_file_name,
            use_weak_label=True,
            tokenizer=self.tokenizer,
            entity_symbols=self.entity_symbols,
            dataset_threads=1,
            split="train",
            is_bert=True,
        )

        actual_type_ids = built.Y_dict["gold_type_id"]
        assert torch.equal(expected_type_ids, actual_type_ids)
Пример #3
0
    def test_masked_aliases(self):
        """Check ``gold_type_id`` labels with masked aliases for train and dev.

        The same three-alias sentence is loaded once with ``split="train"``
        and once with ``split="dev"``; only the expected labels differ.

        Reference data the expected values are derived from (presumably the
        contents of the fixtures prepared outside this method -- confirm
        against the test setup).

        ENTITY SYMBOLS
        {
          "multi word alias2":[["Q2",5.0],["Q1",3.0],["Q4",2.0]],
          "alias1":[["Q1",10.0],["Q4",6.0]],
          "alias3":[["Q1",30.0]],
          "alias4":[["Q4",20.0],["Q3",15.0],["Q2",1.0]]
        }
        TYPE LABELS
        {
          "Q1": [0, 1],
          "Q2": [2],
          "Q3": [],
          "Q4": [1]
        }
        """

        def gold_type_ids_for(split):
            # Configure the limits, write a fresh copy of the sentence to the
            # temp file, build the dataset for ``split`` and return its labels.
            seq_len_limit = 7
            alias_limit = 2
            self.args.data_config.max_aliases = alias_limit
            self.args.data_config.max_seq_len = seq_len_limit

            # Three aliases with a limit of two per example; only the first
            # alias is flagged as gold.
            record = {
                "aliases": ["alias3", "alias4", "alias3"],
                "qids": ["Q1", "Q4", "Q1"],
                "sent_idx_unq": 0,
                "sentence": "alias3 alias4 alias3",
                "spans": [[0, 1], [1, 2], [2, 3]],
                "gold": [True, False, False],
            }
            utils.write_jsonl(self.temp_file_name, [record])

            built = BootlegDataset(
                self.args,
                name="Bootleg_test",
                dataset=self.temp_file_name,
                use_weak_label=True,
                tokenizer=self.tokenizer,
                entity_symbols=self.entity_symbols,
                dataset_threads=1,
                split=split,
                is_bert=True,
            )
            return built.Y_dict["gold_type_id"]

        # Case 1 (train): the sentence is split into two examples of two
        # aliases each, with the first alias of the second example masked out
        # (-1). The gold flags are ignored for the train split, so every
        # unmasked alias keeps its type label.
        expected_train = torch.tensor([[0 + 1, 1 + 1], [-1, 0 + 1]])
        assert torch.equal(expected_train, gold_type_ids_for("train"))

        # Case 2 (dev): the subsentences stay the same, but aliases that are
        # not flagged as gold are labelled -1 as well.
        expected_dev = torch.tensor([[0 + 1, -1], [-1, -1]])
        assert torch.equal(expected_dev, gold_type_ids_for("dev"))