def test_multiprocess_type_data(self):
    """Check the gold type ids of a dataset built with ``dataset_threads=2``.

    Fixture data referenced by this test (loaded outside this method):

    ENTITY SYMBOLS
    {
      "multi word alias2":[["Q2",5.0],["Q1",3.0],["Q4",2.0]],
      "alias1":[["Q1",10.0],["Q4",6.0]],
      "alias3":[["Q1",30.0]],
      "alias4":[["Q4",20.0],["Q3",15.0],["Q2",1.0]]
    }
    TYPE LABELS
    {
      "Q1": [0, 1],
      "Q2": [2],
      "Q3": [],
      "Q4": [1]
    }
    """
    self.args.data_config.max_aliases = 4
    self.args.data_config.max_seq_len = 7

    # Two sentences that are identical except for their unique sentence index.
    sentences = [
        {
            "aliases": ["alias1", "multi word alias2"],
            "qids": ["Q1", "Q4"],
            "sent_idx_unq": sent_idx,
            "sentence": "alias1 or multi word alias2",
            "spans": [[0, 1], [2, 5]],
            "gold": [True, True],
        }
        for sent_idx in range(2)
    ]

    # Expected ids are spelled as "<type id> + shift"; -1 occupies the two
    # alias slots (out of max_aliases=4) that the sentence does not use.
    shift, unused = 1, -1
    expected_row = [0 + shift, 1 + shift, unused, unused]
    expected_type_ids = torch.tensor([expected_row, expected_row])

    utils.write_jsonl(self.temp_file_name, sentences)
    dataset = BootlegDataset(
        self.args,
        name="Bootleg_test",
        dataset=self.temp_file_name,
        use_weak_label=True,
        tokenizer=self.tokenizer,
        entity_symbols=self.entity_symbols,
        dataset_threads=2,
        split="train",
        is_bert=True,
    )

    # Both sentences share the same gold labels, so the comparison holds
    # regardless of the order in which the rows end up in the dataset.
    assert torch.equal(expected_type_ids, dataset.Y_dict["gold_type_id"])
def test_subsent_data(self):
    """Check the gold type ids when one sentence is split into subsentences.

    Fixture data referenced by this test (loaded outside this method):

    ENTITY SYMBOLS
    {
      "multi word alias2":[["Q2",5.0],["Q1",3.0],["Q4",2.0]],
      "alias1":[["Q1",10.0],["Q4",6.0]],
      "alias3":[["Q1",30.0]],
      "alias4":[["Q4",20.0],["Q3",15.0],["Q2",1.0]]
    }
    TYPE LABELS
    {
      "Q1": [0, 1],
      "Q2": [2],
      "Q3": [],
      "Q4": [1]
    }
    """
    # Test 1: the sentence is long and its two aliases are far apart, so it
    # is expected to be split into two subsentences; the type ids should
    # follow that split (one row per subsentence).
    self.args.data_config.max_aliases = 4
    self.args.data_config.max_seq_len = 7

    long_sentence = {
        "aliases": ["alias3", "alias4"],
        "qids": ["Q1", "Q2"],
        "sent_idx_unq": 0,
        "sentence": "alias3 cat cat cat cat cat cat alias4",
        "spans": [[0, 1], [7, 8]],
        "gold": [True, True],
    }

    # Expected ids are spelled as "<type id> + shift"; -1 occupies the
    # remaining alias slots of each row.
    shift, unused = 1, -1
    expected_type_ids = torch.tensor(
        [
            [0 + shift, unused, unused, unused],
            [2 + shift, unused, unused, unused],
        ]
    )

    utils.write_jsonl(self.temp_file_name, [long_sentence])
    dataset = BootlegDataset(
        self.args,
        name="Bootleg_test",
        dataset=self.temp_file_name,
        use_weak_label=True,
        tokenizer=self.tokenizer,
        entity_symbols=self.entity_symbols,
        dataset_threads=1,
        split="train",
        is_bert=True,
    )

    assert torch.equal(expected_type_ids, dataset.Y_dict["gold_type_id"])
def test_masked_aliases(self):
    """Check the gold type ids for masked aliases under train and dev splits.

    Fixture data referenced by this test (loaded outside this method):

    ENTITY SYMBOLS
    {
      "multi word alias2":[["Q2",5.0],["Q1",3.0],["Q4",2.0]],
      "alias1":[["Q1",10.0],["Q4",6.0]],
      "alias3":[["Q1",30.0]],
      "alias4":[["Q4",20.0],["Q3",15.0],["Q2",1.0]]
    }
    TYPE LABELS
    {
      "Q1": [0, 1],
      "Q2": [2],
      "Q3": [],
      "Q4": [1]
    }
    """
    # Expected ids are spelled as "<type id> + shift"; -1 marks a masked
    # (or otherwise ignored) alias slot.
    shift, masked = 1, -1

    def gold_type_ids_for(split):
        """Write the shared three-alias sentence, build the dataset for
        ``split`` and return its ``gold_type_id`` labels."""
        self.args.data_config.max_aliases = 2
        self.args.data_config.max_seq_len = 7
        utils.write_jsonl(
            self.temp_file_name,
            [
                {
                    "aliases": ["alias3", "alias4", "alias3"],
                    "qids": ["Q1", "Q4", "Q1"],
                    "sent_idx_unq": 0,
                    "sentence": "alias3 alias4 alias3",
                    "spans": [[0, 1], [1, 2], [2, 3]],
                    "gold": [True, False, False],
                }
            ],
        )
        built = BootlegDataset(
            self.args,
            name="Bootleg_test",
            dataset=self.temp_file_name,
            use_weak_label=True,
            tokenizer=self.tokenizer,
            entity_symbols=self.entity_symbols,
            dataset_threads=1,
            split=split,
            is_bert=True,
        )
        return built.Y_dict["gold_type_id"]

    # Test 1: the sentence gets split into two parts with two aliases each
    # (the first alias of the second part is masked out). The type ids
    # should follow this pattern. Since the split is train, the gold flags
    # are ignored as well.
    assert torch.equal(
        torch.tensor([[0 + shift, 1 + shift], [masked, 0 + shift]]),
        gold_type_ids_for("train"),
    )

    # Test 2: with the split of "dev", the subsentences should remain
    # unchanged, but the entries for aliases whose gold flag is False
    # should be -1 in Y_dict.
    assert torch.equal(
        torch.tensor([[0 + shift, masked], [masked, masked]]),
        gold_type_ids_for("dev"),
    )