def __init__(self, dataset_dir, split_str, labels_path, transform_fns):
    """Build an exam-level dataset for one split.

    Args:
        dataset_dir: directory containing `<split_str>.csv`.
        split_str: split name (e.g. 'train'); selects the split CSV and is
            matched against each transform's `args['splits']`.
        labels_path: label CSV read with a two-row header.
        transform_fns: list of {'fn': name, 'args': {...}} transform specs.
    """
    split_path = path.join(dataset_dir, f'{split_str}.csv')
    self.split_df = pd.read_csv(split_path, index_col=0)
    self.split_str = split_str
    self.labels_df = pd.read_csv(labels_path, index_col=0, header=[0, 1])
    self.exam_ids = list(self.split_df.index.unique())
    self.transform_fns = transform_fns
    # Flag, not the transform itself: is a 'shuffle' transform configured?
    self.shuffle_transform = any(t['fn'] == 'shuffle' for t in transform_fns)

    self.instance_transform = None
    for t in transform_fns:
        # only extract instances if asked to do so and specified for split
        if t['fn'] == 'extract_instance' and split_str in t['args']['splits']:
            self.instance_transform = t['args']
            logger.info(f"using instance extraction on {t['args']['splits']} splits")
            break

    # BUG FIX: compare against None with `is not`, not `!=`.
    if self.instance_transform is not None and self.instance_transform.get('instance_only', False):
        # only access exam_ids with instance level labels
        exam_ids = []
        for exam_id in self.exam_ids:
            rows = self.split_df.loc[exam_id]
            # .loc returns a Series when the index value is unique,
            # a DataFrame otherwise — handle both shapes.
            if isinstance(rows, pd.Series):
                if not np.isnan(rows['label.lv']):
                    exam_ids.append(exam_id)
            else:
                if not np.isnan(rows.iloc[0]['label.lv']):
                    exam_ids.append(exam_id)
        logger.info(f'using {len(exam_ids)} of {len(self.exam_ids)} exam_ids')
        self.exam_ids = exam_ids
    else:
        logger.info(f'using {len(self.exam_ids)} exam_ids')

    X_dict = {'exam_ids': []}
    Y_dict = {
        'primary': [],
        'primary_multiclass': [],
        '2normal_binary': []  # labels: control, 1,2 (normal) | 3,4 (abnormal)
    }
    # NOTE: the original enumerate index was unused; iterate directly.
    for exam_id in self.exam_ids:
        X_dict['exam_ids'].append(exam_id)
        y_dict = self.get_y(exam_id)
        for t, label in y_dict.items():
            Y_dict[t].append(label)
    Y_dict = {k: torch.from_numpy(np.array(v)) for k, v in Y_dict.items()}
    EmmentalDataset.__init__(self, 'cow-tus-dataset', X_dict=X_dict, Y_dict=Y_dict)
def create_dataloaders(task_name, dataset, batch_size, word2id, oov="~#OoV#~"):
    """Build train/valid/test EmmentalDataLoaders for a single task.

    Token sequences are mapped through `word2id`; tokens missing from the
    vocabulary fall back to the id of the `oov` token. Only the train split
    is shuffled.
    """
    oov_id = word2id[oov]
    loaders = []
    for split in ("train", "valid", "test"):
        split_x, split_y = dataset[split]
        encoded = []
        for seq in split_x:
            ids = [word2id.get(tok, oov_id) for tok in seq]
            encoded.append(torch.LongTensor(ids))
        split_x = encoded
        loaders.append(
            EmmentalDataLoader(
                task_to_label_dict={task_name: "label"},
                dataset=EmmentalDataset(
                    name=task_name,
                    X_dict={"feature": split_x},
                    Y_dict={"label": split_y},
                ),
                split=split,
                batch_size=batch_size,
                shuffle=(split == "train"),
            )
        )
        logger.info(
            f"Loaded {split} for {task_name} containing {len(split_x)} samples."
        )
    return loaders
def test_mixed_scheduler(caplog):
    """Unit test of mixed scheduler"""
    caplog.set_level(logging.INFO)

    emmental.Meta.init()

    # Two synthetic tasks with different sizes (2 vs 3 batches of 10).
    specs = [
        ("task1", np.random.rand(20, 2), torch.from_numpy(np.random.rand(20))),
        ("task2", np.random.rand(30, 3), torch.from_numpy(np.random.rand(30))),
    ]
    task1 = specs[0][0]
    task2 = specs[1][0]

    dataloaders = []
    for name, features, targets in specs:
        dataloaders.append(
            EmmentalDataLoader(
                task_to_label_dict={name: "label"},
                dataset=EmmentalDataset(
                    name=name, X_dict={"feature": features}, Y_dict={"label": targets}
                ),
                split="train",
                batch_size=10,
                shuffle=True,
            )
        )

    # Without fillup, batch count is limited by the smallest loader.
    scheduler = MixedScheduler()
    assert scheduler.get_num_batches(dataloaders) == 2
    names_slot0 = [b[0][-2] for b in scheduler.get_batches(dataloaders)]
    names_slot1 = [b[1][-2] for b in scheduler.get_batches(dataloaders)]
    assert names_slot0 == [task1, task1]
    assert names_slot1 == [task2, task2]

    # With fillup, the smaller loader is padded up to the largest.
    scheduler = MixedScheduler(fillup=True)
    assert scheduler.get_num_batches(dataloaders) == 3
    names_slot0 = [b[0][-2] for b in scheduler.get_batches(dataloaders)]
    names_slot1 = [b[1][-2] for b in scheduler.get_batches(dataloaders)]
    assert names_slot0 == [task1, task1, task1]
    assert names_slot1 == [task2, task2, task2]
def test_emmental_dataset(caplog):
    """Unit test of emmental dataset"""
    caplog.set_level(logging.INFO)

    # Five variable-length feature tensors and one scalar label each.
    feats1 = [torch.Tensor(list(range(1, n + 1))) for n in range(1, 6)]
    lab1 = torch.Tensor([0] * 5)

    dataset = EmmentalDataset(X_dict={"data1": feats1}, Y_dict={"label1": lab1}, name="new_data")

    # Indexing exposes the first feature/label pair unchanged.
    assert torch.equal(dataset[0][0]["data1"], feats1[0])
    assert torch.equal(dataset[0][1]["label1"], lab1[0])

    # Same lengths in reverse order for a second feature field.
    feats2 = [torch.Tensor(list(range(1, n + 1))) for n in range(5, 0, -1)]
    dataset.add_features(X_dict={"data2": feats2})

    # A freshly added feature is visible through indexing.
    assert torch.equal(dataset[0][0]["data2"], feats2[0])

    lab2 = torch.Tensor([1] * 5)
    dataset.add_labels(Y_dict={"label2": lab2})

    # A freshly added label is visible through indexing.
    assert torch.equal(dataset[0][1]["label2"], lab2[0])

    dataset.remove_label(label_name="label1")

    # A removed label disappears from Y_dict.
    assert "label1" not in dataset.Y_dict
def test_round_robin_scheduler(caplog):
    """Unit test of round robin scheduler."""
    caplog.set_level(logging.INFO)

    emmental.Meta.init()

    # Set random seed — the interleavings asserted below depend on it.
    set_random_seed(2)

    # Two synthetic tasks: task1 has 2 batches of 10, task2 has 3.
    task1 = "task1"
    x1 = np.random.rand(20, 2)
    y1 = torch.from_numpy(np.random.rand(20))

    task2 = "task2"
    x2 = np.random.rand(30, 3)
    y2 = torch.from_numpy(np.random.rand(30))

    dataloaders = [
        EmmentalDataLoader(
            task_to_label_dict={task_name: "label"},
            dataset=EmmentalDataset(
                name=task_name, X_dict={"feature": x}, Y_dict={"label": y}
            ),
            split="train",
            batch_size=10,
            shuffle=True,
        ) for task_name, x, y in [(task1, x1, y1), (task2, x2, y2)]
    ]

    # Without fillup: 2 + 3 = 5 batches total, order fixed by the seed above.
    scheduler = RoundRobinScheduler()

    assert scheduler.get_num_batches(dataloaders) == 5

    # batch_data[-2] carries the task name of each emitted batch.
    batch_task_names = [
        batch_data[-2] for batch_data in scheduler.get_batches(dataloaders)
    ]

    assert batch_task_names == [task2, task1, task2, task2, task1]

    # With fillup: the smaller task is padded to 3 batches, 6 total.
    scheduler = RoundRobinScheduler(fillup=True)

    assert scheduler.get_num_batches(dataloaders) == 6

    batch_task_names = [
        batch_data[-2] for batch_data in scheduler.get_batches(dataloaders)
    ]

    assert batch_task_names == [task2, task1, task2, task2, task1, task1]
def __init__(self, dataset_dir, split_str, labels_path, transform_fns):
    """Build a loop-level dataset for one split.

    Only rows whose loop-type-specific label column (`label.<loop_type>`)
    exists and is non-NaN are kept.

    Args:
        dataset_dir: directory containing `<split_str>.csv`.
        split_str: split name (e.g. 'train'); selects the split CSV.
        labels_path: label CSV read with a two-row header.
        transform_fns: list of transform specs, stored for later use.
    """
    split_path = path.join(dataset_dir, f'{split_str}.csv')
    self.split_df = pd.read_csv(split_path, index_col=0)
    self.labels_df = pd.read_csv(labels_path, index_col=0, header=[0, 1])
    self.loop_idxs = range(len(self.split_df))

    # use df.iloc because loop ids are not unique in the index
    loop_idxs = []
    for loop_idx in self.loop_idxs:
        row = self.split_df.iloc[loop_idx]
        loop_type = row['exdir.loop_type']
        label_key = f'label.{loop_type}'
        # Keep only loops that actually carry a label for their loop type.
        if label_key in row.keys() and not np.isnan(row[label_key]):
            loop_idxs.append(loop_idx)
    logger.info(f'using {len(loop_idxs)} of {len(self.loop_idxs)} loop_idxs')
    self.loop_idxs = loop_idxs
    self.transform_fns = transform_fns

    X_dict = {'loop_idxs': []}
    Y_dict = {
        'primary': [],
        'primary_multiclass': [],
        '2normal_binary': []  # labels: control, 1,2 (normal) | 3,4 (abnormal)
    }
    # NOTE: the original enumerate index was unused; iterate directly.
    for loop_idx in self.loop_idxs:
        X_dict['loop_idxs'].append(loop_idx)
        y_dict = self.get_y(loop_idx)
        for t, label in y_dict.items():
            Y_dict[t].append(label)
    Y_dict = {k: torch.from_numpy(np.array(v)) for k, v in Y_dict.items()}
    EmmentalDataset.__init__(self, 'cow-tus-dataset', X_dict=X_dict, Y_dict=Y_dict)
def wrapped_f(dataset):
    """Apply augmentation function `f` to each example of `dataset` and
    collect the surviving (non-None) augmented examples into a new
    EmmentalDataset."""
    X_dict = defaultdict(list)
    Y_dict = defaultdict(list)

    for x_dict, y_dict in dataset:
        # TODO: Consider making sure aug_x_dict is not x_dict!
        aug_x_dict, aug_y_dict = f(x_dict, y_dict)
        # Drop examples the augmentation function rejected.
        if aug_x_dict is None or aug_y_dict is None:
            continue
        for key, value in aug_x_dict.items():
            X_dict[key].append(value)
        for key, value in aug_y_dict.items():
            Y_dict[key].append(value)

    # Labels are stacked into tensors; features stay as python lists.
    for key, values in Y_dict.items():
        Y_dict[key] = list_to_tensor(values)

    aug_dataset = EmmentalDataset(name=f.__name__, X_dict=X_dict, Y_dict=Y_dict)

    logger.info(
        f"Total {len(aug_dataset)} augmented examples were created "
        f"from AF {f.__name__}")

    return aug_dataset
def test_emmental_dataloader(caplog):
    """Unit test of emmental dataloader."""
    caplog.set_level(logging.INFO)

    # Five variable-length feature tensors plus one scalar label each.
    x1 = [
        torch.Tensor([1]),
        torch.Tensor([1, 2]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3, 4, 5]),
    ]
    y1 = torch.Tensor([0, 0, 0, 0, 0])
    # Same lengths in reverse order for the second feature field.
    x2 = [
        torch.Tensor([1, 2, 3, 4, 5]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2]),
        torch.Tensor([1]),
    ]
    y2 = torch.Tensor([1, 1, 1, 1, 1])

    dataset = EmmentalDataset(
        X_dict={"data1": x1, "data2": x2},
        Y_dict={"label1": y1, "label2": y2},
        name="new_data",
    )

    dataloader1 = EmmentalDataLoader(
        task_to_label_dict={"task1": "label1"},
        dataset=dataset,
        split="train",
        batch_size=2,
    )
    x_batch, y_batch = next(iter(dataloader1))

    # Check if the dataloader is correctly constructed.
    assert dataloader1.task_to_label_dict == {"task1": "label1"}
    assert dataloader1.split == "train"
    # Shorter sequences are zero-padded to the batch max length.
    assert torch.equal(x_batch["data1"], torch.Tensor([[1, 0], [1, 2]]))
    assert torch.equal(
        x_batch["data2"], torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0]])
    )
    assert torch.equal(y_batch["label1"], torch.Tensor([0, 0]))
    assert torch.equal(y_batch["label2"], torch.Tensor([1, 1]))

    dataloader2 = EmmentalDataLoader(
        task_to_label_dict={"task2": "label2"},
        dataset=dataset,
        split="test",
        batch_size=3,
    )
    x_batch, y_batch = next(iter(dataloader2))

    # Check if the dataloader with different batch size is correctly constructed.
    assert dataloader2.task_to_label_dict == {"task2": "label2"}
    assert dataloader2.split == "test"
    assert torch.equal(
        x_batch["data1"], torch.Tensor([[1, 0, 0], [1, 2, 0], [1, 2, 3]])
    )
    assert torch.equal(
        x_batch["data2"],
        torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0], [1, 2, 3, 0, 0]]),
    )
    assert torch.equal(y_batch["label1"], torch.Tensor([0, 0, 0]))
    assert torch.equal(y_batch["label2"], torch.Tensor([1, 1, 1]))

    # Replace one label field in place on the shared dataset.
    y3 = [
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
    ]
    dataset.Y_dict["label2"] = y3

    x_batch, y_batch = next(iter(dataloader1))
    # Check dataloader is correctly updated with the updated dataset.
    assert torch.equal(
        x_batch["data2"], torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0]])
    )
    assert torch.equal(y_batch["label2"], torch.Tensor([[2], [2]]))

    x_batch, y_batch = next(iter(dataloader2))
    assert torch.equal(
        x_batch["data2"],
        torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0], [1, 2, 3, 0, 0]]),
    )
    assert torch.equal(y_batch["label2"], torch.Tensor([[2], [2], [2]]))
def load_data_from_db(postgres_db_name,
                      postgres_db_location,
                      label_dict,
                      char_dict=None,
                      clobber_label=True):
    """Load documents (and optional labels) from a postgres database.

    Args:
        postgres_db_name: database name, joined onto `postgres_db_location`
            to form the connection string.
        postgres_db_location: connection-string prefix.
        label_dict: mapping from document name to gold label.
        char_dict: optional character dictionary with a `lookup` method; when
            given, a per-document "emb" feature is added to the dataset.
        clobber_label: when True, keep every doc and label it -1; when False,
            keep only docs present in `label_dict` with their gold labels.

    Returns:
        EmmentalDataset named "HT" with "text"/"uid" fields (plus "emb" when
        `char_dict` is provided).
    """
    print(f"Loading data from db {postgres_db_name}")

    # Start DB connection
    conn_string = os.path.join(postgres_db_location, postgres_db_name)
    session = Meta.init(conn_string).Session()

    # Printing number of docs/sentences
    print("==============================")
    print(f"DB contents for {postgres_db_name}:")
    print(f"Number of documents: {session.query(Document).count()}")
    print("==============================")

    docs = session.query(Document).all()

    uid_field = []
    text_field = []
    label_field = []
    missed_ids = 0

    # Heuristic regex for location/age cues inside the posting HTML.
    term = r"([Ll]ocation:[\w\W]{1,200}</.{0,20}>|\W[cC]ity:[\w\W]{1,200}</.{0,20}>|\d\dyo\W|\d\d.{0,10}\Wyo\W|\d\d.{0,10}\Wold\W|\d\d.{0,10}\Wyoung\W|\Wage\W.{0,10}\d\d)"
    for doc in docs:
        if (doc.name in label_dict) or clobber_label:
            uid_field.append(doc.name)
            text_field.append(get_posting_html_fast(doc.text, term))
            if not clobber_label:
                label_field.append(label_dict[doc.name])
            else:
                label_field.append(-1)
        else:
            missed_ids += 1

    # Printing data stats
    print("==============================")
    print(f"Loaded {len(uid_field)} ids")
    print(f"Loaded {len(text_field)} text")
    print(f"Loaded {len(label_field)} labels")
    print(f"Missed {missed_ids} samples")

    X_dict = {"text": text_field, "uid": uid_field}
    Y_dict = {"label": torch.from_numpy(np.array(label_field))}

    dataset = EmmentalDataset(name="HT", X_dict=X_dict, Y_dict=Y_dict, uid="uid")

    # BUG FIX: `char_dict` defaults to None but was dereferenced
    # unconditionally; only build the "emb" feature when it is provided.
    if char_dict is not None:
        emb_field = []
        for i in range(len(dataset)):
            emb_field.append(
                torch.from_numpy(
                    np.array(list(map(char_dict.lookup, dataset[i][0]['text'])))))
        dataset.add_features({"emb": emb_field})

    return dataset
def test_emmental_dataset(caplog):
    """Unit test of emmental dataset."""
    caplog.set_level(logging.INFO)

    dirpath = "temp_test_data"

    Meta.reset()
    emmental.init(dirpath)

    # Five variable-length feature tensors and one scalar label each.
    x1 = [
        torch.Tensor([1]),
        torch.Tensor([1, 2]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3, 4, 5]),
    ]
    y1 = torch.Tensor([0, 0, 0, 0, 0])

    dataset = EmmentalDataset(X_dict={"data1": x1},
                              Y_dict={"label1": y1},
                              name="new_data")

    # Check if the dataset is correctly constructed.
    assert torch.equal(dataset[0][0]["data1"], x1[0])
    assert torch.equal(dataset[0][1]["label1"], y1[0])

    x2 = [
        torch.Tensor([1, 2, 3, 4, 5]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2]),
        torch.Tensor([1]),
    ]

    dataset.add_features(X_dict={"data2": x2})
    dataset.remove_feature("data2")
    # A removed feature disappears from X_dict.
    assert "data2" not in dataset.X_dict

    dataset.add_features(X_dict={"data2": x2})
    # Check add one more feature to dataset.
    assert torch.equal(dataset[0][0]["data2"], x2[0])

    y2 = torch.Tensor([1, 1, 1, 1, 1])
    dataset.add_labels(Y_dict={"label2": y2})
    # NOTE(review): passing a list of tensors (x2) instead of a single
    # stacked tensor is expected to raise — confirm against add_labels.
    with pytest.raises(ValueError):
        dataset.add_labels(Y_dict={"label2": x2})

    # Check add one more label to dataset.
    assert torch.equal(dataset[0][1]["label2"], y2[0])

    dataset.remove_label(label_name="label1")
    # Check remove one more label from dataset.
    assert "label1" not in dataset.Y_dict

    # NOTE(review): presumably raises because uid "ids" is not a key of
    # X_dict — confirm against the EmmentalDataset constructor.
    with pytest.raises(ValueError):
        dataset = EmmentalDataset(X_dict={"data1": x1},
                                  Y_dict={"label1": y1},
                                  name="new_data",
                                  uid="ids")

    dataset = EmmentalDataset(X_dict={"_uids_": x1},
                              Y_dict={"label1": y1},
                              name="new_data")

    # Dataset without Y_dict: indexing yields only the X side (no tuple).
    dataset = EmmentalDataset(X_dict={"data1": x1}, name="new_data")

    # Check if the dataset is correctly constructed.
    assert torch.equal(dataset[0]["data1"], x1[0])

    dataset.add_features(X_dict={"data2": x2})
    dataset.remove_feature("data2")
    assert "data2" not in dataset.X_dict

    dataset.add_features(X_dict={"data2": x2})
    # Check add one more feature to dataset.
    assert torch.equal(dataset[0]["data2"], x2[0])

    y2 = torch.Tensor([1, 1, 1, 1, 1])
    dataset.add_labels(Y_dict={"label2": y2})
    # Check add one more label to dataset; with labels present, indexing
    # returns an (X, Y) tuple again.
    assert torch.equal(dataset[0][1]["label2"], y2[0])

    shutil.rmtree(dirpath)
def parse(jsonl_path, tokenizer, uid, max_data_samples, max_sequence_length):
    """Parse a premise/hypothesis jsonl file into a BERT-ready EmmentalDataset.

    Args:
        jsonl_path: path to a jsonl file with one json object per line.
        tokenizer: BERT tokenizer exposing `tokenize` and
            `convert_tokens_to_ids`.
        uid: unused here; kept for a uniform parser signature.
        max_data_samples: if truthy, truncate to this many rows.
        max_sequence_length: maximum total length including [CLS]/[SEP]s.

    Returns:
        EmmentalDataset named "SuperGLUE" keyed by "uids".
    """
    logger.info(f"Loading data from {jsonl_path}.")
    # FIX: use a context manager so the file handle is closed.
    with open(jsonl_path, encoding="utf-8") as fh:
        rows = [json.loads(row) for row in fh]
    for i in range(2):
        logger.info(f"Sample {i}: {rows[i]}")

    # Truncate to max_data_samples
    if max_data_samples:
        rows = rows[:max_data_samples]
        logger.info(f"Truncating to {max_data_samples} samples.")

    uids = []        # unique ids
    sentence1s = []  # sentence1 (premise) text
    sentence2s = []  # sentence2 (hypothesis) text
    labels = []      # mapped label ids

    bert_token_ids = []
    bert_token_masks = []
    bert_token_segments = []

    # Maximum pre-truncation token length, for logging only.
    max_len = -1

    for row in rows:
        index = row["idx"]
        sentence1 = row["premise"]
        sentence2 = row["hypothesis"]
        # Unlabeled (test) rows default to "entailment" as a placeholder.
        label = row["label"] if "label" in row else "entailment"

        uids.append(index)
        sentence1s.append(sentence1)
        sentence2s.append(sentence2)
        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        # Tokenize sentences
        sent1_tokens = tokenizer.tokenize(sentence1)
        sent2_tokens = tokenizer.tokenize(sentence2)

        if len(sent1_tokens) + len(sent2_tokens) > max_len:
            max_len = len(sent1_tokens) + len(sent2_tokens)

        # Trim the longer sentence until the pair fits.
        while True:
            total_length = len(sent1_tokens) + len(sent2_tokens)
            # Account for [CLS], [SEP], [SEP] with "- 3"
            if total_length <= max_sequence_length - 3:
                break
            if len(sent1_tokens) > len(sent2_tokens):
                sent1_tokens.pop()
            else:
                sent2_tokens.pop()

        # Convert to BERT manner
        tokens = ["[CLS]"] + sent1_tokens + ["[SEP]"]
        token_segments = [0] * len(tokens)
        tokens += sent2_tokens + ["[SEP]"]
        token_segments += [1] * (len(sent2_tokens) + 1)

        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        # Generate mask where 1 for real tokens and 0 for padding tokens
        token_masks = [1] * len(token_ids)

        bert_token_ids.append(torch.LongTensor(token_ids))
        bert_token_masks.append(torch.LongTensor(token_masks))
        bert_token_segments.append(torch.LongTensor(token_segments))

    labels = torch.from_numpy(np.array(labels))

    # BUG FIX: the original logged the hard-coded string "66,544" instead of
    # the computed maximum token length.
    logger.info(f"Max token len {max_len}")

    return EmmentalDataset(
        name="SuperGLUE",
        uid="uids",
        X_dict={
            "uids": uids,
            "sentence1": sentence1s,
            "sentence2": sentence2s,
            "token_ids": bert_token_ids,
            "token_masks": bert_token_masks,
            "token_segments": bert_token_segments,
        },
        Y_dict={"labels": labels},
    )
def parse(jsonl_path, tokenizer, uid, max_data_samples, max_sequence_length):
    """Parse a COPA-style jsonl file into a BERT-ready EmmentalDataset.

    Each row carries a premise, a 'cause'/'effect' question marker and two
    candidate choices; one BERT input sequence is built per choice.

    Args:
        jsonl_path: path to a jsonl file with one json object per line.
        tokenizer: BERT tokenizer exposing `tokenize` and
            `convert_tokens_to_ids`.
        uid: unused here; kept for a uniform parser signature.
        max_data_samples: if truthy, truncate to this many rows.
        max_sequence_length: target padded length for each sequence.

    Returns:
        EmmentalDataset named "SuperGLUE" keyed by "uids".
    """
    logger.info(f"Loading data from {jsonl_path}.")
    # FIX: use a context manager so the file handle is closed.
    with open(jsonl_path, encoding="utf-8") as fh:
        rows = [json.loads(row) for row in fh]
    for i in range(2):
        logger.info(f"Sample {i}: {rows[i]}")

    # Truncate to max_data_samples
    if max_data_samples:
        rows = rows[:max_data_samples]
        logger.info(f"Truncating to {max_data_samples} samples.")

    uids = []      # unique ids
    sent1s = []    # sentence1 (premise)
    sent2s = []    # sentence2 (expanded question)
    choice1s = []  # choice1
    choice2s = []  # choice2

    labels = []

    bert_token1_ids = []
    bert_token2_ids = []
    bert_token1_masks = []
    bert_token2_masks = []
    bert_token1_segments = []
    bert_token2_segments = []

    # Maximum post-padding token length, for logging only.
    max_len = -1

    for sample in rows:
        index = sample["idx"]
        sent1 = sample["premise"]
        sent2 = sample["question"]
        # Expand the 'cause'/'effect' marker into a natural question.
        sent2 = ("What was the cause of this?" if sent2 == "cause" else
                 "What happened as a result?")

        choice1 = sample["choice1"]
        choice2 = sample["choice2"]
        # Unlabeled (test) rows default to True as a placeholder.
        label = sample["label"] if "label" in sample else True

        uids.append(index)
        sent1s.append(sent1)
        sent2s.append(sent2)
        choice1s.append(choice1)
        choice2s.append(choice2)
        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        # Tokenize sentences
        sent1_tokens = tokenizer.tokenize(sent1)
        sent2_tokens = tokenizer.tokenize(sent2)

        # Tokenize choices
        choice1_tokens = tokenizer.tokenize(choice1)
        choice2_tokens = tokenizer.tokenize(choice2)

        # Convert to BERT manner
        tokens1 = (["[CLS]"] + sent1_tokens + ["[SEP]"] + sent2_tokens +
                   ["[SEP]"] + choice1_tokens + ["[SEP]"])
        tokens2 = (["[CLS]"] + sent1_tokens + ["[SEP]"] + sent2_tokens +
                   ["[SEP]"] + choice2_tokens + ["[SEP]"])

        token1_ids = tokenizer.convert_tokens_to_ids(tokens1)
        token2_ids = tokenizer.convert_tokens_to_ids(tokens2)

        # NOTE(review): if a sequence exceeds max_sequence_length the padding
        # below is empty and no truncation happens — confirm inputs fit.
        padding1 = [0] * (max_sequence_length - len(token1_ids))
        padding2 = [0] * (max_sequence_length - len(token2_ids))

        token1_masks = [1] * len(token1_ids)
        token2_masks = [1] * len(token2_ids)

        token1_segments = [0] * len(token1_ids)
        token2_segments = [0] * len(token2_ids)

        token1_ids += padding1
        token2_ids += padding2
        token1_masks += padding1
        token2_masks += padding2
        token1_segments += padding1
        token2_segments += padding2

        if len(token1_ids) > max_len:
            max_len = len(token1_ids)
        if len(token2_ids) > max_len:
            max_len = len(token2_ids)

        bert_token1_ids.append(torch.LongTensor(token1_ids))
        bert_token2_ids.append(torch.LongTensor(token2_ids))
        bert_token1_masks.append(torch.LongTensor(token1_masks))
        bert_token2_masks.append(torch.LongTensor(token2_masks))
        bert_token1_segments.append(torch.LongTensor(token1_segments))
        bert_token2_segments.append(torch.LongTensor(token2_segments))

    labels = torch.from_numpy(np.array(labels))

    # BUG FIX: the original logged the hard-coded string "66,544" instead of
    # the computed maximum token length (measured post-padding here).
    logger.info(f"Max token len {max_len}")

    return EmmentalDataset(
        name="SuperGLUE",
        uid="uids",
        X_dict={
            "uids": uids,
            "sentence1": sent1s,
            "sentence2": sent2s,
            "choice1": choice1s,
            "choice2": choice2s,
            "token1_ids": bert_token1_ids,
            "token2_ids": bert_token2_ids,
            "token1_masks": bert_token1_masks,
            "token2_masks": bert_token2_masks,
            "token1_segments": bert_token1_segments,
            "token2_segments": bert_token2_segments,
        },
        Y_dict={"labels": labels},
    )
def test_emmental_dataloader(caplog):
    """Unit test of emmental dataloader."""
    caplog.set_level(logging.INFO)

    dirpath = "temp_test_data"

    Meta.reset()
    emmental.init(dirpath)

    # Five variable-length feature tensors plus one scalar label each.
    x1 = [
        torch.Tensor([1]),
        torch.Tensor([1, 2]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3, 4, 5]),
    ]
    y1 = torch.Tensor([0, 0, 0, 0, 0])
    x2 = [
        torch.Tensor([1, 2, 3, 4, 5]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2]),
        torch.Tensor([1]),
    ]
    y2 = torch.Tensor([1, 1, 1, 1, 1])

    dataset = EmmentalDataset(
        X_dict={
            "data1": x1,
            "data2": x2
        },
        Y_dict={
            "label1": y1,
            "label2": y2
        },
        name="new_data",
    )

    dataloader1 = EmmentalDataLoader(
        task_to_label_dict={"task1": "label1"},
        dataset=dataset,
        split="train",
        batch_size=2,
        num_workers=2,
    )
    x_batch, y_batch = next(iter(dataloader1))

    # Check if the dataloader is correctly constructed.
    assert dataloader1.task_to_label_dict == {"task1": "label1"}
    assert dataloader1.split == "train"
    # Shorter sequences are zero-padded to the batch max length.
    assert torch.equal(x_batch["data1"], torch.Tensor([[1, 0], [1, 2]]))
    assert torch.equal(x_batch["data2"],
                       torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0]]))
    assert torch.equal(y_batch["label1"], torch.Tensor([0, 0]))
    assert torch.equal(y_batch["label2"], torch.Tensor([1, 1]))

    dataloader2 = EmmentalDataLoader(
        task_to_label_dict={"task2": "label2"},
        dataset=dataset,
        split="test",
        batch_size=3,
        collate_fn=partial(emmental_collate_fn, min_data_len=0, max_data_len=0),
    )
    x_batch, y_batch = next(iter(dataloader2))

    # Check if the dataloader with different batch size is correctly constructed.
    assert dataloader2.task_to_label_dict == {"task2": "label2"}
    assert dataloader2.split == "test"
    assert torch.equal(x_batch["data1"],
                       torch.Tensor([[1, 0, 0], [1, 2, 0], [1, 2, 3]]))
    assert torch.equal(
        x_batch["data2"],
        torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0], [1, 2, 3, 0, 0]]),
    )
    assert torch.equal(y_batch["label1"], torch.Tensor([0, 0, 0]))
    assert torch.equal(y_batch["label2"], torch.Tensor([1, 1, 1]))

    # Replace one label field in place on the shared dataset.
    y3 = [
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
    ]
    dataset.Y_dict["label2"] = y3

    x_batch, y_batch = next(iter(dataloader1))
    # Check dataloader is correctly updated with the updated dataset.
    assert torch.equal(x_batch["data2"],
                       torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0]]))
    assert torch.equal(y_batch["label2"], torch.Tensor([[2], [2]]))

    x_batch, y_batch = next(iter(dataloader2))
    assert torch.equal(
        x_batch["data2"],
        torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0], [1, 2, 3, 0, 0]]),
    )
    assert torch.equal(y_batch["label2"], torch.Tensor([[2], [2], [2]]))

    # Dataset without Y_dict: the loader yields only X batches.
    dataset = EmmentalDataset(X_dict={"data1": x1}, name="new_data")

    dataloader3 = EmmentalDataLoader(task_to_label_dict={"task1": None},
                                     dataset=dataset,
                                     split="train",
                                     batch_size=2)
    x_batch = next(iter(dataloader3))
    # Check if the dataloader is correctly constructed.
    assert dataloader3.task_to_label_dict == {"task1": None}
    assert dataloader3.split == "train"
    assert torch.equal(x_batch["data1"], torch.Tensor([[1, 0], [1, 2]]))

    # Check there is an error if task_to_label_dict has task to label mapping
    # while no y_dict in dataset.
    with pytest.raises(ValueError):
        EmmentalDataLoader(
            task_to_label_dict={"task1": "label1"},
            dataset=dataset,
            split="train",
            batch_size=2,
        )

    shutil.rmtree(dirpath)
def parse(jsonl_path, tokenizer, uid, max_data_samples, max_sequence_length):
    """Parse a WiC-style jsonl file into a BERT-ready EmmentalDataset.

    Each row gives two sentences sharing a target word; the token index of
    that word is located in each tokenized sentence.

    Args:
        jsonl_path: path to a jsonl file with one json object per line.
        tokenizer: BERT tokenizer exposing `tokenize` and
            `convert_tokens_to_ids`.
        uid: unused here; kept for a uniform parser signature.
        max_data_samples: if truthy, truncate to this many rows.
        max_sequence_length: maximum total length including special tokens.

    Returns:
        EmmentalDataset named "SuperGLUE" keyed by "uids".
    """
    logger.info(f"Loading data from {jsonl_path}.")
    # FIX: use a context manager so the file handle is closed.
    with open(jsonl_path, encoding="utf-8") as fh:
        rows = [json.loads(row) for row in fh]
    for i in range(2):
        logger.info(f"Sample {i}: {rows[i]}")

    # Truncate to max_data_samples
    if max_data_samples:
        rows = rows[:max_data_samples]
        logger.info(f"Truncating to {max_data_samples} samples.")

    uids = []            # unique ids
    sentence1s = []      # sentence1 text
    sentence2s = []      # sentence2 text
    sentence1_idxs = []  # target-word index in sentence1
    sentence2_idxs = []  # target-word index in sentence2
    words = []           # word in common
    poses = []           # pos tag
    labels = []          # mapped label ids

    token1_idxs = []
    token2_idxs = []

    bert_token_ids = []
    bert_token_masks = []
    bert_token_segments = []

    # Maximum pre-truncation token length, for logging only.
    max_len = -1

    for idx, row in enumerate(rows):
        sentence1 = row["sentence1"]
        sentence2 = row["sentence2"]
        word = row["word"]
        pos = row["pos"]
        sentence1_idx = int(row["sentence1_idx"])
        sentence2_idx = int(row["sentence2_idx"])
        # Unlabeled (test) rows default to True as a placeholder.
        label = row["label"] if "label" in row else True

        uids.append(idx)
        sentence1s.append(sentence1)
        sentence2s.append(sentence2)
        sentence1_idxs.append(sentence1_idx)
        sentence2_idxs.append(sentence2_idx)
        words.append(word)
        poses.append(pos)
        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        # Tokenize sentences
        sent1_tokens = tokenizer.tokenize(sentence1)
        sent2_tokens = tokenizer.tokenize(sentence2)

        # Tokenize the target word inside each sentence.
        word_tokens_in_sent1 = tokenizer.tokenize(
            sentence1.split()[sentence1_idx])
        word_tokens_in_sent2 = tokenizer.tokenize(
            sentence2.split()[sentence2_idx])

        # Trim the longer sentence until the pair fits.
        while True:
            total_length = len(sent1_tokens) + len(sent2_tokens)
            if total_length > max_len:
                max_len = total_length
            # Account for [CLS], [SEP], [SEP] with "- 3"
            if total_length <= max_sequence_length - 3:
                break
            if len(sent1_tokens) > len(sent2_tokens):
                sent1_tokens.pop()
            else:
                sent2_tokens.pop()

        # BUG FIX: the inner scan variable used to shadow the outer `idx`
        # loop variable; renamed to `tok_idx` (behavior unchanged, since
        # `idx` was only read before these loops each iteration).
        for tok_idx in range(sentence1_idx - 1, len(sent1_tokens)):
            if (sent1_tokens[tok_idx:tok_idx + len(word_tokens_in_sent1)] ==
                    word_tokens_in_sent1):
                token1_idxs.append(tok_idx + 2)  # Add [CLS]
                break
        for tok_idx in range(sentence2_idx - 1, len(sent2_tokens)):
            if (sent2_tokens[tok_idx:tok_idx + len(word_tokens_in_sent2)] ==
                    word_tokens_in_sent2):
                token2_idxs.append(
                    tok_idx + len(sent1_tokens) + 1
                )  # Add the length of the first sentence and [CLS] + [SEP]
                break

        # Convert to BERT manner
        # NOTE(review): [CLS] is appended at the END of the sequence here,
        # unlike the usual leading-[CLS] convention, and token_segments is
        # one element shorter than tokens — confirm both are intended.
        tokens = sent1_tokens + ["[SEP]"]
        token_segments = [0] * len(tokens)
        tokens += sent2_tokens + ["[SEP]"] + ["[CLS]"]
        token_segments += [1] * (len(sent2_tokens) + 1)

        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        # Generate mask where 1 for real tokens and 0 for padding tokens
        token_masks = [1] * len(token_ids)

        bert_token_ids.append(torch.LongTensor(token_ids))
        bert_token_masks.append(torch.LongTensor(token_masks))
        bert_token_segments.append(torch.LongTensor(token_segments))

    token1_idxs = torch.from_numpy(np.array(token1_idxs))
    token2_idxs = torch.from_numpy(np.array(token2_idxs))

    labels = torch.from_numpy(np.array(labels))

    # BUG FIX: the original logged the hard-coded string "66,544" instead of
    # the computed maximum token length.
    logger.info(f"Max token len {max_len}")

    return EmmentalDataset(
        name="SuperGLUE",
        uid="uids",
        X_dict={
            "uids": uids,
            "sentence1": sentence1s,
            "sentence2": sentence2s,
            "word": words,
            "pos": poses,
            "sentence1_idx": sentence1_idxs,
            "sentence2_idx": sentence2_idxs,
            "token1_idx": token1_idxs,
            "token2_idx": token2_idxs,
            "token_ids": bert_token_ids,
            "token_masks": bert_token_masks,
            "token_segments": bert_token_segments,
        },
        Y_dict={"labels": labels},
    )
bert_model_name=args.bert_model, max_data_samples=args.max_data_samples, max_sequence_length=args.max_sequence_length, ) X_dict = { "token_ids": token_ids, "token_segments": token_segments, "token_masks": token_masks, } Y_dict = {"labels": labels} if task_name not in datasets: datasets[task_name] = {} datasets[task_name][split] = EmmentalDataset( name="GLUE", X_dict=X_dict, Y_dict=Y_dict ) logger.info(f"Loaded {split} for {task_name}.") dataloaders = [] for task_name in args.task: for split in ["train", "dev", "test"]: dataloaders.append( EmmentalDataLoader( task_to_label_dict={task_name: "labels"}, dataset=datasets[task_name][split], split=split, batch_size=args.batch_size, shuffle=True if split == "train" else False,
def test_e2e(caplog):
    """Run an end-to-end test."""
    caplog.set_level(logging.INFO)

    dirpath = "temp_test_e2e"
    use_exact_log_path = False
    Meta.reset()
    emmental.init(dirpath, use_exact_log_path=use_exact_log_path)

    # Trainer/logging configuration: 3 epochs, per-epoch evaluation, and
    # checkpointing on minimum training loss with cleanup at the end.
    config = {
        "meta_config": {
            "seed": 0
        },
        "learner_config": {
            "n_epochs": 3,
            "optimizer_config": {
                "lr": 0.01,
                "grad_clip": 100
            },
        },
        "logging_config": {
            "counter_unit": "epoch",
            "evaluation_freq": 1,
            "writer_config": {
                "writer": "tensorboard",
                "verbose": True
            },
            "checkpointing": True,
            "checkpointer_config": {
                "checkpoint_path": None,
                "checkpoint_freq": 1,
                "checkpoint_metric": {
                    "model/all/train/loss": "min"
                },
                "checkpoint_task_metrics": None,
                "checkpoint_runway": 1,
                "checkpoint_all": False,
                "clear_intermediate_checkpoints": True,
                "clear_all_checkpoints": True,
            },
        },
    }
    emmental.Meta.update_config(config)

    # Generate synthetic data: two linearly-separable binary tasks over the
    # same 2-d points, with slightly different thresholds.
    N = 500

    X = np.random.random((N, 2)) * 2 - 1
    Y1 = (X[:, 0] > X[:, 1] + 0.25).astype(int)
    Y2 = (X[:, 0] > X[:, 1] + 0.2).astype(int)

    X = [torch.Tensor(X[i]) for i in range(N)]

    # Create dataset and dataloader (80/10/10 train/dev/test split)
    X_train, X_dev, X_test = (
        X[:int(0.8 * N)],
        X[int(0.8 * N):int(0.9 * N)],
        X[int(0.9 * N):],
    )
    Y1_train, Y1_dev, Y1_test = (
        torch.tensor(Y1[:int(0.8 * N)]),
        torch.tensor(Y1[int(0.8 * N):int(0.9 * N)]),
        torch.tensor(Y1[int(0.9 * N):]),
    )
    Y2_train, Y2_dev, Y2_test = (
        torch.tensor(Y2[:int(0.8 * N)]),
        torch.tensor(Y2[int(0.8 * N):int(0.9 * N)]),
        torch.tensor(Y2[int(0.9 * N):]),
    )

    train_dataset1 = EmmentalDataset(name="synthetic",
                                     X_dict={"data": X_train},
                                     Y_dict={"label1": Y1_train})
    train_dataset2 = EmmentalDataset(name="synthetic",
                                     X_dict={"data": X_train},
                                     Y_dict={"label2": Y2_train})
    dev_dataset1 = EmmentalDataset(name="synthetic",
                                   X_dict={"data": X_dev},
                                   Y_dict={"label1": Y1_dev})
    dev_dataset2 = EmmentalDataset(name="synthetic",
                                   X_dict={"data": X_dev},
                                   Y_dict={"label2": Y2_dev})
    test_dataset1 = EmmentalDataset(name="synthetic",
                                    X_dict={"data": X_test},
                                    Y_dict={"label1": Y1_test})
    test_dataset2 = EmmentalDataset(name="synthetic",
                                    X_dict={"data": X_test},
                                    Y_dict={"label2": Y2_test})

    # Loaders for task1.
    task_to_label_dict = {"task1": "label1"}

    train_dataloader1 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=train_dataset1,
        split="train",
        batch_size=10,
    )
    dev_dataloader1 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=dev_dataset1,
        split="valid",
        batch_size=10,
    )
    test_dataloader1 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=test_dataset1,
        split="test",
        batch_size=10,
    )

    # Loaders for task2.
    task_to_label_dict = {"task2": "label2"}

    train_dataloader2 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=train_dataset2,
        split="train",
        batch_size=10,
    )
    dev_dataloader2 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=dev_dataset2,
        split="valid",
        batch_size=10,
    )
    test_dataloader2 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=test_dataset2,
        split="test",
        batch_size=10,
    )

    # Create task
    # NOTE(review): "immediate_ouput_dict" is misspelled ("ouput"), but it
    # is only a local parameter name, so behavior is unaffected.
    def ce_loss(task_name, immediate_ouput_dict, Y, active):
        # Cross-entropy on the head output, restricted to active examples.
        module_name = f"{task_name}_pred_head"
        return F.cross_entropy(immediate_ouput_dict[module_name][0][active],
                               (Y.view(-1))[active])

    def output(task_name, immediate_ouput_dict):
        # Softmax probabilities from the head output.
        module_name = f"{task_name}_pred_head"
        return F.softmax(immediate_ouput_dict[module_name][0], dim=1)

    task_metrics = {"task1": ["accuracy"], "task2": ["accuracy", "roc_auc"]}

    # Two tasks sharing an input module, each with its own prediction head.
    tasks = [
        EmmentalTask(
            name=task_name,
            module_pool=nn.ModuleDict({
                "input_module": nn.Linear(2, 8),
                f"{task_name}_pred_head": nn.Linear(8, 2),
            }),
            task_flow=[
                {
                    "name": "input",
                    "module": "input_module",
                    "inputs": [("_input_", "data")],
                },
                {
                    "name": f"{task_name}_pred_head",
                    "module": f"{task_name}_pred_head",
                    "inputs": [("input", 0)],
                },
            ],
            loss_func=partial(ce_loss, task_name),
            output_func=partial(output, task_name),
            scorer=Scorer(metrics=task_metrics[task_name]),
        ) for task_name in ["task1", "task2"]
    ]

    # Build model
    mtl_model = EmmentalModel(name="all", tasks=tasks)

    # Create learner
    emmental_learner = EmmentalLearner()

    # Learning
    emmental_learner.learn(
        mtl_model,
        [
            train_dataloader1, train_dataloader2, dev_dataloader1,
            dev_dataloader2
        ],
    )

    test1_score = mtl_model.score(test_dataloader1)
    test2_score = mtl_model.score(test_dataloader2)

    # Both tasks should learn the synthetic decision boundary reasonably well.
    assert test1_score["task1/synthetic/test/accuracy"] >= 0.7
    assert (test1_score["model/all/test/macro_average"] ==
            test1_score["task1/synthetic/test/accuracy"])
    assert test2_score["task2/synthetic/test/accuracy"] >= 0.7
    assert test2_score["task2/synthetic/test/roc_auc"] >= 0.7

    shutil.rmtree(dirpath)
def test_e2e(caplog):
    """Run an end-to-end multi-task test: two synthetic binary tasks sharing
    an input module, trained jointly, then scored on held-out splits.

    Args:
        caplog: pytest log-capture fixture; level set to INFO so Emmental's
            progress logging is captured.
    """
    caplog.set_level(logging.INFO)

    dirpath = "temp_test_e2e"

    Meta.reset()
    emmental.init(dirpath)

    # Generate synthetic data: two linearly separable binary tasks over 2-d
    # points; labels are in {1, 2} (shifted to {0, 1} inside ce_loss).
    N = 50

    X = np.random.random((N, 2)) * 2 - 1
    Y1 = (X[:, 0] > X[:, 1] + 0.25).astype(int) + 1
    Y2 = (-X[:, 0] > X[:, 1] + 0.25).astype(int) + 1

    # Create dataset and dataloader (80/10/10 split).
    # NOTE(review): `i <= N * splits[0]` puts one extra sample in train
    # (41/9... instead of 40/...); kept as-is to preserve behavior.
    splits = [0.8, 0.1, 0.1]

    X_train, X_dev, X_test = [], [], []
    Y1_train, Y1_dev, Y1_test = [], [], []
    Y2_train, Y2_dev, Y2_test = [], [], []

    for i in range(N):
        if i <= N * splits[0]:
            X_train.append(torch.Tensor(X[i]))
            Y1_train.append(Y1[i])
            Y2_train.append(Y2[i])
        elif i < N * (splits[0] + splits[1]):
            X_dev.append(torch.Tensor(X[i]))
            Y1_dev.append(Y1[i])
            Y2_dev.append(Y2[i])
        else:
            X_test.append(torch.Tensor(X[i]))
            Y1_test.append(Y1[i])
            Y2_test.append(Y2[i])

    Y1_train = torch.from_numpy(np.array(Y1_train))
    Y1_dev = torch.from_numpy(np.array(Y1_dev))
    Y1_test = torch.from_numpy(np.array(Y1_test))
    # BUG FIX: this previously read `np.array(Y1_train)`, silently training
    # task2 on task1's labels.
    Y2_train = torch.from_numpy(np.array(Y2_train))
    Y2_dev = torch.from_numpy(np.array(Y2_dev))
    Y2_test = torch.from_numpy(np.array(Y2_test))

    train_dataset1 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_train}, Y_dict={"label1": Y1_train}
    )
    train_dataset2 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_train}, Y_dict={"label2": Y2_train}
    )
    dev_dataset1 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_dev}, Y_dict={"label1": Y1_dev}
    )
    dev_dataset2 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_dev}, Y_dict={"label2": Y2_dev}
    )
    # BUG FIX: test_dataset1 previously used Y2_test as task1's test labels.
    test_dataset1 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_test}, Y_dict={"label1": Y1_test}
    )
    test_dataset2 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_test}, Y_dict={"label2": Y2_test}
    )

    task_to_label_dict = {"task1": "label1"}

    train_dataloader1 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=train_dataset1,
        split="train",
        batch_size=10,
    )
    dev_dataloader1 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=dev_dataset1,
        split="valid",
        batch_size=10,
    )
    test_dataloader1 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=test_dataset1,
        split="test",
        batch_size=10,
    )

    task_to_label_dict = {"task2": "label2"}

    train_dataloader2 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=train_dataset2,
        split="train",
        batch_size=10,
    )
    dev_dataloader2 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=dev_dataset2,
        split="valid",
        batch_size=10,
    )
    test_dataloader2 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=test_dataset2,
        split="test",
        batch_size=10,
    )

    # Create task
    def ce_loss(task_name, immediate_ouput_dict, Y, active):
        """Cross-entropy over the task head's logits; labels shift 1/2 -> 0/1."""
        module_name = f"{task_name}_pred_head"
        return F.cross_entropy(
            immediate_ouput_dict[module_name][0][active], (Y.view(-1) - 1)[active]
        )

    def output(task_name, immediate_ouput_dict):
        """Softmax probabilities from the task head's logits."""
        module_name = f"{task_name}_pred_head"
        return F.softmax(immediate_ouput_dict[module_name][0], dim=1)

    task_name = "task1"

    task1 = EmmentalTask(
        name=task_name,
        module_pool=nn.ModuleDict(
            {"input_module": nn.Linear(2, 8), f"{task_name}_pred_head": nn.Linear(8, 2)}
        ),
        task_flow=[
            {
                "name": "input",
                "module": "input_module",
                "inputs": [("_input_", "data")],
            },
            {
                "name": f"{task_name}_pred_head",
                "module": f"{task_name}_pred_head",
                "inputs": [("input", 0)],
            },
        ],
        loss_func=partial(ce_loss, task_name),
        output_func=partial(output, task_name),
        scorer=Scorer(metrics=["accuracy", "roc_auc"]),
    )

    task_name = "task2"

    task2 = EmmentalTask(
        name=task_name,
        module_pool=nn.ModuleDict(
            {"input_module": nn.Linear(2, 8), f"{task_name}_pred_head": nn.Linear(8, 2)}
        ),
        task_flow=[
            {
                "name": "input",
                "module": "input_module",
                "inputs": [("_input_", "data")],
            },
            {
                "name": f"{task_name}_pred_head",
                "module": f"{task_name}_pred_head",
                "inputs": [("input", 0)],
            },
        ],
        loss_func=partial(ce_loss, task_name),
        output_func=partial(output, task_name),
        scorer=Scorer(metrics=["accuracy", "roc_auc"]),
    )

    # Build model
    mtl_model = EmmentalModel(name="all", tasks=[task1, task2])

    # Create learner
    emmental_learner = EmmentalLearner()

    # Update learning config
    Meta.update_config(
        config={"learner_config": {"n_epochs": 10, "optimizer_config": {"lr": 0.01}}}
    )

    # Learning
    emmental_learner.learn(
        mtl_model,
        [train_dataloader1, train_dataloader2, dev_dataloader1, dev_dataloader2],
    )

    test1_score = mtl_model.score(test_dataloader1)
    test2_score = mtl_model.score(test_dataloader2)

    assert test1_score["task1/synthetic/test/accuracy"] >= 0.5
    assert test1_score["task1/synthetic/test/roc_auc"] >= 0.6
    assert test2_score["task2/synthetic/test/accuracy"] >= 0.5
    assert test2_score["task2/synthetic/test/roc_auc"] >= 0.6

    shutil.rmtree(dirpath)
def parse(csv_path, tokenizer, uid, max_data_samples, max_sequence_length):
    """Parse a SWAG-style multiple-choice CSV into an EmmentalDataset.

    Each row contributes four BERT input sequences (one per answer choice),
    padded per-example to the longest choice.

    Args:
        csv_path: path to a CSV with sent1/sent2/ending0-3 (and optional
            `label`) columns.
        tokenizer: BERT-style tokenizer exposing `tokenize` and
            `convert_tokens_to_ids`.
        uid: name of the unique-id field. NOTE(review): this argument is
            currently ignored — the dataset is always built with uid="uids";
            confirm whether it should be passed through as in the other
            parsers.
        max_data_samples: if truthy, cap the number of rows parsed.
        max_sequence_length: maximum number of BERT token ids kept per choice.

    Returns:
        EmmentalDataset with per-choice token ids/masks/segments and labels.
    """
    logger.info(f"Loading data from {csv_path}.")
    rows = pd.read_csv(csv_path)

    # Truncate to max_data_samples
    if max_data_samples:
        rows = rows[:max_data_samples]
        logger.info(f"Truncating to {max_data_samples} samples.")

    # unique ids
    uids = []
    # sentence1
    sent1s = []
    # sentence2
    sent2s = []
    # choice1
    choice1s = []
    # choice2
    choice2s = []
    # choice3
    choice3s = []
    # choice4
    choice4s = []

    labels = []

    bert_token1_ids = []
    bert_token2_ids = []
    bert_token3_ids = []
    bert_token4_ids = []
    bert_token1_masks = []
    bert_token2_masks = []
    bert_token3_masks = []
    bert_token4_masks = []
    bert_token1_segments = []
    bert_token2_segments = []
    bert_token3_segments = []
    bert_token4_segments = []

    # Check the maximum token length
    max_len = -1

    for ex_idx, ex in rows.iterrows():
        sent1 = ex["sent1"]
        sent2 = ex["sent2"]

        choice1 = ex["ending0"]
        choice2 = ex["ending1"]
        choice3 = ex["ending2"]
        choice4 = ex["ending3"]

        # Unlabeled (test) rows default to label 0.
        label = ex["label"] if "label" in ex else 0

        uids.append(ex_idx)
        sent1s.append(sent1)
        sent2s.append(sent2)
        choice1s.append(choice1)
        choice2s.append(choice2)
        choice3s.append(choice3)
        choice4s.append(choice4)
        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        # Tokenize sentences
        sent1_tokens = tokenizer.tokenize(sent1)
        sent2_tokens = tokenizer.tokenize(sent2)

        choice1_tokens = tokenizer.tokenize(choice1)
        choice2_tokens = tokenizer.tokenize(choice2)
        choice3_tokens = tokenizer.tokenize(choice3)
        choice4_tokens = tokenizer.tokenize(choice4)

        # Convert to BERT manner.
        # NOTE(review): [SEP]/[CLS] are appended at the END of the sequence
        # rather than the usual [CLS] ... [SEP] layout — kept as the original
        # pipeline does it; confirm the downstream model expects this.
        token1 = (
            sent1_tokens + ["[SEP]"] + sent2_tokens + choice1_tokens + ["[SEP]"] + ["[CLS]"]
        )
        token2 = (
            sent1_tokens + ["[SEP]"] + sent2_tokens + choice2_tokens + ["[SEP]"] + ["[CLS]"]
        )
        token3 = (
            sent1_tokens + ["[SEP]"] + sent2_tokens + choice3_tokens + ["[SEP]"] + ["[CLS]"]
        )
        token4 = (
            sent1_tokens + ["[SEP]"] + sent2_tokens + choice4_tokens + ["[SEP]"] + ["[CLS]"]
        )

        token1_ids = tokenizer.convert_tokens_to_ids(token1)[:max_sequence_length]
        token2_ids = tokenizer.convert_tokens_to_ids(token2)[:max_sequence_length]
        token3_ids = tokenizer.convert_tokens_to_ids(token3)[:max_sequence_length]
        token4_ids = tokenizer.convert_tokens_to_ids(token4)[:max_sequence_length]

        token1_masks = [1] * len(token1_ids)
        token2_masks = [1] * len(token2_ids)
        token3_masks = [1] * len(token3_ids)
        token4_masks = [1] * len(token4_ids)

        token1_segments = [0] * len(token1_ids)
        token2_segments = [0] * len(token2_ids)
        token3_segments = [0] * len(token3_ids)
        token4_segments = [0] * len(token4_ids)

        # Track the global maximum (for logging) and the per-example maximum
        # (for padding) in one pass each; replaces four duplicated if-chains.
        choice_lens = [
            len(token1_ids),
            len(token2_ids),
            len(token3_ids),
            len(token4_ids),
        ]
        max_len = max(max_len, *choice_lens)
        max_choice_len = max(choice_lens)

        # Pad all four choices to the longest choice in this example.
        token1_ids += [0] * (max_choice_len - len(token1_ids))
        token2_ids += [0] * (max_choice_len - len(token2_ids))
        token3_ids += [0] * (max_choice_len - len(token3_ids))
        token4_ids += [0] * (max_choice_len - len(token4_ids))

        token1_masks += [0] * (max_choice_len - len(token1_masks))
        token2_masks += [0] * (max_choice_len - len(token2_masks))
        token3_masks += [0] * (max_choice_len - len(token3_masks))
        token4_masks += [0] * (max_choice_len - len(token4_masks))

        token1_segments += [0] * (max_choice_len - len(token1_segments))
        token2_segments += [0] * (max_choice_len - len(token2_segments))
        token3_segments += [0] * (max_choice_len - len(token3_segments))
        token4_segments += [0] * (max_choice_len - len(token4_segments))

        bert_token1_ids.append(torch.LongTensor(token1_ids))
        bert_token2_ids.append(torch.LongTensor(token2_ids))
        bert_token3_ids.append(torch.LongTensor(token3_ids))
        bert_token4_ids.append(torch.LongTensor(token4_ids))

        bert_token1_masks.append(torch.LongTensor(token1_masks))
        bert_token2_masks.append(torch.LongTensor(token2_masks))
        bert_token3_masks.append(torch.LongTensor(token3_masks))
        bert_token4_masks.append(torch.LongTensor(token4_masks))

        bert_token1_segments.append(torch.LongTensor(token1_segments))
        bert_token2_segments.append(torch.LongTensor(token2_segments))
        bert_token3_segments.append(torch.LongTensor(token3_segments))
        bert_token4_segments.append(torch.LongTensor(token4_segments))

    labels = torch.from_numpy(np.array(labels))

    # BUG FIX: the original f-string logged the literal "38,840" instead of
    # the computed maximum.
    logger.info(f"Max token len {max_len}")

    return EmmentalDataset(
        name="SuperGLUE",
        uid="uids",
        X_dict={
            "uids": uids,
            "sentence1": sent1s,
            "sentence2": sent2s,
            "choice1": choice1s,
            "choice2": choice2s,
            "choice3": choice3s,
            "choice4": choice4s,
            "token1_ids": bert_token1_ids,
            "token2_ids": bert_token2_ids,
            "token3_ids": bert_token3_ids,
            "token4_ids": bert_token4_ids,
            "token1_masks": bert_token1_masks,
            "token2_masks": bert_token2_masks,
            "token3_masks": bert_token3_masks,
            "token4_masks": bert_token4_masks,
            "token1_segments": bert_token1_segments,
            "token2_segments": bert_token2_segments,
            "token3_segments": bert_token3_segments,
            "token4_segments": bert_token4_segments,
        },
        Y_dict={"labels": labels},
    )
def parse(jsonl_path, tokenizer, uid, max_data_samples, max_sequence_length):
    """Parse a MultiRC-style JSONL file into an EmmentalDataset.

    Each (paragraph, question, answer) triple becomes one example; the BERT
    input is the (possibly sentence-filtered) paragraph, the question, and the
    candidate answer.

    Args:
        jsonl_path: path to the JSONL file; each line is one paragraph record.
        tokenizer: BERT-style tokenizer exposing `tokenize` and
            `convert_tokens_to_ids`.
        uid: name of the unique-id field in X_dict (passed through).
        max_data_samples: if truthy, cap the number of paragraph rows parsed.
        max_sequence_length: per-segment token cap (paragraph, question and
            answer are each truncated to max_sequence_length - 2 tokens).

    Returns:
        EmmentalDataset with token ids/masks/segments and binary labels.
    """
    logger.info(f"Loading data from {jsonl_path}.")
    # Close the file deterministically (the original leaked the handle).
    with open(jsonl_path, encoding="utf-8") as f:
        rows = [json.loads(row) for row in f]

    for i in range(2):
        logger.info(f"Sample {i}: {rows[i]}")

    # Truncate to max_data_samples
    if max_data_samples:
        rows = rows[:max_data_samples]
        logger.info(f"Truncating to {max_data_samples} samples.")

    # unique ids
    uids = []
    # paragraph ids
    pids = []
    # question ids
    qids = []
    # answer ids
    aids = []
    # paragraph text
    paras = []
    # question text
    questions = []
    # answer text
    answers = []
    # labels
    labels = []

    bert_tokens = []
    bert_token_ids = []
    bert_token_masks = []
    bert_token_segments = []

    # Check the maximum token length
    max_len = -1

    for row in rows:
        # each example has a paragraph field -> (text, questions)
        # text is the paragraph, which requires some preprocessing
        # questions is a list of questions,
        # has fields (question, sentences_used, answers)
        pid = row["idx"]
        para = row["paragraph"]["text"]
        # Strip the "<b>Sent N: </b>" markers and split on "<br>" to recover
        # the individual sentences.
        para_sent_list = re.sub(
            "<b>Sent .{1,2}: </b>", "", row["paragraph"]["text"]
        ).split("<br>")

        for ques in row["paragraph"]["questions"]:
            qid = ques["idx"]
            sent_used = ques["sentences_used"]
            # Keep only the sentences the question references, when given.
            if len(sent_used) > 0:
                ques_para = " ".join([para_sent_list[i] for i in sent_used])
            else:
                ques_para = " ".join(para_sent_list)

            para_token = tokenizer.tokenize(ques_para)[: max_sequence_length - 2]

            question = ques["question"]
            question_token = tokenizer.tokenize(question)[: max_sequence_length - 2]

            for ans in ques["answers"]:
                aid = ans["idx"]
                answer = ans["text"]
                answer_token = tokenizer.tokenize(answer)[: max_sequence_length - 2]

                # Generate tokens.
                # NOTE(review): [SEP]/[CLS] appended at the end rather than
                # the usual [CLS]-first layout — kept as the original
                # pipeline does it; confirm downstream expectations.
                tokens = (
                    para_token
                    + ["[SEP]"]
                    + question_token
                    + answer_token
                    + ["[SEP]"]
                    + ["[CLS]"]
                )
                # No token segments (all zeros; total length matches tokens).
                token_segments = [0] * (len(para_token) + 2) + [0] * (
                    len(question_token) + len(answer_token) + 1
                )
                token_ids = tokenizer.convert_tokens_to_ids(tokens)
                token_masks = [1] * len(token_ids)

                if len(tokens) > max_len:
                    max_len = len(tokens)

                # Add to list
                paras.append(para)
                questions.append(question)
                answers.append(answer)

                # Unlabeled (test) answers default to False.
                label = ans["isAnswer"] if "isAnswer" in ans else False
                labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

                pids.append(pid)
                qids.append(qid)
                aids.append(aid)

                uids.append(f"{pid}%%{qid}%%{aid}")

                bert_tokens.append(" ".join(tokens))
                bert_token_ids.append(torch.LongTensor(token_ids))
                bert_token_masks.append(torch.LongTensor(token_masks))
                bert_token_segments.append(torch.LongTensor(token_segments))

    labels = torch.from_numpy(np.array(labels))

    # BUG FIX: the original f-string logged the literal "38,840" instead of
    # the computed maximum.
    logger.info(f"Max token len {max_len}")

    return EmmentalDataset(
        name="SuperGLUE",
        uid=uid,
        X_dict={
            "uids": uids,
            "pids": pids,
            "qids": qids,
            "aids": aids,
            "paras": paras,
            "questions": questions,
            "answers": answers,
            "tokens": bert_tokens,
            "token_ids": bert_token_ids,
            "token_masks": bert_token_masks,
            "token_segments": bert_token_segments,
        },
        Y_dict={"labels": labels},
    )
def parse(jsonl_path, tokenizer, uid, max_data_samples, max_sequence_length):
    """Parse a WSC-style JSONL file into an EmmentalDataset.

    For each example, the sentence is tokenized in five pieces around the two
    target spans so that the spans' BERT-token index ranges
    (token1_idx/token2_idx, offset by the leading [CLS]) can be recorded.

    Args:
        jsonl_path: path to the JSONL file; one example per line.
        tokenizer: BERT-style tokenizer exposing `tokenize` and
            `convert_tokens_to_ids`.
        uid: name of the unique-id field. NOTE(review): ignored here — the
            dataset is built with uid="uids"; confirm whether it should be
            passed through as in the MultiRC parser.
        max_data_samples: if truthy, cap the number of rows parsed.
        max_sequence_length: unused in this parser (sentences are not
            truncated) — kept for signature consistency.

    Returns:
        EmmentalDataset with tokens, span metadata and binary labels.
    """
    logger.info(f"Loading data from {jsonl_path}.")
    # Close the file deterministically (the original leaked the handle).
    with open(jsonl_path, encoding="utf-8") as f:
        rows = [json.loads(row) for row in f]

    for i in range(2):
        logger.info(f"Sample {i}: {rows[i]}")

    # Truncate to max_data_samples
    if max_data_samples:
        rows = rows[:max_data_samples]
        logger.info(f"Truncating to {max_data_samples} samples.")

    # unique ids
    uids = []
    # sentence text
    sentences = []
    # span1
    span1s = []
    # span2
    span2s = []
    # span1 idx
    span1_idxs = []
    # span2 idx
    span2_idxs = []
    # label
    labels = []

    token1_idxs = []
    token2_idxs = []

    bert_tokens = []
    bert_token_ids = []
    bert_token_masks = []
    bert_token_segments = []

    # Check the maximum token length
    max_len = -1

    for row in rows:
        index = row["idx"]

        text = row["text"]
        span1_text = row["target"]["span1_text"]
        span2_text = row["target"]["span2_text"]
        span1_index = row["target"]["span1_index"]
        span2_index = row["target"]["span2_index"]

        # Unlabeled (test) rows default to True.
        label = row["label"] if "label" in row else True

        span1_char_index = get_char_index(text, span1_text, span1_index)
        span2_char_index = get_char_index(text, span2_text, span2_index)

        # BUG FIX: the messages previously interpolated the builtin `id`
        # (printing "<built-in function id>") instead of the example index.
        assert span1_char_index is not None, f"Check example {index} in {jsonl_path}"
        assert span2_char_index is not None, f"Check example {index} in {jsonl_path}"

        # Tokenize the sentence in pieces: text before the first span...
        bert_tokens_sub1 = tokenizer.tokenize(
            text[: min(span1_char_index[0], span2_char_index[0])]
        )

        # ...then whichever span comes first (the +1 offsets account for the
        # leading [CLS] added below).
        if span1_char_index[0] < span2_char_index[0]:
            bert_tokens_sub2 = tokenizer.tokenize(
                text[span1_char_index[0] : span1_char_index[1]]
            )
            token1_idx = [
                len(bert_tokens_sub1) + 1,
                len(bert_tokens_sub1) + len(bert_tokens_sub2),
            ]
        else:
            bert_tokens_sub2 = tokenizer.tokenize(
                text[span2_char_index[0] : span2_char_index[1]]
            )
            token2_idx = [
                len(bert_tokens_sub1) + 1,
                len(bert_tokens_sub1) + len(bert_tokens_sub2),
            ]

        # ...then the text between the two spans...
        sub3_st = (
            span1_char_index[1]
            if span1_char_index[0] < span2_char_index[0]
            else span2_char_index[1]
        )
        sub3_ed = (
            span1_char_index[0]
            if span1_char_index[0] > span2_char_index[0]
            else span2_char_index[0]
        )
        bert_tokens_sub3 = tokenizer.tokenize(text[sub3_st:sub3_ed])

        # ...then the later span...
        if span1_char_index[0] < span2_char_index[0]:
            bert_tokens_sub4 = tokenizer.tokenize(
                text[span2_char_index[0] : span2_char_index[1]]
            )
            cur_len = (
                len(bert_tokens_sub1) + len(bert_tokens_sub2) + len(bert_tokens_sub3)
            )
            token2_idx = [cur_len + 1, cur_len + len(bert_tokens_sub4)]
        else:
            bert_tokens_sub4 = tokenizer.tokenize(
                text[span1_char_index[0] : span1_char_index[1]]
            )
            cur_len = (
                len(bert_tokens_sub1) + len(bert_tokens_sub2) + len(bert_tokens_sub3)
            )
            token1_idx = [cur_len + 1, cur_len + len(bert_tokens_sub4)]

        # ...and finally the tail after the later span.
        if span1_char_index[0] < span2_char_index[0]:
            bert_tokens_sub5 = tokenizer.tokenize(text[span2_char_index[1] :])
        else:
            bert_tokens_sub5 = tokenizer.tokenize(text[span1_char_index[1] :])

        tokens = (
            ["[CLS]"]
            + bert_tokens_sub1
            + bert_tokens_sub2
            + bert_tokens_sub3
            + bert_tokens_sub4
            + bert_tokens_sub5
            + ["[SEP]"]
        )

        if len(tokens) > max_len:
            max_len = len(tokens)

        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        token_segments = [0] * len(token_ids)
        # Generate mask where 1 for real tokens and 0 for padding tokens
        token_masks = [1] * len(token_ids)

        token1_idxs.append(token1_idx)
        token2_idxs.append(token2_idx)

        uids.append(index)
        sentences.append(text)
        span1s.append(span1_text)
        span2s.append(span2_text)
        span1_idxs.append(span1_index)
        span2_idxs.append(span2_index)
        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        bert_tokens.append(tokens)
        bert_token_ids.append(torch.LongTensor(token_ids))
        bert_token_masks.append(torch.LongTensor(token_masks))
        bert_token_segments.append(torch.LongTensor(token_segments))

    token1_idxs = torch.from_numpy(np.array(token1_idxs))
    token2_idxs = torch.from_numpy(np.array(token2_idxs))

    labels = torch.from_numpy(np.array(labels))

    # BUG FIX: the original f-string logged the literal "38,840" instead of
    # the computed maximum.
    logger.info(f"Max token len {max_len}")

    return EmmentalDataset(
        name="SuperGLUE",
        uid="uids",
        X_dict={
            "uids": uids,
            "sentence": sentences,
            "span1": span1s,
            "span2": span2s,
            "span1_idx": span1_idxs,
            "span2_idx": span2_idxs,
            "token1_idx": token1_idxs,
            "token2_idx": token2_idxs,
            "tokens": bert_tokens,
            "token_ids": bert_token_ids,
            "token_masks": bert_token_masks,
            "token_segments": bert_token_segments,
        },
        Y_dict={"labels": labels},
    )