Example #1
    def __init__(self, dataset_dir, split_str, labels_path, transform_fns):
        """
        """
        split_path = path.join(dataset_dir, f'{split_str}.csv')
        self.split_df = pd.read_csv(split_path, index_col=0)
        self.split_str = split_str
        self.labels_df = pd.read_csv(labels_path, index_col=0, header=[0, 1])
        self.exam_ids = list(self.split_df.index.unique())

        self.transform_fns = transform_fns
        self.shuffle_transform = 'shuffle' in [f['fn'] for f in transform_fns]

        self.instance_transform = None
        for f in transform_fns:
            # only extract instances if asked to do so and specified for split
            if 'extract_instance' == f['fn'] and split_str in f['args']['splits']:
                self.instance_transform = f['args']
                logger.info(f"using instance extraction on {f['args']['splits']} splits")
                break
        if self.instance_transform is not None and self.instance_transform.get('instance_only', False):
            # only access exam_ids with instance level labels
            exam_ids = []
            for exam_id in self.exam_ids:
                rows = self.split_df.loc[exam_id]
                if isinstance(rows, pd.Series):
                    if not np.isnan(rows['label.lv']):
                        exam_ids.append(exam_id)
                else:
                    if not np.isnan(rows.iloc[0]['label.lv']):
                        exam_ids.append(exam_id)
            logger.info(f'using {len(exam_ids)} of {len(self.exam_ids)} exam_ids')
            self.exam_ids = exam_ids
        else:
            logger.info(f'using {len(self.exam_ids)} exam_ids')

        X_dict = {'exam_ids': []}
        Y_dict = {
            'primary':  [],
            'primary_multiclass': [],
            '2normal_binary': []        # labels: control, 1,2 (normal) | 3,4 (abnormal),
        }

        for exam_id in self.exam_ids:
            X_dict['exam_ids'].append(exam_id)

            y_dict = self.get_y(exam_id)
            for t, label in y_dict.items():
                Y_dict[t].append(label)

        Y_dict = {k: torch.from_numpy(np.array(v)) for k, v in Y_dict.items()}
        EmmentalDataset.__init__(self, 'cow-tus-dataset', X_dict=X_dict, Y_dict=Y_dict)
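
The constructor above belongs to a dataset class whose name is not part of the excerpt. A minimal wiring sketch, reusing the EmmentalDataLoader pattern seen in the other examples; the class name CowTusDataset, the paths, and the task name 'primary' are assumptions, not the original source:

from emmental.data import EmmentalDataLoader

# CowTusDataset is a hypothetical stand-in for the class defining the __init__
# (and get_y) shown above; the paths are placeholders.
train_dataset = CowTusDataset(
    dataset_dir='data',
    split_str='train',
    labels_path='data/labels.csv',
    transform_fns=[],
)
train_dataloader = EmmentalDataLoader(
    task_to_label_dict={'primary': 'primary'},  # assumed task name -> Y_dict key
    dataset=train_dataset,
    split='train',
    batch_size=8,
    shuffle=True,
)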
Example #2
def create_dataloaders(task_name, dataset, batch_size, word2id, oov="~#OoV#~"):
    # Create dataloaders
    oov_id = word2id[oov]
    dataloaders = []

    for split in ["train", "valid", "test"]:
        split_x, split_y = dataset[split]
        split_x = [
            torch.LongTensor([word2id.get(w, oov_id) for w in seq])
            for seq in split_x
        ]

        dataloaders.append(
            EmmentalDataLoader(
                task_to_label_dict={task_name: "label"},
                dataset=EmmentalDataset(
                    name=task_name,
                    X_dict={"feature": split_x},
                    Y_dict={"label": split_y},
                ),
                split=split,
                batch_size=batch_size,
                shuffle=(split == "train"),
            ))
        logger.info(
            f"Loaded {split} for {task_name} containing {len(split_x)} samples."
        )

    return dataloaders
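
A hedged usage sketch for create_dataloaders above, assuming dataset maps each split to a (token-sequence list, label tensor) pair and word2id maps tokens to integer ids; all values below are toy assumptions:

import torch

word2id = {"~#OoV#~": 0, "the": 1, "cat": 2, "sat": 3}
dataset = {
    "train": ([["the", "cat", "sat"], ["the", "cat"]], torch.Tensor([1, 2])),
    "valid": ([["the", "cat"]], torch.Tensor([1])),
    "test": ([["cat", "sat"]], torch.Tensor([2])),
}
train_dl, valid_dl, test_dl = create_dataloaders(
    task_name="my_task", dataset=dataset, batch_size=2, word2id=word2id
)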
Example #3
def test_mixed_scheduler(caplog):
    """Unit test of mixed scheduler"""

    caplog.set_level(logging.INFO)

    emmental.Meta.init()

    task1 = "task1"
    x1 = np.random.rand(20, 2)
    y1 = torch.from_numpy(np.random.rand(20))

    task2 = "task2"
    x2 = np.random.rand(30, 3)
    y2 = torch.from_numpy(np.random.rand(30))

    dataloaders = [
        EmmentalDataLoader(
            task_to_label_dict={task_name: "label"},
            dataset=EmmentalDataset(name=task_name,
                                    X_dict={"feature": x},
                                    Y_dict={"label": y}),
            split="train",
            batch_size=10,
            shuffle=True,
        ) for task_name, x, y in [(task1, x1, y1), (task2, x2, y2)]
    ]

    scheduler = MixedScheduler()

    assert scheduler.get_num_batches(dataloaders) == 2

    batch_task_names_1 = [
        batch_data[0][-2] for batch_data in scheduler.get_batches(dataloaders)
    ]
    batch_task_names_2 = [
        batch_data[1][-2] for batch_data in scheduler.get_batches(dataloaders)
    ]

    assert batch_task_names_1 == [task1, task1]
    assert batch_task_names_2 == [task2, task2]

    scheduler = MixedScheduler(fillup=True)

    assert scheduler.get_num_batches(dataloaders) == 3

    batch_task_names_1 = [
        batch_data[0][-2] for batch_data in scheduler.get_batches(dataloaders)
    ]
    batch_task_names_2 = [
        batch_data[1][-2] for batch_data in scheduler.get_batches(dataloaders)
    ]

    assert batch_task_names_1 == [task1, task1, task1]
    assert batch_task_names_2 == [task2, task2, task2]
Example #4
def test_emmental_dataset(caplog):
    """Unit test of emmental dataset"""

    caplog.set_level(logging.INFO)

    x1 = [
        torch.Tensor([1]),
        torch.Tensor([1, 2]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3, 4, 5]),
    ]

    y1 = torch.Tensor([0, 0, 0, 0, 0])

    dataset = EmmentalDataset(X_dict={"data1": x1},
                              Y_dict={"label1": y1},
                              name="new_data")

    # Check if the dataset is correctly constructed
    assert torch.equal(dataset[0][0]["data1"], x1[0])
    assert torch.equal(dataset[0][1]["label1"], y1[0])

    x2 = [
        torch.Tensor([1, 2, 3, 4, 5]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2]),
        torch.Tensor([1]),
    ]

    dataset.add_features(X_dict={"data2": x2})

    # Check adding one more feature to the dataset
    assert torch.equal(dataset[0][0]["data2"], x2[0])

    y2 = torch.Tensor([1, 1, 1, 1, 1])

    dataset.add_labels(Y_dict={"label2": y2})

    # Check adding one more label to the dataset
    assert torch.equal(dataset[0][1]["label2"], y2[0])

    dataset.remove_label(label_name="label1")

    # Check removing a label from the dataset
    assert "label1" not in dataset.Y_dict
Example #5
def test_round_robin_scheduler(caplog):
    """Unit test of round robin scheduler."""
    caplog.set_level(logging.INFO)

    emmental.Meta.init()

    # Set random seed
    set_random_seed(2)

    task1 = "task1"
    x1 = np.random.rand(20, 2)
    y1 = torch.from_numpy(np.random.rand(20))

    task2 = "task2"
    x2 = np.random.rand(30, 3)
    y2 = torch.from_numpy(np.random.rand(30))

    dataloaders = [
        EmmentalDataLoader(
            task_to_label_dict={task_name: "label"},
            dataset=EmmentalDataset(
                name=task_name, X_dict={"feature": x}, Y_dict={"label": y}
            ),
            split="train",
            batch_size=10,
            shuffle=True,
        )
        for task_name, x, y in [(task1, x1, y1), (task2, x2, y2)]
    ]

    scheduler = RoundRobinScheduler()

    assert scheduler.get_num_batches(dataloaders) == 5

    batch_task_names = [
        batch_data[-2] for batch_data in scheduler.get_batches(dataloaders)
    ]

    assert batch_task_names == [task2, task1, task2, task2, task1]

    scheduler = RoundRobinScheduler(fillup=True)

    assert scheduler.get_num_batches(dataloaders) == 6

    batch_task_names = [
        batch_data[-2] for batch_data in scheduler.get_batches(dataloaders)
    ]

    assert batch_task_names == [task2, task1, task2, task2, task1, task1]
Example #6
    def __init__(self, dataset_dir, split_str, labels_path, transform_fns):
        """
        """
        split_path = path.join(dataset_dir, f'{split_str}.csv')
        self.split_df = pd.read_csv(split_path, index_col=0)
        self.labels_df = pd.read_csv(labels_path, index_col=0, header=[0, 1])
        self.loop_idxs = range(len(self.split_df))

        # use df.iloc because loop_ids are not unique...
        loop_idxs = []
        for loop_idx in self.loop_idxs:
            row = self.split_df.iloc[loop_idx]
            loop_type = row['exdir.loop_type']
            if f'label.{loop_type}' in row.keys() and not np.isnan(row[f'label.{loop_type}']):
                loop_idxs.append(loop_idx)
        logger.info(f'using {len(loop_idxs)} of {len(self.loop_idxs)} loop_idxs')
        self.loop_idxs = loop_idxs

        self.transform_fns = transform_fns

        X_dict = {'loop_idxs': []}
        Y_dict = {
            'primary':  [],
            'primary_multiclass': [],
            '2normal_binary': []        # labels: control, 1,2 (normal) | 3,4 (abnormal),
        }

        for loop_idx in self.loop_idxs:
            X_dict['loop_idxs'].append(loop_idx)

            y_dict = self.get_y(loop_idx)
            for t, label in y_dict.items():
                Y_dict[t].append(label)

        Y_dict = {k: torch.from_numpy(np.array(v)) for k, v in Y_dict.items()}
        EmmentalDataset.__init__(self, 'cow-tus-dataset', X_dict=X_dict, Y_dict=Y_dict)
Example #7
    def wrapped_f(dataset):
        X_dict = defaultdict(list)
        Y_dict = defaultdict(list)
        examples = []
        for x_dict, y_dict in dataset:
            # TODO: Consider making sure aug_x_dict is not x_dict!
            aug_x_dict, aug_y_dict = f(x_dict, y_dict)
            if aug_x_dict is not None and aug_y_dict is not None:
                examples.append((aug_x_dict, aug_y_dict))
        for x_dict, y_dict in examples:
            for k, v in x_dict.items():
                X_dict[k].append(v)
            for k, v in y_dict.items():
                Y_dict[k].append(v)
        for k, v in Y_dict.items():
            Y_dict[k] = list_to_tensor(v)
        # X_dict, Y_dict = emmental_collate_fn(examples)
        aug_dataset = EmmentalDataset(name=f.__name__,
                                      X_dict=X_dict,
                                      Y_dict=Y_dict)
        logger.info(
            f"Total {len(aug_dataset)} augmented examples were created "
            f"from AF {f.__name__}")
        return aug_dataset
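
The snippet above is the inner closure of an augmentation decorator; the enclosing function is not included in the excerpt. A structural sketch of what it presumably looks like (the name augmentation_function and the decorated example below are assumptions, not the original source):

def augmentation_function(f):
    def wrapped_f(dataset):
        ...  # body exactly as in the example above
    return wrapped_f

# Hypothetical usage: decorate a per-example function that returns an augmented
# (x_dict, y_dict) pair, or (None, None) to drop the example. Calling the decorated
# function on an EmmentalDataset then yields a new EmmentalDataset of augmented examples.
@augmentation_function
def copy_example(x_dict, y_dict):
    return dict(x_dict), dict(y_dict)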
Example #8
def test_emmental_dataloader(caplog):
    """Unit test of emmental dataloader."""
    caplog.set_level(logging.INFO)

    x1 = [
        torch.Tensor([1]),
        torch.Tensor([1, 2]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3, 4, 5]),
    ]

    y1 = torch.Tensor([0, 0, 0, 0, 0])

    x2 = [
        torch.Tensor([1, 2, 3, 4, 5]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2]),
        torch.Tensor([1]),
    ]

    y2 = torch.Tensor([1, 1, 1, 1, 1])

    dataset = EmmentalDataset(
        X_dict={"data1": x1, "data2": x2},
        Y_dict={"label1": y1, "label2": y2},
        name="new_data",
    )

    dataloader1 = EmmentalDataLoader(
        task_to_label_dict={"task1": "label1"},
        dataset=dataset,
        split="train",
        batch_size=2,
    )

    x_batch, y_batch = next(iter(dataloader1))

    # Check if the dataloader is correctly constructed
    assert dataloader1.task_to_label_dict == {"task1": "label1"}
    assert dataloader1.split == "train"
    assert torch.equal(x_batch["data1"], torch.Tensor([[1, 0], [1, 2]]))
    assert torch.equal(
        x_batch["data2"], torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0]])
    )
    assert torch.equal(y_batch["label1"], torch.Tensor([0, 0]))
    assert torch.equal(y_batch["label2"], torch.Tensor([1, 1]))

    dataloader2 = EmmentalDataLoader(
        task_to_label_dict={"task2": "label2"},
        dataset=dataset,
        split="test",
        batch_size=3,
    )

    x_batch, y_batch = next(iter(dataloader2))

    # Check if the dataloader with different batch size is correctly constructed
    assert dataloader2.task_to_label_dict == {"task2": "label2"}
    assert dataloader2.split == "test"
    assert torch.equal(
        x_batch["data1"], torch.Tensor([[1, 0, 0], [1, 2, 0], [1, 2, 3]])
    )
    assert torch.equal(
        x_batch["data2"],
        torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0], [1, 2, 3, 0, 0]]),
    )
    assert torch.equal(y_batch["label1"], torch.Tensor([0, 0, 0]))
    assert torch.equal(y_batch["label2"], torch.Tensor([1, 1, 1]))

    y3 = [
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
    ]

    dataset.Y_dict["label2"] = y3

    x_batch, y_batch = next(iter(dataloader1))
    # Check the dataloader is correctly updated with the updated dataset
    assert torch.equal(
        x_batch["data2"], torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0]])
    )
    assert torch.equal(y_batch["label2"], torch.Tensor([[2], [2]]))

    x_batch, y_batch = next(iter(dataloader2))
    assert torch.equal(
        x_batch["data2"],
        torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0], [1, 2, 3, 0, 0]]),
    )
    assert torch.equal(y_batch["label2"], torch.Tensor([[2], [2], [2]]))
Example #9
def load_data_from_db(postgres_db_name,
                      postgres_db_location,
                      label_dict,
                      char_dict=None,
                      clobber_label=True):
    """Load data from database.
    """

    print(f"Loading data from db {postgres_db_name}")
    # Start DB connection
    conn_string = os.path.join(postgres_db_location, postgres_db_name)
    session = Meta.init(conn_string).Session()

    # Printing number of docs/sentences
    print("==============================")
    print(f"DB contents for {postgres_db_name}:")
    print(f"Number of documents: {session.query(Document).count()}")
    print("==============================")

    docs = session.query(Document).all()

    uid_field = []
    text_field = []
    label_field = []
    missed_ids = 0

    term = r"([Ll]ocation:[\w\W]{1,200}</.{0,20}>|\W[cC]ity:[\w\W]{1,200}</.{0,20}>|\d\dyo\W|\d\d.{0,10}\Wyo\W|\d\d.{0,10}\Wold\W|\d\d.{0,10}\Wyoung\W|\Wage\W.{0,10}\d\d)"

    for doc in docs:
        if (doc.name in label_dict) or clobber_label:
            uid_field.append(doc.name)
            text_field.append(get_posting_html_fast(doc.text, term))
            if not clobber_label:
                label_field.append(label_dict[doc.name])
            else:
                label_field.append(-1)
        else:
            missed_ids += 1

    # Printing data stats
    print("==============================")
    print(f"Loaded {len(uid_field)} ids")
    print(f"Loaded {len(text_field)} text")
    print(f"Loaded {len(label_field)} labels")
    print(f"Missed {missed_ids} samples")

    X_dict = {"text": text_field, "uid": uid_field}
    Y_dict = {"label": torch.from_numpy(np.array(label_field))}

    dataset = EmmentalDataset(name="HT",
                              X_dict=X_dict,
                              Y_dict=Y_dict,
                              uid="uid")

    emb_field = []
    for i in range(len(dataset)):
        emb_field.append(
            torch.from_numpy(
                np.array(list(map(char_dict.lookup, dataset[i][0]['text'])))))
    dataset.add_features({"emb": emb_field})
    return dataset
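
load_data_from_db above maps every character of the loaded text through char_dict.lookup, but the char_dict class itself is not shown. A minimal stand-in with that interface, an assumption rather than the original implementation:

class CharDict:
    """Hypothetical character vocabulary exposing the lookup() method used above."""

    def __init__(self, chars, unk_id=0):
        self.char2id = {c: i + 1 for i, c in enumerate(chars)}
        self.unk_id = unk_id

    def lookup(self, char):
        return self.char2id.get(char, self.unk_id)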
Example #10
File: test_data.py  Project: SenWu/emmental
def test_emmental_dataset(caplog):
    """Unit test of emmental dataset."""
    caplog.set_level(logging.INFO)

    dirpath = "temp_test_data"

    Meta.reset()
    emmental.init(dirpath)

    x1 = [
        torch.Tensor([1]),
        torch.Tensor([1, 2]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3, 4, 5]),
    ]

    y1 = torch.Tensor([0, 0, 0, 0, 0])

    dataset = EmmentalDataset(X_dict={"data1": x1},
                              Y_dict={"label1": y1},
                              name="new_data")

    # Check if the dataset is correctly constructed
    assert torch.equal(dataset[0][0]["data1"], x1[0])
    assert torch.equal(dataset[0][1]["label1"], y1[0])

    x2 = [
        torch.Tensor([1, 2, 3, 4, 5]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2]),
        torch.Tensor([1]),
    ]

    dataset.add_features(X_dict={"data2": x2})

    dataset.remove_feature("data2")
    assert "data2" not in dataset.X_dict

    dataset.add_features(X_dict={"data2": x2})

    # Check adding one more feature to the dataset
    assert torch.equal(dataset[0][0]["data2"], x2[0])

    y2 = torch.Tensor([1, 1, 1, 1, 1])

    dataset.add_labels(Y_dict={"label2": y2})

    with pytest.raises(ValueError):
        dataset.add_labels(Y_dict={"label2": x2})

    # Check adding one more label to the dataset
    assert torch.equal(dataset[0][1]["label2"], y2[0])

    dataset.remove_label(label_name="label1")

    # Check removing a label from the dataset
    assert "label1" not in dataset.Y_dict

    with pytest.raises(ValueError):
        dataset = EmmentalDataset(X_dict={"data1": x1},
                                  Y_dict={"label1": y1},
                                  name="new_data",
                                  uid="ids")

    dataset = EmmentalDataset(X_dict={"_uids_": x1},
                              Y_dict={"label1": y1},
                              name="new_data")

    dataset = EmmentalDataset(X_dict={"data1": x1}, name="new_data")

    # Check if the dataset is correctly constructed
    assert torch.equal(dataset[0]["data1"], x1[0])

    dataset.add_features(X_dict={"data2": x2})

    dataset.remove_feature("data2")
    assert "data2" not in dataset.X_dict

    dataset.add_features(X_dict={"data2": x2})

    # Check adding one more feature to the dataset
    assert torch.equal(dataset[0]["data2"], x2[0])

    y2 = torch.Tensor([1, 1, 1, 1, 1])

    dataset.add_labels(Y_dict={"label2": y2})

    # Check adding one more label to the dataset
    assert torch.equal(dataset[0][1]["label2"], y2[0])

    shutil.rmtree(dirpath)
Example #11
def parse(jsonl_path, tokenizer, uid, max_data_samples, max_sequence_length):
    logger.info(f"Loading data from {jsonl_path}.")
    rows = [json.loads(row) for row in open(jsonl_path, encoding="utf-8")]
    for i in range(2):
        logger.info(f"Sample {i}: {rows[i]}")

    # Truncate to max_data_samples
    if max_data_samples:
        rows = rows[:max_data_samples]
        logger.info(f"Truncating to {max_data_samples} samples.")

    # unique ids
    uids = []

    # sentence1 text
    sentence1s = []
    # sentence2 text
    sentence2s = []
    # label
    labels = []

    bert_token_ids = []
    bert_token_masks = []
    bert_token_segments = []

    # Check the maximum token length
    max_len = -1

    for row in rows:
        index = row["idx"]
        sentence1 = row["premise"]
        sentence2 = row["hypothesis"]
        label = row["label"] if "label" in row else "entailment"

        uids.append(index)
        sentence1s.append(sentence1)
        sentence2s.append(sentence2)
        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        # Tokenize sentences
        sent1_tokens = tokenizer.tokenize(sentence1)
        sent2_tokens = tokenizer.tokenize(sentence2)

        if len(sent1_tokens) + len(sent2_tokens) > max_len:
            max_len = len(sent1_tokens) + len(sent2_tokens)

        while True:
            total_length = len(sent1_tokens) + len(sent2_tokens)
            # Account for [CLS], [SEP], [SEP] with "- 3"
            if total_length <= max_sequence_length - 3:
                break
            if len(sent1_tokens) > len(sent2_tokens):
                sent1_tokens.pop()
            else:
                sent2_tokens.pop()

        # Convert to BERT manner
        tokens = ["[CLS]"] + sent1_tokens + ["[SEP]"]
        token_segments = [0] * len(tokens)

        tokens += sent2_tokens + ["[SEP]"]
        token_segments += [1] * (len(sent2_tokens) + 1)

        token_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Generate mask where 1 for real tokens and 0 for padding tokens
        token_masks = [1] * len(token_ids)

        bert_token_ids.append(torch.LongTensor(token_ids))
        bert_token_masks.append(torch.LongTensor(token_masks))
        bert_token_segments.append(torch.LongTensor(token_segments))

    labels = torch.from_numpy(np.array(labels))

    logger.info(f"Max token len {max_len}")

    return EmmentalDataset(
        name="SuperGLUE",
        uid="uids",
        X_dict={
            "uids": uids,
            "sentence1": sentence1s,
            "sentence2": sentence2s,
            "token_ids": bert_token_ids,
            "token_masks": bert_token_masks,
            "token_segments": bert_token_segments,
        },
        Y_dict={"labels": labels},
    )
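
A hedged usage sketch for the parser above. The tokenizer is assumed to be a HuggingFace BertTokenizer (the original module may wrap a different BERT tokenizer), and the path, task name, and batch size are placeholders; the call also relies on the module-level TASK_NAME and SuperGLUE_LABEL_MAPPING constants referenced inside parse:

from transformers import BertTokenizer
from emmental.data import EmmentalDataLoader

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
nli_dataset = parse(
    jsonl_path="data/train.jsonl",  # hypothetical path
    tokenizer=tokenizer,
    uid="uids",
    max_data_samples=None,          # keep all samples
    max_sequence_length=128,
)
nli_dataloader = EmmentalDataLoader(
    task_to_label_dict={"my_superglue_task": "labels"},  # assumed task name -> Y_dict key
    dataset=nli_dataset,
    split="train",
    batch_size=16,
    shuffle=True,
)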
Example #12
def parse(jsonl_path, tokenizer, uid, max_data_samples, max_sequence_length):
    logger.info(f"Loading data from {jsonl_path}.")
    rows = [json.loads(row) for row in open(jsonl_path, encoding="utf-8")]
    for i in range(2):
        logger.info(f"Sample {i}: {rows[i]}")

    # Truncate to max_data_samples
    if max_data_samples:
        rows = rows[:max_data_samples]
        logger.info(f"Truncating to {max_data_samples} samples.")

    # unique ids
    uids = []
    # sentence1
    sent1s = []
    # sentence2
    sent2s = []
    # choice1
    choice1s = []
    # choice2
    choice2s = []

    labels = []

    bert_token1_ids = []
    bert_token2_ids = []

    bert_token1_masks = []
    bert_token2_masks = []

    bert_token1_segments = []
    bert_token2_segments = []

    # Check the maximum token length
    max_len = -1

    for sample in rows:
        index = sample["idx"]
        sent1 = sample["premise"]
        sent2 = sample["question"]

        sent2 = ("What was the cause of this?"
                 if sent2 == "cause" else "What happened as a result?")

        choice1 = sample["choice1"]
        choice2 = sample["choice2"]
        label = sample["label"] if "label" in sample else True
        uids.append(index)
        sent1s.append(sent1)
        sent2s.append(sent2)
        choice1s.append(choice1)
        choice2s.append(choice2)
        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        # Tokenize sentences
        sent1_tokens = tokenizer.tokenize(sent1)
        sent2_tokens = tokenizer.tokenize(sent2)

        # Tokenize choices
        choice1_tokens = tokenizer.tokenize(choice1)
        choice2_tokens = tokenizer.tokenize(choice2)

        # Convert to BERT manner
        tokens1 = (["[CLS]"] + sent1_tokens + ["[SEP]"] + sent2_tokens +
                   ["[SEP]"] + choice1_tokens + ["[SEP]"])
        tokens2 = (["[CLS]"] + sent1_tokens + ["[SEP]"] + sent2_tokens +
                   ["[SEP]"] + choice2_tokens + ["[SEP]"])

        token1_ids = tokenizer.convert_tokens_to_ids(tokens1)
        token2_ids = tokenizer.convert_tokens_to_ids(tokens2)

        padding1 = [0] * (max_sequence_length - len(token1_ids))
        padding2 = [0] * (max_sequence_length - len(token2_ids))

        token1_masks = [1] * len(token1_ids)
        token2_masks = [1] * len(token2_ids)

        token1_segments = [0] * len(token1_ids)
        token2_segments = [0] * len(token2_ids)

        token1_ids += padding1
        token2_ids += padding2

        token1_masks += padding1
        token2_masks += padding2

        token1_segments += padding1
        token2_segments += padding2

        if len(token1_ids) > max_len:
            max_len = len(token1_ids)
        if len(token2_ids) > max_len:
            max_len = len(token2_ids)

        bert_token1_ids.append(torch.LongTensor(token1_ids))
        bert_token2_ids.append(torch.LongTensor(token2_ids))

        bert_token1_masks.append(torch.LongTensor(token1_masks))
        bert_token2_masks.append(torch.LongTensor(token2_masks))

        bert_token1_segments.append(torch.LongTensor(token1_segments))
        bert_token2_segments.append(torch.LongTensor(token2_segments))

    labels = torch.from_numpy(np.array(labels))

    logger.info(f"Max token len {max_len}")

    return EmmentalDataset(
        name="SuperGLUE",
        uid="uids",
        X_dict={
            "uids": uids,
            "sentence1": sent1s,
            "sentence2": sent2s,
            "choice1": choice1s,
            "choice2": choice2s,
            "token1_ids": bert_token1_ids,
            "token2_ids": bert_token2_ids,
            "token1_masks": bert_token1_masks,
            "token2_masks": bert_token2_masks,
            "token1_segments": bert_token1_segments,
            "token2_segments": bert_token2_segments,
        },
        Y_dict={"labels": labels},
    )
Example #13
File: test_data.py  Project: SenWu/emmental
def test_emmental_dataloader(caplog):
    """Unit test of emmental dataloader."""
    caplog.set_level(logging.INFO)

    dirpath = "temp_test_data"

    Meta.reset()
    emmental.init(dirpath)

    x1 = [
        torch.Tensor([1]),
        torch.Tensor([1, 2]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3, 4, 5]),
    ]

    y1 = torch.Tensor([0, 0, 0, 0, 0])

    x2 = [
        torch.Tensor([1, 2, 3, 4, 5]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2]),
        torch.Tensor([1]),
    ]

    y2 = torch.Tensor([1, 1, 1, 1, 1])

    dataset = EmmentalDataset(
        X_dict={
            "data1": x1,
            "data2": x2
        },
        Y_dict={
            "label1": y1,
            "label2": y2
        },
        name="new_data",
    )

    dataloader1 = EmmentalDataLoader(
        task_to_label_dict={"task1": "label1"},
        dataset=dataset,
        split="train",
        batch_size=2,
        num_workers=2,
    )

    x_batch, y_batch = next(iter(dataloader1))

    # Check if the dataloader is correctly constructed
    assert dataloader1.task_to_label_dict == {"task1": "label1"}
    assert dataloader1.split == "train"
    assert torch.equal(x_batch["data1"], torch.Tensor([[1, 0], [1, 2]]))
    assert torch.equal(x_batch["data2"],
                       torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0]]))
    assert torch.equal(y_batch["label1"], torch.Tensor([0, 0]))
    assert torch.equal(y_batch["label2"], torch.Tensor([1, 1]))

    dataloader2 = EmmentalDataLoader(
        task_to_label_dict={"task2": "label2"},
        dataset=dataset,
        split="test",
        batch_size=3,
        collate_fn=partial(emmental_collate_fn, min_data_len=0,
                           max_data_len=0),
    )

    x_batch, y_batch = next(iter(dataloader2))

    # Check if the dataloader with different batch size is correctly constructed
    assert dataloader2.task_to_label_dict == {"task2": "label2"}
    assert dataloader2.split == "test"
    assert torch.equal(x_batch["data1"],
                       torch.Tensor([[1, 0, 0], [1, 2, 0], [1, 2, 3]]))
    assert torch.equal(
        x_batch["data2"],
        torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0], [1, 2, 3, 0, 0]]),
    )
    assert torch.equal(y_batch["label1"], torch.Tensor([0, 0, 0]))
    assert torch.equal(y_batch["label2"], torch.Tensor([1, 1, 1]))

    y3 = [
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
        torch.Tensor([2]),
    ]

    dataset.Y_dict["label2"] = y3

    x_batch, y_batch = next(iter(dataloader1))
    # Check the dataloader is correctly updated with the updated dataset
    assert torch.equal(x_batch["data2"],
                       torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0]]))
    assert torch.equal(y_batch["label2"], torch.Tensor([[2], [2]]))

    x_batch, y_batch = next(iter(dataloader2))
    assert torch.equal(
        x_batch["data2"],
        torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0], [1, 2, 3, 0, 0]]),
    )
    assert torch.equal(y_batch["label2"], torch.Tensor([[2], [2], [2]]))

    dataset = EmmentalDataset(X_dict={"data1": x1}, name="new_data")

    dataloader3 = EmmentalDataLoader(task_to_label_dict={"task1": None},
                                     dataset=dataset,
                                     split="train",
                                     batch_size=2)

    x_batch = next(iter(dataloader3))

    # Check if the dataloader is correctly constructed
    assert dataloader3.task_to_label_dict == {"task1": None}
    assert dataloader3.split == "train"
    assert torch.equal(x_batch["data1"], torch.Tensor([[1, 0], [1, 2]]))

    # Check there is an error if task_to_label_dict has a task-to-label mapping
    # while the dataset has no Y_dict
    with pytest.raises(ValueError):
        EmmentalDataLoader(
            task_to_label_dict={"task1": "label1"},
            dataset=dataset,
            split="train",
            batch_size=2,
        )

    shutil.rmtree(dirpath)
Example #14
def parse(jsonl_path, tokenizer, uid, max_data_samples, max_sequence_length):
    logger.info(f"Loading data from {jsonl_path}.")
    rows = [json.loads(row) for row in open(jsonl_path, encoding="utf-8")]
    for i in range(2):
        logger.info(f"Sample {i}: {rows[i]}")

    # Truncate to max_data_samples
    if max_data_samples:
        rows = rows[:max_data_samples]
        logger.info(f"Truncating to {max_data_samples} samples.")

    # unique ids
    uids = []

    # sentence1 text
    sentence1s = []
    # sentence2 text
    sentence2s = []
    # sentence1 idx
    sentence1_idxs = []
    # sentence2 idx
    sentence2_idxs = []
    # word in common
    words = []
    # pos tag
    poses = []
    # label
    labels = []

    token1_idxs = []
    token2_idxs = []

    bert_token_ids = []
    bert_token_masks = []
    bert_token_segments = []

    # Check the maximum token length
    max_len = -1

    for idx, row in enumerate(rows):

        sentence1 = row["sentence1"]
        sentence2 = row["sentence2"]
        word = row["word"]
        pos = row["pos"]
        sentence1_idx = int(row["sentence1_idx"])
        sentence2_idx = int(row["sentence2_idx"])
        label = row["label"] if "label" in row else True

        uids.append(idx)
        sentence1s.append(sentence1)
        sentence2s.append(sentence2)
        sentence1_idxs.append(sentence1_idx)
        sentence2_idxs.append(sentence2_idx)
        words.append(word)
        poses.append(pos)
        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        # Tokenize sentences
        sent1_tokens = tokenizer.tokenize(sentence1)
        sent2_tokens = tokenizer.tokenize(sentence2)

        word_tokens_in_sent1 = tokenizer.tokenize(
            sentence1.split()[sentence1_idx])
        word_tokens_in_sent2 = tokenizer.tokenize(
            sentence2.split()[sentence2_idx])

        while True:
            total_length = len(sent1_tokens) + len(sent2_tokens)
            if total_length > max_len:
                max_len = total_length
            # Account for [CLS], [SEP], [SEP] with "- 3"
            if total_length <= max_sequence_length - 3:
                break
            if len(sent1_tokens) > len(sent2_tokens):
                sent1_tokens.pop()
            else:
                sent2_tokens.pop()

        for idx in range(sentence1_idx - 1, len(sent1_tokens)):
            if (sent1_tokens[idx:idx + len(word_tokens_in_sent1)] ==
                    word_tokens_in_sent1):
                token1_idxs.append(idx + 2)  # Add [CLS]
                break

        for idx in range(sentence2_idx - 1, len(sent2_tokens)):
            if (sent2_tokens[idx:idx + len(word_tokens_in_sent2)] ==
                    word_tokens_in_sent2):
                token2_idxs.append(
                    idx + len(sent1_tokens) + 1
                )  # Add the length of the first sentence and [CLS] + [SEP]
                break

        # Convert to BERT manner
        tokens = sent1_tokens + ["[SEP]"]
        token_segments = [0] * len(tokens)

        tokens += sent2_tokens + ["[SEP]"] + ["[CLS]"]
        token_segments += [1] * (len(sent2_tokens) + 1)

        token_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Generate mask where 1 for real tokens and 0 for padding tokens
        token_masks = [1] * len(token_ids)

        bert_token_ids.append(torch.LongTensor(token_ids))
        bert_token_masks.append(torch.LongTensor(token_masks))
        bert_token_segments.append(torch.LongTensor(token_segments))

    token1_idxs = torch.from_numpy(np.array(token1_idxs))
    token2_idxs = torch.from_numpy(np.array(token2_idxs))

    labels = torch.from_numpy(np.array(labels))

    logger.info(f"Max token len {max_len}")

    return EmmentalDataset(
        name="SuperGLUE",
        uid="uids",
        X_dict={
            "uids": uids,
            "sentence1": sentence1s,
            "sentence2": sentence2s,
            "word": words,
            "pos": poses,
            "sentence1_idx": sentence1_idxs,
            "sentence2_idx": sentence2_idxs,
            "token1_idx": token1_idxs,
            "token2_idx": token2_idxs,
            "token_ids": bert_token_ids,
            "token_masks": bert_token_masks,
            "token_segments": bert_token_segments,
        },
        Y_dict={"labels": labels},
    )
Example #15
                bert_model_name=args.bert_model,
                max_data_samples=args.max_data_samples,
                max_sequence_length=args.max_sequence_length,
            )
            X_dict = {
                "token_ids": token_ids,
                "token_segments": token_segments,
                "token_masks": token_masks,
            }
            Y_dict = {"labels": labels}

            if task_name not in datasets:
                datasets[task_name] = {}

            datasets[task_name][split] = EmmentalDataset(
                name="GLUE", X_dict=X_dict, Y_dict=Y_dict
            )

            logger.info(f"Loaded {split} for {task_name}.")

    dataloaders = []

    for task_name in args.task:
        for split in ["train", "dev", "test"]:
            dataloaders.append(
                EmmentalDataLoader(
                    task_to_label_dict={task_name: "labels"},
                    dataset=datasets[task_name][split],
                    split=split,
                    batch_size=args.batch_size,
                    shuffle=True if split == "train" else False,
Example #16
def test_e2e(caplog):
    """Run an end-to-end test."""
    caplog.set_level(logging.INFO)

    dirpath = "temp_test_e2e"
    use_exact_log_path = False
    Meta.reset()
    emmental.init(dirpath, use_exact_log_path=use_exact_log_path)

    config = {
        "meta_config": {
            "seed": 0
        },
        "learner_config": {
            "n_epochs": 3,
            "optimizer_config": {
                "lr": 0.01,
                "grad_clip": 100
            },
        },
        "logging_config": {
            "counter_unit": "epoch",
            "evaluation_freq": 1,
            "writer_config": {
                "writer": "tensorboard",
                "verbose": True
            },
            "checkpointing": True,
            "checkpointer_config": {
                "checkpoint_path": None,
                "checkpoint_freq": 1,
                "checkpoint_metric": {
                    "model/all/train/loss": "min"
                },
                "checkpoint_task_metrics": None,
                "checkpoint_runway": 1,
                "checkpoint_all": False,
                "clear_intermediate_checkpoints": True,
                "clear_all_checkpoints": True,
            },
        },
    }
    emmental.Meta.update_config(config)

    # Generate synthetic data
    N = 500
    X = np.random.random((N, 2)) * 2 - 1
    Y1 = (X[:, 0] > X[:, 1] + 0.25).astype(int)
    Y2 = (X[:, 0] > X[:, 1] + 0.2).astype(int)

    X = [torch.Tensor(X[i]) for i in range(N)]
    # Create dataset and dataloader

    X_train, X_dev, X_test = (
        X[:int(0.8 * N)],
        X[int(0.8 * N):int(0.9 * N)],
        X[int(0.9 * N):],
    )
    Y1_train, Y1_dev, Y1_test = (
        torch.tensor(Y1[:int(0.8 * N)]),
        torch.tensor(Y1[int(0.8 * N):int(0.9 * N)]),
        torch.tensor(Y1[int(0.9 * N):]),
    )
    Y2_train, Y2_dev, Y2_test = (
        torch.tensor(Y2[:int(0.8 * N)]),
        torch.tensor(Y2[int(0.8 * N):int(0.9 * N)]),
        torch.tensor(Y2[int(0.9 * N):]),
    )

    train_dataset1 = EmmentalDataset(name="synthetic",
                                     X_dict={"data": X_train},
                                     Y_dict={"label1": Y1_train})

    train_dataset2 = EmmentalDataset(name="synthetic",
                                     X_dict={"data": X_train},
                                     Y_dict={"label2": Y2_train})

    dev_dataset1 = EmmentalDataset(name="synthetic",
                                   X_dict={"data": X_dev},
                                   Y_dict={"label1": Y1_dev})

    dev_dataset2 = EmmentalDataset(name="synthetic",
                                   X_dict={"data": X_dev},
                                   Y_dict={"label2": Y2_dev})

    test_dataset1 = EmmentalDataset(name="synthetic",
                                    X_dict={"data": X_test},
                                    Y_dict={"label1": Y1_test})

    test_dataset2 = EmmentalDataset(name="synthetic",
                                    X_dict={"data": X_test},
                                    Y_dict={"label2": Y2_test})

    task_to_label_dict = {"task1": "label1"}

    train_dataloader1 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=train_dataset1,
        split="train",
        batch_size=10,
    )
    dev_dataloader1 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=dev_dataset1,
        split="valid",
        batch_size=10,
    )
    test_dataloader1 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=test_dataset1,
        split="test",
        batch_size=10,
    )

    task_to_label_dict = {"task2": "label2"}

    train_dataloader2 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=train_dataset2,
        split="train",
        batch_size=10,
    )
    dev_dataloader2 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=dev_dataset2,
        split="valid",
        batch_size=10,
    )
    test_dataloader2 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=test_dataset2,
        split="test",
        batch_size=10,
    )

    # Create task
    def ce_loss(task_name, immediate_ouput_dict, Y, active):
        module_name = f"{task_name}_pred_head"
        return F.cross_entropy(immediate_ouput_dict[module_name][0][active],
                               (Y.view(-1))[active])

    def output(task_name, immediate_ouput_dict):
        module_name = f"{task_name}_pred_head"
        return F.softmax(immediate_ouput_dict[module_name][0], dim=1)

    task_metrics = {"task1": ["accuracy"], "task2": ["accuracy", "roc_auc"]}

    tasks = [
        EmmentalTask(
            name=task_name,
            module_pool=nn.ModuleDict({
                "input_module":
                nn.Linear(2, 8),
                f"{task_name}_pred_head":
                nn.Linear(8, 2),
            }),
            task_flow=[
                {
                    "name": "input",
                    "module": "input_module",
                    "inputs": [("_input_", "data")],
                },
                {
                    "name": f"{task_name}_pred_head",
                    "module": f"{task_name}_pred_head",
                    "inputs": [("input", 0)],
                },
            ],
            loss_func=partial(ce_loss, task_name),
            output_func=partial(output, task_name),
            scorer=Scorer(metrics=task_metrics[task_name]),
        ) for task_name in ["task1", "task2"]
    ]

    # Build model

    mtl_model = EmmentalModel(name="all", tasks=tasks)

    # Create learner
    emmental_learner = EmmentalLearner()

    # Learning
    emmental_learner.learn(
        mtl_model,
        [
            train_dataloader1, train_dataloader2, dev_dataloader1,
            dev_dataloader2
        ],
    )

    test1_score = mtl_model.score(test_dataloader1)
    test2_score = mtl_model.score(test_dataloader2)

    assert test1_score["task1/synthetic/test/accuracy"] >= 0.7
    assert (test1_score["model/all/test/macro_average"] ==
            test1_score["task1/synthetic/test/accuracy"])
    assert test2_score["task2/synthetic/test/accuracy"] >= 0.7
    assert test2_score["task2/synthetic/test/roc_auc"] >= 0.7

    shutil.rmtree(dirpath)
Example #17
def test_e2e(caplog):
    """Run an end-to-end test."""
    caplog.set_level(logging.INFO)

    dirpath = "temp_test_e2e"

    Meta.reset()
    emmental.init(dirpath)

    # Generate synthetic data
    N = 50
    X = np.random.random((N, 2)) * 2 - 1
    Y1 = (X[:, 0] > X[:, 1] + 0.25).astype(int) + 1
    Y2 = (-X[:, 0] > X[:, 1] + 0.25).astype(int) + 1

    # Create dataset and dataloader

    splits = [0.8, 0.1, 0.1]

    X_train, X_dev, X_test = [], [], []
    Y1_train, Y1_dev, Y1_test = [], [], []
    Y2_train, Y2_dev, Y2_test = [], [], []

    for i in range(N):
        if i <= N * splits[0]:
            X_train.append(torch.Tensor(X[i]))
            Y1_train.append(Y1[i])
            Y2_train.append(Y2[i])
        elif i < N * (splits[0] + splits[1]):
            X_dev.append(torch.Tensor(X[i]))
            Y1_dev.append(Y1[i])
            Y2_dev.append(Y2[i])
        else:
            X_test.append(torch.Tensor(X[i]))
            Y1_test.append(Y1[i])
            Y2_test.append(Y2[i])

    Y1_train = torch.from_numpy(np.array(Y1_train))
    Y1_dev = torch.from_numpy(np.array(Y1_dev))
    Y1_test = torch.from_numpy(np.array(Y1_test))

    Y2_train = torch.from_numpy(np.array(Y2_train))
    Y2_dev = torch.from_numpy(np.array(Y2_dev))
    Y2_test = torch.from_numpy(np.array(Y2_test))

    train_dataset1 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_train}, Y_dict={"label1": Y1_train}
    )

    train_dataset2 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_train}, Y_dict={"label2": Y2_train}
    )

    dev_dataset1 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_dev}, Y_dict={"label1": Y1_dev}
    )

    dev_dataset2 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_dev}, Y_dict={"label2": Y2_dev}
    )

    test_dataset1 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_test}, Y_dict={"label1": Y2_test}
    )

    test_dataset2 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_test}, Y_dict={"label2": Y2_test}
    )

    task_to_label_dict = {"task1": "label1"}

    train_dataloader1 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=train_dataset1,
        split="train",
        batch_size=10,
    )
    dev_dataloader1 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=dev_dataset1,
        split="valid",
        batch_size=10,
    )
    test_dataloader1 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=test_dataset1,
        split="test",
        batch_size=10,
    )

    task_to_label_dict = {"task2": "label2"}

    train_dataloader2 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=train_dataset2,
        split="train",
        batch_size=10,
    )
    dev_dataloader2 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=dev_dataset2,
        split="valid",
        batch_size=10,
    )
    test_dataloader2 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=test_dataset2,
        split="test",
        batch_size=10,
    )

    # Create task
    def ce_loss(task_name, immediate_ouput_dict, Y, active):
        module_name = f"{task_name}_pred_head"
        return F.cross_entropy(
            immediate_ouput_dict[module_name][0][active], (Y.view(-1) - 1)[active]
        )

    def output(task_name, immediate_ouput_dict):
        module_name = f"{task_name}_pred_head"
        return F.softmax(immediate_ouput_dict[module_name][0], dim=1)

    task_name = "task1"

    task1 = EmmentalTask(
        name=task_name,
        module_pool=nn.ModuleDict(
            {"input_module": nn.Linear(2, 8), f"{task_name}_pred_head": nn.Linear(8, 2)}
        ),
        task_flow=[
            {
                "name": "input",
                "module": "input_module",
                "inputs": [("_input_", "data")],
            },
            {
                "name": f"{task_name}_pred_head",
                "module": f"{task_name}_pred_head",
                "inputs": [("input", 0)],
            },
        ],
        loss_func=partial(ce_loss, task_name),
        output_func=partial(output, task_name),
        scorer=Scorer(metrics=["accuracy", "roc_auc"]),
    )

    task_name = "task2"

    task2 = EmmentalTask(
        name=task_name,
        module_pool=nn.ModuleDict(
            {"input_module": nn.Linear(2, 8), f"{task_name}_pred_head": nn.Linear(8, 2)}
        ),
        task_flow=[
            {
                "name": "input",
                "module": "input_module",
                "inputs": [("_input_", "data")],
            },
            {
                "name": f"{task_name}_pred_head",
                "module": f"{task_name}_pred_head",
                "inputs": [("input", 0)],
            },
        ],
        loss_func=partial(ce_loss, task_name),
        output_func=partial(output, task_name),
        scorer=Scorer(metrics=["accuracy", "roc_auc"]),
    )

    # Build model

    mtl_model = EmmentalModel(name="all", tasks=[task1, task2])

    # Create learner

    emmental_learner = EmmentalLearner()

    # Update learning config
    Meta.update_config(
        config={"learner_config": {"n_epochs": 10, "optimizer_config": {"lr": 0.01}}}
    )

    # Learning
    emmental_learner.learn(
        mtl_model,
        [train_dataloader1, train_dataloader2, dev_dataloader1, dev_dataloader2],
    )

    test1_score = mtl_model.score(test_dataloader1)
    test2_score = mtl_model.score(test_dataloader2)

    assert test1_score["task1/synthetic/test/accuracy"] >= 0.5
    assert test1_score["task1/synthetic/test/roc_auc"] >= 0.6
    assert test2_score["task2/synthetic/test/accuracy"] >= 0.5
    assert test2_score["task2/synthetic/test/roc_auc"] >= 0.6

    shutil.rmtree(dirpath)
Example #18
def parse(csv_path, tokenizer, uid, max_data_samples, max_sequence_length):
    logger.info(f"Loading data from {csv_path}.")
    rows = pd.read_csv(csv_path)

    # Truncate to max_data_samples
    if max_data_samples:
        rows = rows[:max_data_samples]
        logger.info(f"Truncating to {max_data_samples} samples.")

    # unique ids
    uids = []
    # sentence1
    sent1s = []
    # sentence2
    sent2s = []
    # choice1
    choice1s = []
    # choice2
    choice2s = []
    # choice3
    choice3s = []
    # choice4
    choice4s = []

    labels = []

    bert_token1_ids = []
    bert_token2_ids = []
    bert_token3_ids = []
    bert_token4_ids = []

    bert_token1_masks = []
    bert_token2_masks = []
    bert_token3_masks = []
    bert_token4_masks = []

    bert_token1_segments = []
    bert_token2_segments = []
    bert_token3_segments = []
    bert_token4_segments = []

    # Check the maximum token length
    max_len = -1

    for ex_idx, ex in rows.iterrows():
        sent1 = ex["sent1"]
        sent2 = ex["sent2"]

        choice1 = ex["ending0"]
        choice2 = ex["ending1"]
        choice3 = ex["ending2"]
        choice4 = ex["ending3"]

        label = ex["label"] if "label" in ex else 0

        uids.append(ex_idx)
        sent1s.append(sent1)
        sent2s.append(sent2)
        choice1s.append(choice1)
        choice2s.append(choice2)
        choice3s.append(choice3)
        choice4s.append(choice4)

        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        # Tokenize sentences
        sent1_tokens = tokenizer.tokenize(sent1)
        sent2_tokens = tokenizer.tokenize(sent2)
        choice1_tokens = tokenizer.tokenize(choice1)
        choice2_tokens = tokenizer.tokenize(choice2)
        choice3_tokens = tokenizer.tokenize(choice3)
        choice4_tokens = tokenizer.tokenize(choice4)

        # Convert to BERT manner
        token1 = (sent1_tokens + ["[SEP]"] + sent2_tokens + choice1_tokens +
                  ["[SEP]"] + ["[CLS]"])
        token2 = (sent1_tokens + ["[SEP]"] + sent2_tokens + choice2_tokens +
                  ["[SEP]"] + ["[CLS]"])
        token3 = (sent1_tokens + ["[SEP]"] + sent2_tokens + choice3_tokens +
                  ["[SEP]"] + ["[CLS]"])
        token4 = (sent1_tokens + ["[SEP]"] + sent2_tokens + choice4_tokens +
                  ["[SEP]"] + ["[CLS]"])

        max_choice_len = 0

        token1_ids = tokenizer.convert_tokens_to_ids(
            token1)[:max_sequence_length]
        token2_ids = tokenizer.convert_tokens_to_ids(
            token2)[:max_sequence_length]
        token3_ids = tokenizer.convert_tokens_to_ids(
            token3)[:max_sequence_length]
        token4_ids = tokenizer.convert_tokens_to_ids(
            token4)[:max_sequence_length]

        token1_masks = [1] * len(token1_ids)
        token2_masks = [1] * len(token2_ids)
        token3_masks = [1] * len(token3_ids)
        token4_masks = [1] * len(token4_ids)

        token1_segments = [0] * len(token1_ids)
        token2_segments = [0] * len(token2_ids)
        token3_segments = [0] * len(token3_ids)
        token4_segments = [0] * len(token4_ids)

        if len(token1_ids) > max_len:
            max_len = len(token1_ids)
        if len(token2_ids) > max_len:
            max_len = len(token2_ids)
        if len(token3_ids) > max_len:
            max_len = len(token3_ids)
        if len(token4_ids) > max_len:
            max_len = len(token4_ids)

        max_choice_len = max(max_choice_len, len(token1_ids))
        max_choice_len = max(max_choice_len, len(token2_ids))
        max_choice_len = max(max_choice_len, len(token3_ids))
        max_choice_len = max(max_choice_len, len(token4_ids))

        token1_ids += [0] * (max_choice_len - len(token1_ids))
        token2_ids += [0] * (max_choice_len - len(token2_ids))
        token3_ids += [0] * (max_choice_len - len(token3_ids))
        token4_ids += [0] * (max_choice_len - len(token4_ids))

        token1_masks += [0] * (max_choice_len - len(token1_masks))
        token2_masks += [0] * (max_choice_len - len(token2_masks))
        token3_masks += [0] * (max_choice_len - len(token3_masks))
        token4_masks += [0] * (max_choice_len - len(token4_masks))

        token1_segments += [0] * (max_choice_len - len(token1_segments))
        token2_segments += [0] * (max_choice_len - len(token2_segments))
        token3_segments += [0] * (max_choice_len - len(token3_segments))
        token4_segments += [0] * (max_choice_len - len(token4_segments))

        bert_token1_ids.append(torch.LongTensor(token1_ids))
        bert_token2_ids.append(torch.LongTensor(token2_ids))
        bert_token3_ids.append(torch.LongTensor(token3_ids))
        bert_token4_ids.append(torch.LongTensor(token4_ids))

        bert_token1_masks.append(torch.LongTensor(token1_masks))
        bert_token2_masks.append(torch.LongTensor(token2_masks))
        bert_token3_masks.append(torch.LongTensor(token3_masks))
        bert_token4_masks.append(torch.LongTensor(token4_masks))

        bert_token1_segments.append(torch.LongTensor(token1_segments))
        bert_token2_segments.append(torch.LongTensor(token2_segments))
        bert_token3_segments.append(torch.LongTensor(token3_segments))
        bert_token4_segments.append(torch.LongTensor(token4_segments))

    labels = torch.from_numpy(np.array(labels))

    logger.info(f"Max token len {max_len}")

    return EmmentalDataset(
        name="SuperGLUE",
        uid="uids",
        X_dict={
            "uids": uids,
            "sentence1": sent1s,
            "sentence2": sent2s,
            "choice1": choice1s,
            "choice2": choice2s,
            "choice3": choice3s,
            "choice4": choice4s,
            "token1_ids": bert_token1_ids,
            "token2_ids": bert_token2_ids,
            "token3_ids": bert_token3_ids,
            "token4_ids": bert_token4_ids,
            "token1_masks": bert_token1_masks,
            "token2_masks": bert_token2_masks,
            "token3_masks": bert_token3_masks,
            "token4_masks": bert_token4_masks,
            "token1_segments": bert_token1_segments,
            "token2_segments": bert_token2_segments,
            "token3_segments": bert_token3_segments,
            "token4_segments": bert_token4_segments,
        },
        Y_dict={"labels": labels},
    )
Example #19
def parse(jsonl_path, tokenizer, uid, max_data_samples, max_sequence_length):
    logger.info(f"Loading data from {jsonl_path}.")
    rows = [json.loads(row) for row in open(jsonl_path, encoding="utf-8")]
    for i in range(2):
        logger.info(f"Sample {i}: {rows[i]}")

    # Truncate to max_data_samples
    if max_data_samples:
        rows = rows[:max_data_samples]
        logger.info(f"Truncating to {max_data_samples} samples.")

    # unique ids
    uids = []
    # paragraph ids
    pids = []
    # question ids
    qids = []
    # answer ids
    aids = []

    # paragraph text
    paras = []
    # question text
    questions = []
    # answer text
    answers = []
    # labels
    labels = []

    bert_tokens = []
    bert_token_ids = []
    bert_token_masks = []
    bert_token_segments = []

    # Check the maximum token length
    max_len = -1

    for row in rows:
        # each example has a paragraph field -> (text, questions)
        # text is the paragraph, which requires some preprocessing
        # questions is a list of questions,
        # has fields (question, sentences_used, answers)
        pid = row["idx"]
        para = row["paragraph"]["text"]
        para_sent_list = re.sub(
            "<b>Sent .{1,2}: </b>", "", row["paragraph"]["text"]
        ).split("<br>")

        for ques in row["paragraph"]["questions"]:
            qid = ques["idx"]
            sent_used = ques["sentences_used"]

            if len(sent_used) > 0:
                ques_para = " ".join([para_sent_list[i] for i in sent_used])
            else:
                ques_para = " ".join(para_sent_list)

            para_token = tokenizer.tokenize(ques_para)[: max_sequence_length - 2]

            question = ques["question"]
            question_token = tokenizer.tokenize(question)[: max_sequence_length - 2]

            for ans in ques["answers"]:
                aid = ans["idx"]
                answer = ans["text"]
                answer_token = tokenizer.tokenize(answer)[: max_sequence_length - 2]

                # Generate tokens
                tokens = (
                    para_token
                    + ["[SEP]"]
                    + question_token
                    + answer_token
                    + ["[SEP]"]
                    + ["[CLS]"]
                )
                # No token segments
                token_segments = [0] * (len(para_token) + 2) + [0] * (
                    len(question_token) + len(answer_token) + 1
                )
                token_ids = tokenizer.convert_tokens_to_ids(tokens)
                token_masks = [1] * len(token_ids)

                if len(tokens) > max_len:
                    max_len = len(tokens)

                # Add to list
                paras.append(para)
                questions.append(question)
                answers.append(answer)

                label = ans["isAnswer"] if "isAnswer" in ans else False
                labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

                pids.append(pid)
                qids.append(qid)
                aids.append(aid)

                uids.append(f"{pid}%%{qid}%%{aid}")

                bert_tokens.append(" ".join(tokens))
                bert_token_ids.append(torch.LongTensor(token_ids))
                bert_token_masks.append(torch.LongTensor(token_masks))
                bert_token_segments.append(torch.LongTensor(token_segments))

    labels = torch.from_numpy(np.array(labels))

    logger.info(f"Max token len {max_len}")

    return EmmentalDataset(
        name="SuperGLUE",
        uid=uid,
        X_dict={
            "uids": uids,
            "pids": pids,
            "qids": qids,
            "aids": aids,
            "paras": paras,
            "questions": questions,
            "answers": answers,
            "tokens": bert_tokens,
            "token_ids": bert_token_ids,
            "token_masks": bert_token_masks,
            "token_segments": bert_token_segments,
        },
        Y_dict={"labels": labels},
    )
Example #20
def parse(jsonl_path, tokenizer, uid, max_data_samples, max_sequence_length):
    logger.info(f"Loading data from {jsonl_path}.")
    rows = [json.loads(row) for row in open(jsonl_path, encoding="utf-8")]
    for i in range(2):
        logger.info(f"Sample {i}: {rows[i]}")

    # Truncate to max_data_samples
    if max_data_samples:
        rows = rows[:max_data_samples]
        logger.info(f"Truncating to {max_data_samples} samples.")

    # unique ids
    uids = []

    # sentence text
    sentences = []
    # span1
    span1s = []
    # span2
    span2s = []
    # span1 idx
    span1_idxs = []
    # span2 idx
    span2_idxs = []
    # label
    labels = []

    token1_idxs = []
    token2_idxs = []

    bert_tokens = []
    bert_token_ids = []
    bert_token_masks = []
    bert_token_segments = []

    # Check the maximum token length
    max_len = -1

    for row in rows:
        index = row["idx"]

        text = row["text"]
        span1_text = row["target"]["span1_text"]
        span2_text = row["target"]["span2_text"]
        span1_index = row["target"]["span1_index"]
        span2_index = row["target"]["span2_index"]

        label = row["label"] if "label" in row else True

        span1_char_index = get_char_index(text, span1_text, span1_index)
        span2_char_index = get_char_index(text, span2_text, span2_index)

        assert span1_char_index is not None, f"Check example {index} in {jsonl_path}"
        assert span2_char_index is not None, f"Check example {index} in {jsonl_path}"

        # Tokenize sentences
        bert_tokens_sub1 = tokenizer.tokenize(
            text[: min(span1_char_index[0], span2_char_index[0])]
        )

        if span1_char_index[0] < span2_char_index[0]:
            bert_tokens_sub2 = tokenizer.tokenize(
                text[span1_char_index[0] : span1_char_index[1]]
            )
            token1_idx = [
                len(bert_tokens_sub1) + 1,
                len(bert_tokens_sub1) + len(bert_tokens_sub2),
            ]
        else:
            bert_tokens_sub2 = tokenizer.tokenize(
                text[span2_char_index[0] : span2_char_index[1]]
            )
            token2_idx = [
                len(bert_tokens_sub1) + 1,
                len(bert_tokens_sub1) + len(bert_tokens_sub2),
            ]

        sub3_st = (
            span1_char_index[1]
            if span1_char_index[0] < span2_char_index[0]
            else span2_char_index[1]
        )
        sub3_ed = (
            span1_char_index[0]
            if span1_char_index[0] > span2_char_index[0]
            else span2_char_index[0]
        )

        bert_tokens_sub3 = tokenizer.tokenize(text[sub3_st:sub3_ed])
        if span1_char_index[0] < span2_char_index[0]:
            bert_tokens_sub4 = tokenizer.tokenize(
                text[span2_char_index[0] : span2_char_index[1]]
            )
            cur_len = (
                len(bert_tokens_sub1) + len(bert_tokens_sub2) + len(bert_tokens_sub3)
            )
            token2_idx = [cur_len + 1, cur_len + len(bert_tokens_sub4)]
        else:
            bert_tokens_sub4 = tokenizer.tokenize(
                text[span1_char_index[0] : span1_char_index[1]]
            )
            cur_len = (
                len(bert_tokens_sub1) + len(bert_tokens_sub2) + len(bert_tokens_sub3)
            )
            token1_idx = [cur_len + 1, cur_len + len(bert_tokens_sub4)]

        if span1_char_index[0] < span2_char_index[0]:
            bert_tokens_sub5 = tokenizer.tokenize(text[span2_char_index[1] :])
        else:
            bert_tokens_sub5 = tokenizer.tokenize(text[span1_char_index[1] :])

        tokens = (
            ["[CLS]"]
            + bert_tokens_sub1
            + bert_tokens_sub2
            + bert_tokens_sub3
            + bert_tokens_sub4
            + bert_tokens_sub5
            + ["[SEP]"]
        )

        if len(tokens) > max_len:
            max_len = len(tokens)

        token_ids = tokenizer.convert_tokens_to_ids(tokens)

        token_segments = [0] * len(token_ids)
        # Generate mask where 1 for real tokens and 0 for padding tokens
        token_masks = [1] * len(token_ids)

        token1_idxs.append(token1_idx)
        token2_idxs.append(token2_idx)

        uids.append(index)
        sentences.append(text)
        span1s.append(span1_text)
        span2s.append(span2_text)
        span1_idxs.append(span1_index)
        span2_idxs.append(span2_index)
        labels.append(SuperGLUE_LABEL_MAPPING[TASK_NAME][label])

        bert_tokens.append(tokens)
        bert_token_ids.append(torch.LongTensor(token_ids))
        bert_token_masks.append(torch.LongTensor(token_masks))
        bert_token_segments.append(torch.LongTensor(token_segments))

    token1_idxs = torch.from_numpy(np.array(token1_idxs))
    token2_idxs = torch.from_numpy(np.array(token2_idxs))

    labels = torch.from_numpy(np.array(labels))

    logger.info(f"Max token len {max_len}")

    return EmmentalDataset(
        name="SuperGLUE",
        uid="uids",
        X_dict={
            "uids": uids,
            "sentence": sentences,
            "span1": span1s,
            "span2": span2s,
            "span1_idx": span1_idxs,
            "span2_idx": span2_idxs,
            "token1_idx": token1_idxs,
            "token2_idx": token2_idxs,
            "tokens": bert_tokens,
            "token_ids": bert_token_ids,
            "token_masks": bert_token_masks,
            "token_segments": bert_token_segments,
        },
        Y_dict={"labels": labels},
    )