Пример #1
0
def test_dataset_feature_with_none(feature):
    """Check that None values survive item, batch and column access, flat and nested."""
    features = Features({"col": feature})
    dset = Dataset.from_dict({"col": [None]}, features=features)

    # single-row access
    row = dset[0]
    assert row.keys() == {"col"}
    assert row["col"] is None

    # batch (slice) access
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"col"}
    assert isinstance(batch["col"], list) and all(value is None
                                                  for value in batch["col"])

    # whole-column access
    column = dset["col"]
    assert len(column) == 1
    assert isinstance(column, list) and all(value is None for value in column)

    # None nested inside a Sequence feature
    dset = Dataset.from_dict({"col": [[None]]},
                             features=Features({"col": Sequence(feature)}))
    row = dset[0]
    assert row.keys() == {"col"}
    assert all(value is None for value in row["col"])

    # None nested inside a dict feature
    dset = Dataset.from_dict({"nested": [{"col": None}]},
                             features=Features({"nested": {"col": feature}}))
    row = dset[0]
    assert row.keys() == {"nested"}
    assert row["nested"].keys() == {"col"}
    assert row["nested"]["col"] is None
Пример #2
0
    def test_read(self):
        """Reading split instructions (a single string or a list) yields correctly sliced datasets."""
        name = "my_name"
        train_info = SplitInfo(name="train", num_examples=100)
        test_info = SplitInfo(name="test", num_examples=100)
        split_infos = [train_info, test_info]
        split_dict = SplitDict()
        split_dict.add(train_info)
        split_dict.add(test_info)
        info = DatasetInfo(splits=split_dict)

        with tempfile.TemporaryDirectory() as tmp_dir:
            reader = ReaderTest(tmp_dir, info)

            # a percentage slice of a single split
            instructions = "test[:33%]"
            dset = Dataset(**reader.read(name, instructions, split_infos))
            self.assertEqual(dset["filename"][0], f"{name}-test")
            self.assertEqual(dset.num_rows, 33)  # 33% of 100 examples
            self.assertEqual(dset.num_columns, 1)

            # several instructions read in one pass: a full split plus a slice
            instructions = ["train", "test[:33%]"]
            datasets_kwargs = [
                reader.read(name, instr, split_infos) for instr in instructions
            ]
            train_dset, test_dset = [
                Dataset(**dataset_kwargs) for dataset_kwargs in datasets_kwargs
            ]
            self.assertEqual(train_dset["filename"][0], f"{name}-train")
            self.assertEqual(train_dset.num_rows, 100)
            self.assertEqual(train_dset.num_columns, 1)
            self.assertEqual(test_dset["filename"][0], f"{name}-test")
            self.assertEqual(test_dset.num_rows, 33)
            self.assertEqual(test_dset.num_columns, 1)
            del train_dset, test_dset
Пример #3
0
 def test_from_arrow_schema_with_sequence(self):
     """Features inferred from a built dataset must rebuild an identical dataset (round-trip)."""
     data = {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10}
     original_features = Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")})
     dset = Dataset.from_dict(data, features=original_features)
     # features as re-derived from the arrow schema of the first dataset
     roundtrip_features = dset.features
     roundtrip_dset = Dataset.from_dict(data, features=roundtrip_features)
     self.assertEqual(original_features.type, roundtrip_features.type)
     self.assertDictEqual(dset[0], roundtrip_dset[0])
     self.assertDictEqual(dset[:], roundtrip_dset[:])
Пример #4
0
 def _create_dummy_dataset(self, multiple_columns=False):
     """Build a small in-memory dataset for tests.

     With multiple_columns=True: 4 rows with columns "col_1" (ints) and "col_2" (strings).
     Otherwise: 30 rows with a single zero-padded "filename" column.
     """
     if multiple_columns:
         return Dataset.from_dict({"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]})
     filenames = ["my_name-train" + "_" + "{:03d}".format(x) for x in np.arange(30).tolist()]
     return Dataset.from_dict({"filename": filenames})
Пример #5
0
 def test_align_labels_with_mapping(self):
     """align_labels_with_mapping() remaps label ids in every split to match label2id."""
     # train and test deliberately declare the same names in different orders
     train_features = Features({
         "input_text":
         Value("string"),
         "input_labels":
         ClassLabel(num_classes=3,
                    names=["entailment", "neutral", "contradiction"]),
     })
     test_features = Features({
         "input_text":
         Value("string"),
         "input_labels":
         ClassLabel(num_classes=3,
                    names=["entailment", "contradiction", "neutral"]),
     })
     train_data = {
         "input_text": ["a", "a", "b", "b", "c", "c"],
         "input_labels": [0, 0, 1, 1, 2, 2]
     }
     test_data = {
         "input_text": ["a", "a", "c", "c", "b", "b"],
         "input_labels": [0, 0, 1, 1, 2, 2]
     }
     # target mapping disagrees with both feature orderings (and is upper-cased)
     label2id = {"CONTRADICTION": 0, "ENTAILMENT": 2, "NEUTRAL": 1}
     id2label = {v: k for k, v in label2id.items()}
     train_expected_labels = [2, 2, 1, 1, 0, 0]
     test_expected_labels = [2, 2, 0, 0, 1, 1]
     train_expected_label_names = [
         id2label[idx] for idx in train_expected_labels
     ]
     test_expected_label_names = [
         id2label[idx] for idx in test_expected_labels
     ]
     dsets = DatasetDict({
         "train":
         Dataset.from_dict(train_data, features=train_features),
         "test":
         Dataset.from_dict(test_data, features=test_features),
     })
     dsets = dsets.align_labels_with_mapping(label2id, "input_labels")
     # after alignment, both splits use the ids dictated by label2id
     self.assertListEqual(train_expected_labels,
                          dsets["train"]["input_labels"])
     self.assertListEqual(test_expected_labels,
                          dsets["test"]["input_labels"])
     train_aligned_label_names = [
         dsets["train"].features["input_labels"].int2str(idx)
         for idx in dsets["train"]["input_labels"]
     ]
     test_aligned_label_names = [
         dsets["test"].features["input_labels"].int2str(idx)
         for idx in dsets["test"]["input_labels"]
     ]
     # names recovered via int2str must round-trip through the shared id2label map
     self.assertListEqual(train_expected_label_names,
                          train_aligned_label_names)
     self.assertListEqual(test_expected_label_names,
                          test_aligned_label_names)
Пример #6
0
 def test_flatten(self):
     """flatten() must expand nested features into dotted column names in every split."""
     features = Features(
         {"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")}
     )
     data = {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10}
     dset_split = Dataset.from_dict(data, features=features)
     dset = DatasetDict({"train": dset_split, "test": dset_split})
     dset = dset.flatten()
     expected_columns = ["a.b.c", "foo"]
     self.assertDictEqual(
         dset.column_names,
         {"train": expected_columns, "test": expected_columns},
     )
     self.assertListEqual(sorted(dset["train"].features.keys()), expected_columns)
     # nested Sequence({"c": ...}) flattens to a plain Sequence of strings
     self.assertDictEqual(
         dset["train"].features,
         Features({"a.b.c": Sequence(Value("string")), "foo": Value("int64")}),
     )
     del dset
Пример #7
0
def dataset():
    """Fixture: an n-row dataset with tokens, labels, answers and id columns.

    Fix: the "answers" column previously replicated with a hard-coded ``* 10``
    while every other column used ``n`` — changing ``n`` would have silently
    produced mismatched column lengths. All columns now use ``n``.
    """
    n = 10
    features = Features({
        "tokens":
        Sequence(Value("string")),
        "labels":
        Sequence(ClassLabel(names=["negative", "positive"])),
        "answers":
        Sequence({
            "text": Value("string"),
            "answer_start": Value("int32"),
        }),
        "id":
        Value("int64"),
    })
    dataset = Dataset.from_dict(
        {
            "tokens": [["foo"] * 5] * n,
            "labels": [[1] * 5] * n,
            # use n, not a literal 10, so row counts stay consistent
            "answers": [{
                "answer_start": [97],
                "text": ["1976"]
            }] * n,
            "id": list(range(n)),
        },
        features=features,
    )
    return dataset
Пример #8
0
    def test_map_on_task_template(self):
        """map() keeps task templates when the schema is preserved and drops them otherwise."""
        info = DatasetInfo(task_templates=QuestionAnsweringExtractive())
        dataset = Dataset.from_dict({k: [v] for k, v in SAMPLE_QUESTION_ANSWERING_EXTRACTIVE.items()}, info=info)
        assert isinstance(dataset.info.task_templates, list)
        assert len(dataset.info.task_templates) == 1

        def identity(x):
            return x

        def break_schema(x):
            # adding an unexpected key makes the example incompatible with the template
            out = deepcopy(SAMPLE_QUESTION_ANSWERING_EXTRACTIVE)
            out["answers"]["foobar"] = 0
            return out

        # schema-preserving map keeps the templates; the second pass reloads from cache
        for _ in range(2):
            mapped_dataset = dataset.map(identity)
            assert mapped_dataset.info.task_templates == dataset.info.task_templates

        # schema-breaking map drops the templates; the second pass reloads from cache
        for _ in range(2):
            mapped_dataset = dataset.map(break_schema)
            assert mapped_dataset.info.task_templates == []
Пример #9
0
    def test_read_files(self):
        """read_files() concatenates per-file chunks, honoring skip/take windows."""
        split_dict = SplitDict()
        split_dict.add(SplitInfo(name="train", num_examples=100))
        split_dict.add(SplitInfo(name="test", num_examples=100))
        info = DatasetInfo(splits=split_dict)

        with tempfile.TemporaryDirectory() as tmp_dir:
            reader = ReaderTest(tmp_dir, info)
            files = [
                {"filename": os.path.join(tmp_dir, "train")},
                # only rows 10..19 of the test file
                {"filename": os.path.join(tmp_dir, "test"), "skip": 10, "take": 10},
            ]
            dset = Dataset(
                **reader.read_files(files, original_instructions=""))
            # 100 train rows + the 10-row test window
            self.assertEqual(dset.num_rows, 110)
            self.assertEqual(dset.num_columns, 1)
            self.assertEqual(dset._data_files, files)
            del dset
Пример #10
0
def convert_glue_samples_to_hf_dataset(samples, glue_labels, is_test=False):
    """Convert GLUE-style (guid, text_a, text_b, label) tuples into an ArrowDataset.

    A "sentence2" column is emitted only when at least one sample carries a
    second text. Labels are validated against glue_labels and attached unless
    is_test is True.
    """
    idx_list, text_a_list, text_b_list, labels_list = [], [], [], []

    for guid, text_a, text_b, label in tqdm(samples):
        idx_list.append(guid)
        text_a_list.append(text_a)
        text_b_list.append(text_b)
        if not is_test:
            labels_list.append(label)

    data_dict = {'idx': idx_list, 'sentence1': text_a_list}
    # single-sentence tasks leave text_b empty/None for every sample
    if any(text_b_list):
        data_dict['sentence2'] = text_b_list
    if not is_test:
        check_labels(glue_labels, labels_list)
        data_dict['label'] = labels_list

    return ArrowDataset.from_dict(data_dict)
    def test_read(self):
        """read() accepts both string specs and typed Split/ReadInstruction objects."""
        name = "my_name"
        train_info = SplitInfo(name="train", num_examples=100)
        test_info = SplitInfo(name="test", num_examples=100)
        split_infos = [train_info, test_info]
        split_dict = SplitDict()
        split_dict.add(train_info)
        split_dict.add(test_info)
        info = DatasetInfo(splits=split_dict)

        with tempfile.TemporaryDirectory() as tmp_dir:
            reader = ReaderTest(tmp_dir, info)

            # a percentage slice expressed as a plain string
            instructions = "test[:33%]"
            dset = Dataset(**reader.read(name, instructions, split_infos))
            self.assertEqual(dset["filename"][0], f"{name}-test")
            self.assertEqual(dset.num_rows, 33)  # 33% of 100 examples
            self.assertEqual(dset.num_columns, 1)

            # the same instructions as strings and as typed objects must behave alike
            instructions1 = ["train", "test[:33%]"]
            instructions2 = [
                Split.TRAIN,
                ReadInstruction.from_spec("test[:33%]")
            ]
            for instructions in [instructions1, instructions2]:
                datasets_kwargs = [
                    reader.read(name, instr, split_infos)
                    for instr in instructions
                ]
                train_dset, test_dset = (Dataset(**dataset_kwargs)
                                         for dataset_kwargs in datasets_kwargs)
                self.assertEqual(train_dset["filename"][0], f"{name}-train")
                self.assertEqual(train_dset.num_rows, 100)
                self.assertEqual(train_dset.num_columns, 1)
                # the resulting split metadata keeps the original instruction string
                self.assertIsInstance(train_dset.split, NamedSplit)
                self.assertEqual(str(train_dset.split), "train")
                self.assertEqual(test_dset["filename"][0], f"{name}-test")
                self.assertEqual(test_dset.num_rows, 33)
                self.assertEqual(test_dset.num_columns, 1)
                self.assertIsInstance(test_dset.split, NamedSplit)
                self.assertEqual(str(test_dset.split), "test[:33%]")
                del train_dset, test_dset
Пример #12
0
    def test_remove_and_map_on_task_template(self):
        """Removing a column a task template requires drops the template after map()."""
        features = Features({"text": Value("string"), "label": ClassLabel(names=("pos", "neg"))})
        info = DatasetInfo(
            features=features,
            task_templates=TextClassification(text_column="text", label_column="label"),
        )
        dataset = Dataset.from_dict({"text": ["A sentence."], "label": ["pos"]}, info=info)

        def identity(example):
            return example

        without_label = dataset.remove_columns("label")
        mapped_dataset = without_label.map(identity)
        assert mapped_dataset.info.task_templates == []
Пример #13
0
def arrow_path(tmp_path_factory):
    """Fixture: materialize DATA_DICT_OF_LISTS to an on-disk arrow file and return its path."""
    cache_path = str(tmp_path_factory.mktemp("data") / "dataset.arrow")
    # map() with a cache_file_name writes the dataset's arrow table to disk
    Dataset.from_dict(DATA_DICT_OF_LISTS).map(cache_file_name=cache_path)
    return cache_path
Пример #14
0
 def _create_dummy_dataset(self):
     """Build a 30-row dataset with a single "filename" column for tests."""
     filenames = ["my_name-train" + "_" + str(x) for x in np.arange(30).tolist()]
     return Dataset.from_dict({"filename": filenames})
Пример #15
0
def dataset():
    """Fixture: 10 rows of 5-token sequences, all labeled positive."""
    features = Features(
        {"tokens": Sequence(Value("string")), "labels": Sequence(ClassLabel(names=["negative", "positive"]))}
    )
    n_rows = 10
    data = {"tokens": [["foo"] * 5] * n_rows, "labels": [[1] * 5] * n_rows}
    return Dataset.from_dict(data, features=features)