Example #1
def test_dataset_with_image_feature_with_none():
    data = {"image": [None]}
    features = Features({"image": Image()})
    dset = Dataset.from_dict(data, features=features)
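    # a None value passes through the Image feature and stays None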
    item = dset[0]
    assert item.keys() == {"image"}
    assert item["image"] is None
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"image"}
    assert isinstance(batch["image"], list) and all(item is None for item in batch["image"])
    column = dset["image"]
    assert len(column) == 1
    assert isinstance(column, list) and all(item is None for item in column)

    # nested tests

    data = {"images": [[None]]}
    features = Features({"images": Sequence(Image())})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"images"}
    assert all(i is None for i in item["images"])

    data = {"nested": [{"image": None}]}
    features = Features({"nested": {"image": Image()}})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"nested"}
    assert item["nested"].keys() == {"image"}
    assert item["nested"]["image"] is None
Example #2
def test_text_datasetdict_reader(split, features, keep_in_memory, text_path,
                                 tmp_path):
    if split:
        path = {split: text_path}
    else:
        split = "train"
        path = {"train": text_path, "test": text_path}
    cache_dir = tmp_path / "cache"

    default_expected_features = {"text": "string"}
    expected_features = features.copy(
    ) if features else default_expected_features
    features = Features(
        {feature: Value(dtype)
         for feature, dtype in features.items()}) if features else None
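    # with keep_in_memory=True the Arrow memory usage should grow, otherwise it should stay flat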
    with assert_arrow_memory_increases(
    ) if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = TextDatasetReader(path,
                                    features=features,
                                    cache_dir=cache_dir,
                                    keep_in_memory=keep_in_memory).read()
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 1
    assert dataset.column_names == ["text"]
    assert dataset.split == split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
Example #3
def test_datasetdict_from_json(
    split,
    features,
    keep_in_memory,
    jsonl_path,
    tmp_path,
):
    file_path = jsonl_path
    field = None
    if split:
        path = {split: file_path}
    else:
        split = "train"
        path = {"train": file_path, "test": file_path}
    cache_dir = tmp_path / "cache"
    default_expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = DatasetDict.from_json(
            path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, field=field
        )
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
Example #4
def test_image_feature_type_to_arrow():
    features = Features({"image": Image()})
    assert features.arrow_schema == pa.schema({"image": Image().pa_type})
    features = Features({"struct_containing_an_image": {"image": Image()}})
    assert features.arrow_schema == pa.schema({"struct_containing_an_image": pa.struct({"image": Image().pa_type})})
    features = Features({"sequence_of_images": Sequence(Image())})
    assert features.arrow_schema == pa.schema({"sequence_of_images": pa.list_(Image().pa_type)})
Example #5
def test_csv_dataset_reader(path_type, split, features, keep_in_memory,
                            csv_path, tmp_path):
    if issubclass(path_type, str):
        path = csv_path
    elif issubclass(path_type, list):
        path = [csv_path]
    cache_dir = tmp_path / "cache"

    expected_split = str(split) if split else "train"

    # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
    default_expected_features = {
        "col_1": "int64",
        "col_2": "int64",
        "col_3": "float64"
    }
    expected_features = features.copy(
    ) if features else default_expected_features
    features = Features(
        {feature: Value(dtype)
         for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases(
    ) if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = CsvDatasetReader(path,
                                   split=split,
                                   features=features,
                                   cache_dir=cache_dir,
                                   keep_in_memory=keep_in_memory).read()
    assert isinstance(dataset, Dataset)
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == expected_split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
Example #6
    def test_flatten(self):
        dset_split = Dataset.from_dict(
            {
                "a": [{
                    "b": {
                        "c": ["text"]
                    }
                }] * 10,
                "foo": [1] * 10
            },
            features=Features({
                "a": {
                    "b": Sequence({"c": Value("string")})
                },
                "foo": Value("int64")
            }),
        )
        dset = DatasetDict({"train": dset_split, "test": dset_split})
        dset = dset.flatten()
        self.assertDictEqual(dset.column_names, {
            "train": ["a.b.c", "foo"],
            "test": ["a.b.c", "foo"]
        })
        self.assertListEqual(list(dset["train"].features.keys()),
                             ["a.b.c", "foo"])
        self.assertDictEqual(
            dset["train"].features,
            Features({
                "a.b.c": Sequence(Value("string")),
                "foo": Value("int64")
            }))
        del dset
Example #7
def test_datasetdict_from_csv(split, features, keep_in_memory, csv_path,
                              tmp_path):
    if split:
        path = {split: csv_path}
    else:
        split = "train"
        path = {"train": csv_path, "test": csv_path}
    cache_dir = tmp_path / "cache"
    # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
    default_expected_features = {
        "col_1": "int64",
        "col_2": "int64",
        "col_3": "float64"
    }
    expected_features = features.copy(
    ) if features else default_expected_features
    features = Features(
        {feature: Value(dtype)
         for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases(
    ) if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = DatasetDict.from_csv(path,
                                       features=features,
                                       cache_dir=cache_dir,
                                       keep_in_memory=keep_in_memory)
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
Example #8
def test_text_dataset_reader(path_type, split, features, keep_in_memory,
                             text_path, tmp_path):
    if issubclass(path_type, str):
        path = text_path
    elif issubclass(path_type, list):
        path = [text_path]
    cache_dir = tmp_path / "cache"

    expected_split = str(split) if split else "train"

    default_expected_features = {"text": "string"}
    expected_features = features.copy(
    ) if features else default_expected_features
    features = Features(
        {feature: Value(dtype)
         for feature, dtype in features.items()}) if features else None
    with assert_arrow_memory_increases(
    ) if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = TextDatasetReader(path,
                                    split=split,
                                    features=features,
                                    cache_dir=cache_dir,
                                    keep_in_memory=keep_in_memory).read()
    assert isinstance(dataset, Dataset)
    assert dataset.num_rows == 4
    assert dataset.num_columns == 1
    assert dataset.column_names == ["text"]
    assert dataset.split == expected_split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
Example #9
    def test_shuffle(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            dsets = self._create_dummy_dataset_dict()

            indices_cache_file_names = {
                "train": os.path.join(tmp_dir, "train.arrow"),
                "test": os.path.join(tmp_dir, "test.arrow"),
            }
            seeds = {
                "train": 1234,
                "test": 1234,
            }
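            # shuffling both splits with the same seed should give the same order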
            dsets_shuffled = dsets.shuffle(
                seeds=seeds, indices_cache_file_names=indices_cache_file_names, load_from_cache_file=False
            )
            self.assertListEqual(dsets_shuffled["train"]["filename"], dsets_shuffled["test"]["filename"])

            self.assertEqual(len(dsets_shuffled["train"]), 30)
            self.assertEqual(dsets_shuffled["train"][0]["filename"], "my_name-train_028")
            self.assertEqual(dsets_shuffled["train"][2]["filename"], "my_name-train_010")
            self.assertDictEqual(dsets["train"].features, Features({"filename": Value("string")}))
            self.assertDictEqual(dsets_shuffled["train"].features, Features({"filename": Value("string")}))

            # Reproducibility
            indices_cache_file_names_2 = {
                "train": os.path.join(tmp_dir, "train_2.arrow"),
                "test": os.path.join(tmp_dir, "test_2.arrow"),
            }
            dsets_shuffled_2 = dsets.shuffle(
                seeds=seeds, indices_cache_file_names=indices_cache_file_names_2, load_from_cache_file=False
            )
            self.assertListEqual(dsets_shuffled["train"]["filename"], dsets_shuffled_2["train"]["filename"])

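            # different seeds per split should give different orders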
            seeds = {
                "train": 1234,
                "test": 1,
            }
            indices_cache_file_names_3 = {
                "train": os.path.join(tmp_dir, "train_3.arrow"),
                "test": os.path.join(tmp_dir, "test_3.arrow"),
            }
            dsets_shuffled_3 = dsets.shuffle(
                seeds=seeds, indices_cache_file_names=indices_cache_file_names_3, load_from_cache_file=False
            )
            self.assertNotEqual(dsets_shuffled_3["train"]["filename"], dsets_shuffled_3["test"]["filename"])

            # other input types
            dsets_shuffled_int = dsets.shuffle(42)
            dsets_shuffled_alias = dsets.shuffle(seed=42)
            dsets_shuffled_none = dsets.shuffle()
            self.assertEqual(len(dsets_shuffled_int["train"]), 30)
            self.assertEqual(len(dsets_shuffled_alias["train"]), 30)
            self.assertEqual(len(dsets_shuffled_none["train"]), 30)

            del dsets, dsets_shuffled, dsets_shuffled_2, dsets_shuffled_3
            del dsets_shuffled_int, dsets_shuffled_alias, dsets_shuffled_none
Example #10
def test_dataset_concatenate_image_features(shared_datadir):
    # use different data structures for dset1 and dset2 to make sure they are compatible with each other
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data1 = {"image": [image_path]}
    dset1 = Dataset.from_dict(data1, features=Features({"image": Image()}))
    data2 = {"image": [{"bytes": open(image_path, "rb").read()}]}
    dset2 = Dataset.from_dict(data2, features=Features({"image": Image()}))
    concatenated_dataset = concatenate_datasets([dset1, dset2])
    assert len(concatenated_dataset) == len(dset1) + len(dset2)
    assert concatenated_dataset[0]["image"] == dset1[0]["image"]
    assert concatenated_dataset[1]["image"] == dset2[0]["image"]
Example #11
def run_sparse_retrieval(datasets, training_args):
    #### retrieval process ####

    retriever = SparseRetrieval(tokenize_fn=tokenize,
                                data_path="./data",
                                context_path="wikipedia_documents.json"
                                # context_path="all_wikipedia_documents.json"
                                )
    # sparse embedding retrieval
    # retriever.get_sparse_embedding()
    #df = retriever.retrieve(datasets['validation'])

    # bm25 retrieval
    # retriever.get_embedding_BM25()
    # df = retriever.retrieve_BM25(query_or_dataset=datasets['validation'], topk=10)

    # elastic search retrieval
    # retriever.get_elastic_search()
    df = retriever.retrieve_ES(query_or_dataset=datasets['validation'],
                               topk=10)

    # faiss retrieval
    # df = retriever.retrieve_faiss(dataset['validation'])

    if training_args.do_predict:  # test data has no answers, so the dataset contains only id, question, and context.
        f = Features({
            'context': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })

    elif training_args.do_eval:  # train data has answers, so the dataset contains id, question, context, and answers.
        f = Features({
            'answers':
            Sequence(feature={
                'text': Value(dtype='string', id=None),
                'answer_start': Value(dtype='int32', id=None)
            },
                     length=-1,
                     id=None),
            'context':
            Value(dtype='string', id=None),
            'id':
            Value(dtype='string', id=None),
            'question':
            Value(dtype='string', id=None)
        })

    datasets = DatasetDict({'validation': Dataset.from_pandas(df, features=f)})
    return datasets
Example #12
def run_sparse_retrieval(datasets, training_args, inf_args):
    #### retrieval process ####
    if inf_args.retrieval is None:
        retriever = SparseRetrieval_BM25PLUS(
            tokenize_fn=tokenize,
            data_path="./data",
            context_path="wikipedia_documents.json")
    elif inf_args.retrieval.lower() == "sparse":
        retriever = SparseRetrieval(tokenize_fn=tokenize,
                                    data_path="./data",
                                    context_path="wikipedia_documents.json")
    # elif inf_args.retrieval.lower() == "bm25" or inf_args.retrieval.lower() == "bm25":
    #     retriever = SparseRetrieval_BM25(tokenize_fn=tokenize,
    #                                          data_path="./data",
    #                                          context_path="wikipedia_documents.json")

    retriever.get_sparse_embedding()
    df = retriever.retrieve(datasets['validation'], inf_args.k)

    # faiss retrieval

    # test data has no answers, so the dataset contains only id, question, and context.
    if training_args.do_predict:
        f = Features({
            'contexts': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })

    # train data has answers, so the dataset contains id, question, context, and answers.
    elif training_args.do_eval:
        f = Features({
            'answers':
            Sequence(feature={
                'text': Value(dtype='string', id=None),
                'answer_start': Value(dtype='int32', id=None)
            },
                     length=-1,
                     id=None),
            'context':
            Value(dtype='string', id=None),
            'id':
            Value(dtype='string', id=None),
            'question':
            Value(dtype='string', id=None)
        })

    datasets = DatasetDict({'validation': Dataset.from_pandas(df, features=f)})
    return datasets
Example #13
def load_datasets(lang="es", random_state=2021, preprocessing_args={}):
    """
    Load emotion recognition datasets
    """

    train_df = load_df(paths[lang]["train"])
    test_df = load_df(paths[lang]["test"])
    train_df, dev_df = train_test_split(train_df,
                                        stratify=train_df["label"],
                                        random_state=random_state)

    for df in [train_df, dev_df, test_df]:
        for label, idx in label2id.items():
            df.loc[df["label"] == label, "label"] = idx
        df["label"] = df["label"].astype(int)

    preprocess = lambda x: preprocess_tweet(x, lang=lang, **preprocessing_args)

    train_df.loc[:, "text"] = train_df["text"].apply(preprocess)
    dev_df.loc[:, "text"] = dev_df["text"].apply(preprocess)
    test_df.loc[:, "text"] = test_df["text"].apply(preprocess)

    features = Features({
        'text':
        Value('string'),
        'label':
        ClassLabel(num_classes=len(id2label),
                   names=[id2label[k] for k in sorted(id2label.keys())])
    })

    train_dataset = Dataset.from_pandas(train_df, features=features)
    dev_dataset = Dataset.from_pandas(dev_df, features=features)
    test_dataset = Dataset.from_pandas(test_df, features=features)

    return train_dataset, dev_dataset, test_dataset
Example #14
def test_dataset_with_image_feature_from_np_array():
    import PIL.Image

    image_array = np.arange(640 * 480, dtype=np.uint8).reshape(480, 640)
    data = {"image": [image_array]}
    features = Features({"image": Image()})
    dset = Dataset.from_dict(data, features=features)
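    # the array is encoded as PNG and decoded back to a PIL image on access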
    item = dset[0]
    assert item.keys() == {"image"}
    assert isinstance(item["image"], PIL.Image.Image)
    np.testing.assert_array_equal(np.array(item["image"]), image_array)
    assert item["image"].filename == ""
    assert item["image"].format == "PNG"
    assert item["image"].size == (640, 480)
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"image"}
    assert isinstance(batch["image"], list) and all(isinstance(item, PIL.Image.Image) for item in batch["image"])
    np.testing.assert_array_equal(np.array(batch["image"][0]), image_array)
    assert batch["image"][0].filename == ""
    assert batch["image"][0].format == "PNG"
    assert batch["image"][0].size == (640, 480)
    column = dset["image"]
    assert len(column) == 1
    assert isinstance(column, list) and all(isinstance(item, PIL.Image.Image) for item in column)
    np.testing.assert_array_equal(np.array(column[0]), image_array)
    assert column[0].filename == ""
    assert column[0].format == "PNG"
    assert column[0].size == (640, 480)
Example #15
    def test_push_dataset_to_hub_custom_features_image(self):
        image_path = os.path.join(os.path.dirname(__file__), "features",
                                  "data", "test_image_rgb.jpg")
        data = {"x": [image_path, None], "y": [0, -1]}
        features = Features({"x": Image(), "y": Value("int32")})
        ds = Dataset.from_dict(data, features=features)

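        # check both embedding the image bytes on push and keeping external file references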
        for embed_external_files in [True, False]:
            ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
            try:
                ds.push_to_hub(ds_name,
                               embed_external_files=embed_external_files,
                               token=self._token)
                hub_ds = load_dataset(ds_name,
                                      split="train",
                                      download_mode="force_redownload")

                self.assertListEqual(ds.column_names, hub_ds.column_names)
                self.assertListEqual(list(ds.features.keys()),
                                     list(hub_ds.features.keys()))
                self.assertDictEqual(ds.features, hub_ds.features)
                self.assertEqual(ds[:], hub_ds[:])
                hub_ds = hub_ds.cast_column("x", Image(decode=False))
                elem = hub_ds[0]["x"]
                path, bytes_ = elem["path"], elem["bytes"]
                self.assertTrue(bool(path) == (not embed_external_files))
                self.assertTrue(bool(bytes_) == embed_external_files)
            finally:
                self._api.delete_repo(ds_name.split("/")[1],
                                      organization=ds_name.split("/")[0],
                                      token=self._token,
                                      repo_type="dataset")
Example #16
    def test_push_dataset_to_hub_custom_features_audio(self):
        audio_path = os.path.join(os.path.dirname(__file__), "features", "data", "test_audio_44100.wav")
        data = {"x": [audio_path, None], "y": [0, -1]}
        features = Features({"x": Audio(), "y": Value("int32")})
        ds = Dataset.from_dict(data, features=features)

        for embed_external_files in [True, False]:
            ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
            try:
                ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token)
                hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload")

                self.assertListEqual(ds.column_names, hub_ds.column_names)
                self.assertListEqual(list(ds.features.keys()), list(hub_ds.features.keys()))
                self.assertDictEqual(ds.features, hub_ds.features)
                np.testing.assert_equal(ds[0]["x"]["array"], hub_ds[0]["x"]["array"])
                self.assertEqual(
                    ds[1], hub_ds[1]
                )  # don't test hub_ds[0] since audio decoding might be slightly different
                hub_ds = hub_ds.cast_column("x", Audio(decode=False))
                elem = hub_ds[0]["x"]
                path, bytes_ = elem["path"], elem["bytes"]
                self.assertTrue(bool(path) == (not embed_external_files))
                self.assertTrue(bool(bytes_) == embed_external_files)
            finally:
                self.cleanup_repo(ds_name)
Example #17
def build_dataset(df, tokenizer, batch_size):
    features = Features({
        'id': Value('uint64'),
        'context': Value('string'),
        'text': Value('string'),
    })

    dataset = Dataset.from_pandas(df, features=features)

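    # tokenize (text, context) pairs in batches, padding each batch to its longest sequence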
    dataset = dataset.map(
        lambda x: tokenizer(x["text"],
                            x["context"],
                            padding="longest",
                            truncation='longest_first'),
        batched=True,
        batch_size=batch_size,
    )

    def format_dataset(dataset):
        dataset.set_format(
            type='torch',
            columns=['input_ids', 'token_type_ids', 'attention_mask'])
        return dataset

    dataset = format_dataset(dataset)

    return dataset
Example #18
def test_dataset_with_image_feature_tar_jpg(tar_jpg_path):
    import PIL.Image

    data = {"image": []}
    for file_path, file_obj in iter_archive(tar_jpg_path):
        data["image"].append({"path": file_path, "bytes": file_obj.read()})
        break

    features = Features({"image": Image()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"image"}
    assert isinstance(item["image"], PIL.Image.Image)
    assert item["image"].filename == ""
    assert item["image"].format == "JPEG"
    assert item["image"].size == (640, 480)
    assert item["image"].mode == "RGB"
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"image"}
    assert isinstance(batch["image"], list) and all(isinstance(item, PIL.Image.Image) for item in batch["image"])
    assert batch["image"][0].filename == ""
    assert batch["image"][0].format == "JPEG"
    assert batch["image"][0].size == (640, 480)
    assert batch["image"][0].mode == "RGB"
    column = dset["image"]
    assert len(column) == 1
    assert isinstance(column, list) and all(isinstance(item, PIL.Image.Image) for item in column)
    assert column[0].filename == ""
    assert column[0].format == "JPEG"
    assert column[0].size == (640, 480)
    assert column[0].mode == "RGB"
Example #19
File: test_hf.py  Project: TakeLab/podium
def complex_dataset():
    features = {
        "translation": Translation(languages=("en", "fr")),
        "sentiment": ClassLabel(num_classes=2),
    }

    return datasets.Dataset.from_dict(COMPLEX_DATA, Features(features))
Example #20
    def test_push_dataset_dict_to_hub_custom_features(self):
        features = Features({
            "x": Value("int64"),
            "y": ClassLabel(names=["neg", "pos"])
        })
        ds = Dataset.from_dict({
            "x": [1, 2, 3],
            "y": [0, 0, 1]
        },
                               features=features)

        local_ds = DatasetDict({"test": ds})

        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            local_ds.push_to_hub(ds_name, token=self._token)
            hub_ds = load_dataset(ds_name, download_mode="force_redownload")

            self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
            self.assertListEqual(list(local_ds["test"].features.keys()),
                                 list(hub_ds["test"].features.keys()))
            self.assertDictEqual(local_ds["test"].features,
                                 hub_ds["test"].features)
        finally:
            self._api.delete_repo(ds_name.split("/")[1],
                                  organization=ds_name.split("/")[0],
                                  token=self._token,
                                  repo_type="dataset")
Example #21
def save_data(train_df, val_df):
    train_f = Features({
        'answers':
        Sequence(feature={
            'text': Value(dtype='string', id=None),
            'answer_start': Value(dtype='int32', id=None)
        },
                 length=-1,
                 id=None),
        'context':
        Value(dtype='string', id=None),
        'id':
        Value(dtype='string', id=None),
        'question':
        Value(dtype='string', id=None),
        'question_type':
        Value(dtype='int32', id=None)
    })

    train_datasets = DatasetDict({
        'train':
        Dataset.from_pandas(train_df, features=train_f),
        'validation':
        Dataset.from_pandas(val_df, features=train_f)
    })
    file = open("../../data/question_type.pkl", "wb")
    pickle.dump(train_datasets, file)
    file.close()
Example #22
def test_dataset_with_image_feature(shared_datadir):
    import PIL.Image

    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data = {"image": [image_path]}
    features = Features({"image": Image()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"image"}
    assert isinstance(item["image"], PIL.Image.Image)
    assert os.path.samefile(item["image"].filename, image_path)
    assert item["image"].format == "JPEG"
    assert item["image"].size == (640, 480)
    assert item["image"].mode == "RGB"
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"image"}
    assert isinstance(batch["image"], list) and all(isinstance(item, PIL.Image.Image) for item in batch["image"])
    assert os.path.samefile(batch["image"][0].filename, image_path)
    assert batch["image"][0].format == "JPEG"
    assert batch["image"][0].size == (640, 480)
    assert batch["image"][0].mode == "RGB"
    column = dset["image"]
    assert len(column) == 1
    assert isinstance(column, list) and all(isinstance(item, PIL.Image.Image) for item in column)
    assert os.path.samefile(column[0].filename, image_path)
    assert column[0].format == "JPEG"
    assert column[0].size == (640, 480)
    assert column[0].mode == "RGB"
Example #23
def test_formatted_dataset_with_image_feature_undecoded(shared_datadir):
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data = {"image": [image_path]}
    features = Features({"image": Image(decode=False)})
    dset = Dataset.from_dict(data, features=features)
    with dset.formatted_as("numpy"):
        item = dset[0]
        assert item.keys() == {"image"}
        assert item["image"] == {"path": image_path, "bytes": None}
        batch = dset[:1]
        assert batch.keys() == {"image"}
        assert len(batch["image"]) == 1
        assert batch["image"][0] == {"path": image_path, "bytes": None}
        column = dset["image"]
        assert len(column) == 1
        assert column[0] == {"path": image_path, "bytes": None}

    with dset.formatted_as("pandas"):
        item = dset[0]
        assert item.shape == (1, 1)
        assert item.columns == ["image"]
        assert item["image"][0] == {"path": image_path, "bytes": None}
        batch = dset[:1]
        assert batch.shape == (1, 1)
        assert batch.columns == ["image"]
        assert batch["image"][0] == {"path": image_path, "bytes": None}
        column = dset["image"]
        assert len(column) == 1
        assert column[0] == {"path": image_path, "bytes": None}
Example #24
    def _set_features(self):
        """Set the features of the dataset."""
        with self.format():
            self.info.features = Features.from_arrow_schema(
                pa.Table.from_pydict(
                    self[:1],
                ).schema
            )
Example #25
def get_etr_dataset(args):
    etr_path = p.join(args.path.train_data_dir, "etr_qa_dataset.json")

    if not p.exists(etr_path):
        raise FileNotFoundError(
            f"Please rename the ETRI dataset file to {etr_path} and add the data.")

    with open(etr_path, "r") as f:
        etr_dict = json.load(f)

    #  print(etr_dict["data"][0])
    new_dataset = defaultdict(list)

    cnt = 0

    for datas in etr_dict["data"]:
        title = datas["title"]
        context = datas["paragraphs"][0]["context"]

        for questions in datas["paragraphs"][0]["qas"]:
            question = questions["question"]
            answers = {
                "answer_start": [questions["answers"][0]["answer_start"]],
                "text": [questions["answers"][0]["text"]],
            }

            new_dataset["id"].append(f"etr-custom-{cnt}")
            new_dataset["title"].append(title)
            new_dataset["context"].append(context)
            new_dataset["question"].append(question)
            new_dataset["answers"].append(answers)

            cnt += 1

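    # SQuAD-style features: "answers" is a sequence of (text, answer_start) pairs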
    f = Features({
        "answers":
        Sequence(
            feature={
                "text": Value(dtype="string", id=None),
                "answer_start": Value(dtype="int32", id=None)
            },
            length=-1,
            id=None,
        ),
        "id":
        Value(dtype="string", id=None),
        "context":
        Value(dtype="string", id=None),
        "question":
        Value(dtype="string", id=None),
        "title":
        Value(dtype="string", id=None),
    })

    df = pd.DataFrame(new_dataset)
    etr_dataset = Dataset.from_pandas(df, features=f)

    return etr_dataset
Example #26
def test_dataset_with_image_feature_map(shared_datadir):
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data = {"image": [image_path], "caption": ["cats sleeping"]}
    features = Features({"image": Image(), "caption": Value("string")})
    dset = Dataset.from_dict(data, features=features)

    for item in dset:
        assert item.keys() == {"image", "caption"}
        assert item == {
            "image": {
                "path": image_path,
                "bytes": None
            },
            "caption": "cats sleeping"
        }

    # no decoding

    def process_caption(example):
        example["caption"] = "Two " + example["caption"]
        return example

    processed_dset = dset.map(process_caption)
    for item in processed_dset:
        assert item.keys() == {"image", "caption"}
        assert item == {
            "image": {
                "path": image_path,
                "bytes": None
            },
            "caption": "Two cats sleeping"
        }

    # decoding example

    def process_image_by_example(example):
        example["mode"] = example["image"].mode
        return example

    decoded_dset = dset.map(process_image_by_example)
    for item in decoded_dset:
        assert item.keys() == {"image", "caption", "mode"}
        assert os.path.samefile(item["image"]["path"], image_path)
        assert item["caption"] == "cats sleeping"
        assert item["mode"] == "RGB"

    # decoding batch

    def process_image_by_batch(batch):
        batch["mode"] = [image.mode for image in batch["image"]]
        return batch

    decoded_dset = dset.map(process_image_by_batch, batched=True)
    for item in decoded_dset:
        assert item.keys() == {"image", "caption", "mode"}
        assert os.path.samefile(item["image"]["path"], image_path)
        assert item["caption"] == "cats sleeping"
        assert item["mode"] == "RGB"
Example #27
    def test_caching(self):
        n_rows = 10

        features = Features({"foo": Value("string"), "bar": Value("string")})

        with tempfile.TemporaryDirectory() as tmp_dir:
            # Use \n for newline. Windows automatically adds the \r when writing the file
            # see https://docs.python.org/3/library/os.html#os.linesep
            open(os.path.join(tmp_dir, "table.csv"), "w",
                 encoding="utf-8").write("\n".join(",".join(["foo", "bar"])
                                                   for _ in range(n_rows + 1)))
            ds = load_dataset(
                "csv",
                data_files=os.path.join(tmp_dir, "table.csv"),
                cache_dir=tmp_dir,
                split="train",
                keep_in_memory=False,
            )
            data_file = ds.cache_files[0]["filename"]
            fingerprint = ds._fingerprint
            self.assertEqual(len(ds), n_rows)
            del ds
            ds = load_dataset(
                "csv",
                data_files=os.path.join(tmp_dir, "table.csv"),
                cache_dir=tmp_dir,
                split="train",
                keep_in_memory=False,
            )
            self.assertEqual(ds.cache_files[0]["filename"], data_file)
            self.assertEqual(ds._fingerprint, fingerprint)
            del ds
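            # passing explicit features changes the fingerprint, so a new cache file is created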
            ds = load_dataset(
                "csv",
                data_files=os.path.join(tmp_dir, "table.csv"),
                cache_dir=tmp_dir,
                split="train",
                features=features,
                keep_in_memory=False,
            )
            self.assertNotEqual(ds.cache_files[0]["filename"], data_file)
            self.assertNotEqual(ds._fingerprint, fingerprint)
            del ds

            open(os.path.join(tmp_dir, "table.csv"), "w",
                 encoding="utf-8").write("\n".join(",".join(["Foo", "Bar"])
                                                   for _ in range(n_rows + 1)))
            ds = load_dataset(
                "csv",
                data_files=os.path.join(tmp_dir, "table.csv"),
                cache_dir=tmp_dir,
                split="train",
                keep_in_memory=False,
            )
            self.assertNotEqual(ds.cache_files[0]["filename"], data_file)
            self.assertNotEqual(ds._fingerprint, fingerprint)
            self.assertEqual(len(ds), n_rows)
            del ds
Example #28
def test_parquet_datasetdict_reader_features(features, parquet_path, tmp_path):
    cache_dir = tmp_path / "cache"
    default_expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None
    )
    dataset = ParquetDatasetReader({"train": parquet_path}, features=features, cache_dir=cache_dir).read()
    _check_parquet_datasetdict(dataset, expected_features)
Example #29
def test_dataset_from_text_features(features, text_path, tmp_path):
    cache_dir = tmp_path / "cache"
    default_expected_features = {"text": "string"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None
    )
    dataset = TextDatasetReader(text_path, features=features, cache_dir=cache_dir).read()
    _check_text_dataset(dataset, expected_features)
Example #30
def test_dataset_from_json_features(features, jsonl_path, tmp_path):
    cache_dir = tmp_path / "cache"
    default_expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None
    )
    dataset = JsonDatasetReader(jsonl_path, features=features, cache_dir=cache_dir).read()
    _check_json_dataset(dataset, expected_features)