Example #1
def test_formatted_dataset_with_audio_feature_undecoded(shared_datadir):
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    data = {"audio": [audio_path]}
    features = Features({"audio": Audio(decode=False)})
    dset = Dataset.from_dict(data, features=features)
    with dset.formatted_as("numpy"):
        item = dset[0]
        assert item.keys() == {"audio"}
        assert item["audio"] == {"path": audio_path, "bytes": None}
        batch = dset[:1]
        assert batch.keys() == {"audio"}
        assert len(batch["audio"]) == 1
        assert batch["audio"][0] == {"path": audio_path, "bytes": None}
        column = dset["audio"]
        assert len(column) == 1
        assert column[0] == {"path": audio_path, "bytes": None}

    with dset.formatted_as("pandas"):
        item = dset[0]
        assert item.shape == (1, 1)
        assert item.columns == ["audio"]
        assert item["audio"][0] == {"path": audio_path, "bytes": None}
        batch = dset[:1]
        assert batch.shape == (1, 1)
        assert batch.columns == ["audio"]
        assert batch["audio"][0] == {"path": audio_path, "bytes": None}
        column = dset["audio"]
        assert len(column) == 1
        assert column[0] == {"path": audio_path, "bytes": None}
Example #2
def test_dataset_with_audio_feature_map_is_decoded(shared_datadir):
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    data = {"audio": [audio_path], "text": ["Hello"]}
    features = Features({"audio": Audio(), "text": Value("string")})
    dset = Dataset.from_dict(data, features=features)

    def process_audio_sampling_rate_by_example(example):
        example["double_sampling_rate"] = 2 * example["audio"]["sampling_rate"]
        return example

    decoded_dset = dset.map(process_audio_sampling_rate_by_example)
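    # iterate with decoded=False to inspect the stored rows without decoding the audio column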
    for item in decoded_dset._iter(decoded=False):
        assert item.keys() == {"audio", "text", "double_sampling_rate"}
        assert item["double_sampling_rate"] == 88200

    def process_audio_sampling_rate_by_batch(batch):
        double_sampling_rates = []
        for audio in batch["audio"]:
            double_sampling_rates.append(2 * audio["sampling_rate"])
        batch["double_sampling_rate"] = double_sampling_rates
        return batch

    decoded_dset = dset.map(process_audio_sampling_rate_by_batch, batched=True)
    for item in decoded_dset._iter(decoded=False):
        assert item.keys() == {"audio", "text", "double_sampling_rate"}
        assert item["double_sampling_rate"] == 88200
Example #3
    def test_push_dataset_dict_to_hub_name_without_namespace(self):
        ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})

        local_ds = DatasetDict({"train": ds})

        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            local_ds.push_to_hub(ds_name.split("/")[-1], token=self._token)
            hub_ds = load_dataset(ds_name, download_mode="force_redownload")

            self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
            self.assertListEqual(list(local_ds["train"].features.keys()),
                                 list(hub_ds["train"].features.keys()))
            self.assertDictEqual(local_ds["train"].features,
                                 hub_ds["train"].features)

            # Ensure that there is a single data file on the repository and that it has the correct name
            files = sorted(
                self._api.list_repo_files(ds_name, repo_type="dataset"))
            self.assertListEqual(files, [
                ".gitattributes", "data/train-00000-of-00001.parquet",
                "dataset_infos.json"
            ])
        finally:
            self._api.delete_repo(ds_name.split("/")[1],
                                  organization=ds_name.split("/")[0],
                                  repo_type="dataset")
Example #4
def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path):
    audio_filename = "test_audio_44100.mp3"
    data = {"audio": []}
    for file_path, file_obj in iter_archive(tar_mp3_path):
        data["audio"].append({"path": file_path, "bytes": file_obj.read()})
        break
    features = Features({"audio": Audio()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"audio"}
    assert item["audio"].keys() == {"path", "array", "sampling_rate"}
    assert item["audio"]["path"] == audio_filename
    assert item["audio"]["array"].shape == (109440, )
    assert item["audio"]["sampling_rate"] == 44100
    batch = dset[:1]
    assert batch.keys() == {"audio"}
    assert len(batch["audio"]) == 1
    assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"}
    assert batch["audio"][0]["path"] == audio_filename
    assert batch["audio"][0]["array"].shape == (109440, )
    assert batch["audio"][0]["sampling_rate"] == 44100
    column = dset["audio"]
    assert len(column) == 1
    assert column[0].keys() == {"path", "array", "sampling_rate"}
    assert column[0]["path"] == audio_filename
    assert column[0]["array"].shape == (109440, )
    assert column[0]["sampling_rate"] == 44100
Example #5
    def test_push_dataset_to_hub(self):
        local_ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})

        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            local_ds.push_to_hub(ds_name, split="train", token=self._token)
            local_ds_dict = {"train": local_ds}
            hub_ds_dict = load_dataset(ds_name,
                                       download_mode="force_redownload")

            self.assertListEqual(list(local_ds_dict.keys()),
                                 list(hub_ds_dict.keys()))

            for ds_split_name in local_ds_dict.keys():
                local_ds = local_ds_dict[ds_split_name]
                hub_ds = hub_ds_dict[ds_split_name]
                self.assertListEqual(local_ds.column_names,
                                     hub_ds.column_names)
                self.assertListEqual(list(local_ds.features.keys()),
                                     list(hub_ds.features.keys()))
                self.assertDictEqual(local_ds.features, hub_ds.features)
        finally:
            self._api.delete_repo(ds_name.split("/")[1],
                                  organization=ds_name.split("/")[0],
                                  token=self._token,
                                  repo_type="dataset")
Example #6
    def test_push_dataset_dict_to_hub_custom_features(self):
        features = Features({
            "x": Value("int64"),
            "y": ClassLabel(names=["neg", "pos"])
        })
        ds = Dataset.from_dict({
            "x": [1, 2, 3],
            "y": [0, 0, 1]
        },
                               features=features)

        local_ds = DatasetDict({"test": ds})

        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            local_ds.push_to_hub(ds_name, token=self._token)
            hub_ds = load_dataset(ds_name, download_mode="force_redownload")

            self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
            self.assertListEqual(list(local_ds["test"].features.keys()),
                                 list(hub_ds["test"].features.keys()))
            self.assertDictEqual(local_ds["test"].features,
                                 hub_ds["test"].features)
        finally:
            self._api.delete_repo(ds_name.split("/")[1],
                                  organization=ds_name.split("/")[0],
                                  token=self._token,
                                  repo_type="dataset")
Example #7
    def get_dummy_dataset(self):
        dataset = Dataset.from_dict({
            "id": ["0", "1"],
            "question": ["foo", "bar"],
            "answers": [["Foo", "Bar"], ["Bar"]],
        })
        return dataset
Example #8
def test_resampling_after_loading_dataset_with_audio_feature_mp3(
        shared_datadir):
    audio_path = str(shared_datadir / "test_audio_44100.mp3")
    data = {"audio": [audio_path]}
    features = Features({"audio": Audio()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item["audio"]["sampling_rate"] == 44100
    dset = dset.cast_column("audio", Audio(sampling_rate=16000))
    item = dset[0]
    assert item.keys() == {"audio"}
    assert item["audio"].keys() == {"path", "array", "sampling_rate"}
    assert item["audio"]["path"] == audio_path
    assert item["audio"]["array"].shape == (39707, )
    assert item["audio"]["sampling_rate"] == 16000
    batch = dset[:1]
    assert batch.keys() == {"audio"}
    assert len(batch["audio"]) == 1
    assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"}
    assert batch["audio"][0]["path"] == audio_path
    assert batch["audio"][0]["array"].shape == (39707, )
    assert batch["audio"][0]["sampling_rate"] == 16000
    column = dset["audio"]
    assert len(column) == 1
    assert column[0].keys() == {"path", "array", "sampling_rate"}
    assert column[0]["path"] == audio_path
    assert column[0]["array"].shape == (39707, )
    assert column[0]["sampling_rate"] == 16000
Example #9
    def predict(cls, path_to_model, input):
        """For the input, do the predictions and return them.
        Args:
            input (a pandas dataframe): The data on which to do the predictions. There will be
                one prediction per row in the dataframe"""

        trainer, config, data_args, tokenizer = cls.get_model(path_to_model)

        text_column_name = "text"
        label_column_name = "labels"
        bbox_columns_name = "bbox"
        pred_dataset = Dataset.from_dict(input)
        tokenized_datasets = pred_dataset.map(
            lambda x: preprocess_dataset(
                x, tokenizer, config.label2id, data_args["label_all_tokens"],
                "max_length" if data_args["pad_to_max_length"] else False,
                data_args["use_bbox"], data_args["task_name"]),
            #remove_columns=[label_column_name], #todo: check if label_column_name in dataset before removing it
            batched=True,
            num_proc=data_args["preprocessing_num_workers"],
            load_from_cache_file=not data_args["overwrite_cache"],
        )
        logger.info("Datasets %s", tokenized_datasets)
        logger.info("Column names %s", tokenized_datasets.column_names)
        logger.info("Sample example %s", tokenized_datasets[0])

        # Get predictions
        true_predictions = None
        probas = None
        predictions, labels, _ = trainer.predict(tokenized_datasets,
                                                 metric_key_prefix="pred")
        if data_args["task_name"] == "classif":
            true_predictions = [
                config.id2label[p] for p in np.argmax(predictions, axis=1)
            ]
            probas = np.amax(softmax(predictions, axis=1), axis=1).tolist()
        elif data_args["task_name"] == "multilabel-classif":
            predictions = 1 / (1 + np.exp(-predictions))  # sigmoid
            predictions = (predictions > 0.5)  # threshold
            true_predictions = [[
                config.id2label[i] for i in np.where(p == 1)[0]
            ] for p in predictions]
        elif data_args["task_name"] == "regression":
            true_predictions = np.squeeze(predictions)
        elif data_args["task_name"] == "ner":
            predictions = np.argmax(predictions, axis=2)
            true_predictions = [[
                config.id2label[p] for (p, l) in zip(prediction, label)
                if l != -100
            ] for prediction, label in zip(predictions, labels)]

        logger.info("true_predictions %s", true_predictions)
        result = {}
        if true_predictions is not None:
            result["pred"] = true_predictions
        if probas is not None:
            result["proba"] = probas
        return result
Example #10
def test_formatted_dataset_with_image_feature(shared_datadir):
    import PIL.Image

    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data = {"image": [image_path, image_path]}
    features = Features({"image": Image()})
    dset = Dataset.from_dict(data, features=features)
    with dset.formatted_as("numpy"):
        item = dset[0]
        assert item.keys() == {"image"}
        assert isinstance(item["image"], PIL.Image.Image)
        assert os.path.samefile(item["image"].filename, image_path)
        assert item["image"].format == "JPEG"
        assert item["image"].size == (640, 480)
        assert item["image"].mode == "RGB"
        batch = dset[:1]
        assert batch.keys() == {"image"}
        assert len(batch) == 1
        assert isinstance(batch["image"], list) and all(
            isinstance(item, PIL.Image.Image) for item in batch["image"])
        assert os.path.samefile(batch["image"][0].filename, image_path)
        assert batch["image"][0].format == "JPEG"
        assert batch["image"][0].size == (640, 480)
        assert batch["image"][0].mode == "RGB"
        column = dset["image"]
        assert len(column) == 2
        assert isinstance(column, list) and all(
            isinstance(item, PIL.Image.Image) for item in column)
        assert os.path.samefile(column[0].filename, image_path)
        assert column[0].format == "JPEG"
        assert column[0].size == (640, 480)
        assert column[0].mode == "RGB"

    with dset.formatted_as("pandas"):
        item = dset[0]
        assert item.shape == (1, 1)
        assert item.columns == ["image"]
        assert isinstance(item["image"][0], PIL.Image.Image)
        assert os.path.samefile(item["image"][0].filename, image_path)
        assert item["image"][0].format == "JPEG"
        assert item["image"][0].size == (640, 480)
        assert item["image"][0].mode == "RGB"
        batch = dset[:1]
        assert batch.shape == (1, 1)
        assert batch.columns == ["image"]
        assert isinstance(batch["image"], pd.Series) and all(
            isinstance(item, PIL.Image.Image) for item in batch["image"])
        assert os.path.samefile(batch["image"][0].filename, image_path)
        assert batch["image"][0].format == "JPEG"
        assert batch["image"][0].size == (640, 480)
        assert batch["image"][0].mode == "RGB"
        column = dset["image"]
        assert len(column) == 2
        assert isinstance(column, pd.Series) and all(
            isinstance(item, PIL.Image.Image) for item in column)
        assert os.path.samefile(column[0].filename, image_path)
        assert column[0].format == "JPEG"
        assert column[0].size == (640, 480)
        assert column[0].mode == "RGB"
Example #11
def get_wrong_dataset_TUWS():

    labeled, _ = correct_examples()

    unlabeled = Dataset.from_dict({
        'sentence': ['moon what??.', 'I am people'],
        'label': [-1, 0]
    })

    train_dic = Dataset.from_dict({
        'sentence':
        labeled['sentence'] + unlabeled['sentence'],
        'label':
        labeled['label'] + unlabeled['label']
    })

    return DatasetDict({'train': train_dic})
Example #12
def add_chinese_references(dataset, ref_file):
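    # one JSON-encoded reference per non-empty line, aligned one-to-one with the dataset rows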
    with open(ref_file, "r", encoding="utf-8") as f:
        refs = [json.loads(line) for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
    assert len(dataset) == len(refs)

    dataset_dict = {c: dataset[c] for c in dataset.column_names}
    dataset_dict["chinese_ref"] = refs
    return Dataset.from_dict(dataset_dict)
Example #13
    def load_data(
        self,
        inputs: List[str],
        targets: Optional[List[Any]] = None,
        target_formatter: Optional[TargetFormatter] = None,
    ) -> Dataset:
        if targets is not None:
            hf_dataset = Dataset.from_dict({
                DataKeys.INPUT: inputs,
                DataKeys.TARGET: targets
            })
        else:
            hf_dataset = Dataset.from_dict({DataKeys.INPUT: inputs})
        return super().load_data(hf_dataset,
                                 DataKeys.INPUT,
                                 DataKeys.TARGET,
                                 target_formatter=target_formatter)
Example #14
def get_dataset():
    data_dict = {
        "repo_name": ["test_repo1", "test_repo2", "test_repo3"],
        "path": ["test_1.py", "test_2.py", "unit_test.py"],
        "content": ["a " * 20, "a " * 30, "b " * 7],
    }
    dataset = Dataset.from_dict(data_dict)
    return dataset
Example #15
def test_dataset_with_image_feature_map(shared_datadir):
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data = {"image": [image_path], "caption": ["cats sleeping"]}
    features = Features({"image": Image(), "caption": Value("string")})
    dset = Dataset.from_dict(data, features=features)

    for item in dset:
        assert item.keys() == {"image", "caption"}
        assert item == {
            "image": {
                "path": image_path,
                "bytes": None
            },
            "caption": "cats sleeping"
        }

    # no decoding

    def process_caption(example):
        example["caption"] = "Two " + example["caption"]
        return example

    processed_dset = dset.map(process_caption)
    for item in processed_dset:
        assert item.keys() == {"image", "caption"}
        assert item == {
            "image": {
                "path": image_path,
                "bytes": None
            },
            "caption": "Two cats sleeping"
        }

    # decoding example

    def process_image_by_example(example):
        example["mode"] = example["image"].mode
        return example

    decoded_dset = dset.map(process_image_by_example)
    for item in decoded_dset:
        assert item.keys() == {"image", "caption", "mode"}
        assert os.path.samefile(item["image"]["path"], image_path)
        assert item["caption"] == "cats sleeping"
        assert item["mode"] == "RGB"

    # decoding batch

    def process_image_by_batch(batch):
        batch["mode"] = [image.mode for image in batch["image"]]
        return batch

    decoded_dset = dset.map(process_image_by_batch, batched=True)
    for item in decoded_dset:
        assert item.keys() == {"image", "caption", "mode"}
        assert os.path.samefile(item["image"]["path"], image_path)
        assert item["caption"] == "cats sleeping"
        assert item["mode"] == "RGB"
Example #16
def test_dataset_concatenate_nested_image_features(shared_datadir):
    # data1 and data2 use different structures (file path vs. raw bytes) to make sure they are compatible with each other
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    features = Features({"list_of_structs_of_images": [{"image": Image()}]})
    data1 = {"list_of_structs_of_images": [[{"image": image_path}]]}
    dset1 = Dataset.from_dict(data1, features=features)
    data2 = {"list_of_structs_of_images": [[{"image": {"bytes": open(image_path, "rb").read()}}]]}
    dset2 = Dataset.from_dict(data2, features=features)
    concatenated_dataset = concatenate_datasets([dset1, dset2])
    assert len(concatenated_dataset) == len(dset1) + len(dset2)
    assert (
        concatenated_dataset[0]["list_of_structs_of_images"][0]["image"]
        == dset1[0]["list_of_structs_of_images"][0]["image"]
    )
    assert (
        concatenated_dataset[1]["list_of_structs_of_images"][0]["image"]
        == dset2[0]["list_of_structs_of_images"][0]["image"]
    )
Example #17
def test_dataset_cast_to_audio_features(shared_datadir, build_data):
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    data = build_data(audio_path)
    dset = Dataset.from_dict(data)
    item = dset.cast(Features({"audio": Audio()}))[0]
    assert item.keys() == {"audio"}
    assert item["audio"].keys() == {"path", "array", "sampling_rate"}
    item = dset.cast_column("audio", Audio())[0]
    assert item.keys() == {"audio"}
    assert item["audio"].keys() == {"path", "array", "sampling_rate"}
Example #18
def load_dataset_from_disk(args):
    datas = DatasetDict()
    if args.train_file is not None and args.train:
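        # training: load the raw JSON, then shuffle and split it into train/dev parts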
        train_json = json.load(open(args.train_file))
        train_json, dev_json = shuffle_data(train_json, args.split)
        datas["train"] = Dataset.from_dict(convert_data_structure(train_json))
        datas["dev"] = Dataset.from_dict(convert_data_structure(dev_json))
        logger.info(f"Load train data number: {datas['train'].num_rows}")
        logger.info(f"Load dev data number: {datas['dev'].num_rows}")
    if args.eval and not args.train:
        train_json = json.load(open(args.train_file))
        train_json, dev_json = shuffle_data(train_json, args.split)
        datas['dev'] = Dataset.from_dict(convert_data_structure(dev_json))
        logger.info(f"Load dev data number: {datas['dev'].num_rows}")
    if args.predict_file is not None and args.predict:
        predict_json = json.load(open(args.predict_file))
        datas["predict"] = Dataset.from_dict(convert_data_structure(predict_json, ispredict=True))
        logger.info(f"Load predict data number: {datas['predict'].num_rows}")
    return datas
Example #19
    def get_dummy_dataset(self):
        dataset = Dataset.from_dict(
            {
                "id": ["0", "1"],
                "text": ["foo", "bar"],
                "title": ["Foo", "Bar"],
                "embeddings": [np.ones(self.retrieval_vector_size), 2 * np.ones(self.retrieval_vector_size)],
            }
        )
        # index the "embeddings" column with an exact (Flat) inner-product FAISS index
        dataset.add_faiss_index("embeddings", string_factory="Flat", metric_type=faiss.METRIC_INNER_PRODUCT)
        return dataset
Example #20
    def load_data(self,
                  data: str,
                  dataset: Optional[Any] = None) -> "datasets.Dataset":
        stage = self._running_stage.value

        file_path = data

        path = Path(file_path)
        with open(path, "rb") as f:
            squad_v_2_dict = json.load(f)

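        # flatten the nested SQuAD-style JSON into parallel per-question lists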
        ids = []
        titles = []
        contexts = []
        questions = []
        answers = []
        for topic in squad_v_2_dict["data"]:
            title = topic["title"]
            for comprehension in topic["paragraphs"]:
                context = comprehension["context"]
                for qa in comprehension["qas"]:
                    question = qa["question"]
                    id = qa["id"]

                    _answer_starts = [
                        answer["answer_start"] for answer in qa["answers"]
                    ]
                    _answers = [answer["text"] for answer in qa["answers"]]

                    ids.append(id)
                    titles.append(title)
                    contexts.append(context)
                    questions.append(question)
                    answers.append(
                        dict(text=_answers, answer_start=_answer_starts))

        dataset_dict = DatasetDict({
            stage:
            Dataset.from_dict({
                "id": ids,
                "title": titles,
                "context": contexts,
                "question": questions,
                "answer": answers
            })
        })

        column_names = dataset_dict[stage].column_names

        dataset_dict = dataset_dict.map(self._tokenize_fn,
                                        batched=True,
                                        remove_columns=column_names)

        return dataset_dict[stage]
Example #21
def test_dataset_cast_to_image_features(shared_datadir, build_data):
    import PIL.Image

    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data = build_data(image_path)
    dset = Dataset.from_dict(data)
    item = dset.cast(Features({"image": Image()}))[0]
    assert item.keys() == {"image"}
    assert isinstance(item["image"], PIL.Image.Image)
    item = dset.cast_column("image", Image())[0]
    assert item.keys() == {"image"}
    assert isinstance(item["image"], PIL.Image.Image)
Example #22
def test_dataset_concatenate_nested_audio_features(shared_datadir):
    # data1 and data2 use different structures (file path vs. raw bytes) to make sure they are compatible with each other
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    features = Features({"list_of_structs_of_audios": [{"audio": Audio()}]})
    data1 = {"list_of_structs_of_audios": [[{"audio": audio_path}]]}
    dset1 = Dataset.from_dict(data1, features=features)
    data2 = {
        "list_of_structs_of_audios": [[{
            "audio": {
                "bytes": open(audio_path, "rb").read()
            }
        }]]
    }
    dset2 = Dataset.from_dict(data2, features=features)
    concatenated_dataset = concatenate_datasets([dset1, dset2])
    assert len(concatenated_dataset) == len(dset1) + len(dset2)
    assert (concatenated_dataset[0]["list_of_structs_of_audios"][0]["audio"]
            ["array"].shape == dset1[0]["list_of_structs_of_audios"][0]
            ["audio"]["array"].shape)
    assert (concatenated_dataset[1]["list_of_structs_of_audios"][0]["audio"]
            ["array"].shape == dset2[0]["list_of_structs_of_audios"][0]
            ["audio"]["array"].shape)
Example #23
    def load_data(
        self,
        data: Dict[str, Any],
        question_column_name: str = "question",
        context_column_name: str = "context",
        answer_column_name: str = "answer",
    ) -> Dataset:
        return super().load_data(
            Dataset.from_dict(data),
            question_column_name=question_column_name,
            context_column_name=context_column_name,
            answer_column_name=answer_column_name,
        )
Example #24
def test_enable_disable_progress_bar():
    dset = Dataset.from_dict({"col_1": [3, 2, 0, 1]})

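    # patch tqdm to check whether map creates a progress bar in each mode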
    with patch("tqdm.auto.tqdm") as mock_tqdm:
        datasets.disable_progress_bar()
        dset.map(lambda x: {"col_2": x["col_1"] + 1})
        mock_tqdm.assert_not_called()

        mock_tqdm.reset_mock()

        datasets.enable_progress_bar()
        dset.map(lambda x: {"col_2": x["col_1"] + 1})
        mock_tqdm.assert_called()
Example #25
    def test_push_dataset_to_hub_custom_splits(self):
        ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})

        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            ds.push_to_hub(ds_name, split="random", token=self._token)
            hub_ds = load_dataset(ds_name, download_mode="force_redownload")

            self.assertListEqual(ds.column_names, hub_ds["random"].column_names)
            self.assertListEqual(list(ds.features.keys()), list(hub_ds["random"].features.keys()))
            self.assertDictEqual(ds.features, hub_ds["random"].features)
        finally:
            self.cleanup_repo(ds_name)
Example #26
    def load_data(self,
                  data: Any,
                  columns: List[str] = None) -> "datasets.Dataset":
        stage = self._running_stage.value

        dataset_dict = DatasetDict({stage: Dataset.from_dict(data)})

        column_names = dataset_dict[stage].column_names

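        # tokenize in batches and drop the original raw columns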
        dataset_dict = dataset_dict.map(self._tokenize_fn,
                                        batched=True,
                                        remove_columns=column_names)

        return dataset_dict[stage]
Example #27
def get_correct_dataset_TUWS(wrong_key=False):

    labeled, unlabeled = correct_examples()

    train_dic = Dataset.from_dict({
        'sentence':
        labeled['sentence'] + unlabeled['sentence'],
        'label':
        labeled['label'] + unlabeled['label']
    })

    if wrong_key is False:
        return DatasetDict({'train': train_dic})
    else:
        return DatasetDict({'training_Data': train_dic})
Example #28
def test_dataset_with_image_feature_undecoded(shared_datadir):
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data = {"image": [image_path]}
    features = Features({"image": Image(decode=False)})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"image"}
    assert item["image"] == {"path": image_path, "bytes": None}
    batch = dset[:1]
    assert batch.keys() == {"image"}
    assert len(batch["image"]) == 1
    assert batch["image"][0] == {"path": image_path, "bytes": None}
    column = dset["image"]
    assert len(column) == 1
    assert column[0] == {"path": image_path, "bytes": None}
Example #29
    def test_push_dataset_to_hub_custom_features(self):
        features = Features({"x": Value("int64"), "y": ClassLabel(names=["neg", "pos"])})
        ds = Dataset.from_dict({"x": [1, 2, 3], "y": [0, 0, 1]}, features=features)

        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            ds.push_to_hub(ds_name, token=self._token)
            hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload")

            self.assertListEqual(ds.column_names, hub_ds.column_names)
            self.assertListEqual(list(ds.features.keys()), list(hub_ds.features.keys()))
            self.assertDictEqual(ds.features, hub_ds.features)
            self.assertEqual(ds[:], hub_ds[:])
        finally:
            self.cleanup_repo(ds_name)
Example #30
def test_dataset_with_image_feature_map_undecoded(shared_datadir):
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data = {"image": [image_path]}
    features = Features({"image": Image(decode=False)})
    dset = Dataset.from_dict(data, features=features)

    def assert_image_example_undecoded(example):
        assert example["image"] == {"path": image_path, "bytes": None}

    dset.map(assert_image_example_undecoded)

    def assert_image_batch_undecoded(batch):
        for image in batch["image"]:
            assert image == {"path": image_path, "bytes": None}

    dset.map(assert_image_batch_undecoded, batched=True)