def test_formatted_dataset_with_audio_feature_undecoded(shared_datadir):
    """Check that an ``Audio(decode=False)`` column stays a path/bytes dict under the numpy and pandas formats."""
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    undecoded = {"path": audio_path, "bytes": None}
    dset = Dataset.from_dict(
        {"audio": [audio_path]},
        features=Features({"audio": Audio(decode=False)}),
    )

    with dset.formatted_as("numpy"):
        # Single row.
        row = dset[0]
        assert row.keys() == {"audio"}
        assert row["audio"] == undecoded
        # One-row slice.
        rows = dset[:1]
        assert rows.keys() == {"audio"}
        assert len(rows["audio"]) == 1
        assert rows["audio"][0] == undecoded
        # Whole column.
        values = dset["audio"]
        assert len(values) == 1
        assert values[0] == undecoded

    with dset.formatted_as("pandas"):
        # Single row.
        row_frame = dset[0]
        assert row_frame.shape == (1, 1)
        assert row_frame.columns == ["audio"]
        assert row_frame["audio"][0] == undecoded
        # One-row slice.
        slice_frame = dset[:1]
        assert slice_frame.shape == (1, 1)
        assert slice_frame.columns == ["audio"]
        assert slice_frame["audio"][0] == undecoded
        # Whole column.
        values = dset["audio"]
        assert len(values) == 1
        assert values[0] == undecoded
def test_dataset_with_audio_feature_map_is_decoded(shared_datadir):
    """Check that ``map`` callbacks can read ``sampling_rate`` from the audio column, per example and per batch."""
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    dset = Dataset.from_dict(
        {"audio": [audio_path], "text": ["Hello"]},
        features=Features({"audio": Audio(), "text": Value("string")}),
    )

    def process_audio_sampling_rate_by_example(example):
        example["double_sampling_rate"] = 2 * example["audio"]["sampling_rate"]
        return example

    def process_audio_sampling_rate_by_batch(batch):
        batch["double_sampling_rate"] = [2 * audio["sampling_rate"] for audio in batch["audio"]]
        return batch

    # Same expectations for the per-example and the batched callback.
    scenarios = (
        (process_audio_sampling_rate_by_example, {}),
        (process_audio_sampling_rate_by_batch, {"batched": True}),
    )
    for callback, map_kwargs in scenarios:
        decoded_dset = dset.map(callback, **map_kwargs)
        for row in decoded_dset._iter(decoded=False):
            assert row.keys() == {"audio", "text", "double_sampling_rate"}
            assert row["double_sampling_rate"] == 88200
def test_push_dataset_dict_to_hub_name_without_namespace(self):
    """Push a ``DatasetDict`` using only the bare repo name, then verify what landed on the Hub.

    The dataset is pushed with the part of ``ds_name`` after the ``/`` and
    loaded back with the full ``ds_name``. The repository is deleted in the
    ``finally`` clause whether or not the assertions pass.
    """
    ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
    local_ds = DatasetDict({"train": ds})
    ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
    try:
        local_ds.push_to_hub(ds_name.split("/")[-1], token=self._token)
        hub_ds = load_dataset(ds_name, download_mode="force_redownload")

        self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
        self.assertListEqual(list(local_ds["train"].features.keys()), list(hub_ds["train"].features.keys()))
        self.assertDictEqual(local_ds["train"].features, hub_ds["train"].features)

        # Ensure that there is a single file on the repository that has the correct name
        files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset"))
        self.assertListEqual(files, [".gitattributes", "data/train-00000-of-00001.parquet", "dataset_infos.json"])
    finally:
        # Pass the token explicitly, as the other push-to-hub tests do, so the
        # cleanup does not rely on ambient credentials.
        self._api.delete_repo(
            ds_name.split("/")[1],
            organization=ds_name.split("/")[0],
            token=self._token,
            repo_type="dataset",
        )
def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path):
    """Decode an MP3 supplied as bytes from an archive and check item, batch and column access."""
    audio_filename = "test_audio_44100.mp3"
    entries = []
    for member_path, member_file in iter_archive(tar_mp3_path):
        # Only the first archive member is used.
        entries.append({"path": member_path, "bytes": member_file.read()})
        break
    dset = Dataset.from_dict({"audio": entries}, features=Features({"audio": Audio()}))

    def check_decoded(audio):
        assert audio.keys() == {"path", "array", "sampling_rate"}
        assert audio["path"] == audio_filename
        assert audio["array"].shape == (109440,)
        assert audio["sampling_rate"] == 44100

    item = dset[0]
    assert item.keys() == {"audio"}
    check_decoded(item["audio"])

    batch = dset[:1]
    assert batch.keys() == {"audio"}
    assert len(batch["audio"]) == 1
    check_decoded(batch["audio"][0])

    column = dset["audio"]
    assert len(column) == 1
    check_decoded(column[0])
def test_push_dataset_to_hub(self):
    """Push a single ``Dataset`` as the ``train`` split and compare it with what is loaded back.

    The repository is deleted in the ``finally`` clause regardless of the outcome.
    """
    local_ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
    ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
    try:
        local_ds.push_to_hub(ds_name, split="train", token=self._token)
        local_ds_dict = {"train": local_ds}
        hub_ds_dict = load_dataset(ds_name, download_mode="force_redownload")

        self.assertListEqual(list(local_ds_dict.keys()), list(hub_ds_dict.keys()))

        for split_name, expected in local_ds_dict.items():
            actual = hub_ds_dict[split_name]
            self.assertListEqual(expected.column_names, actual.column_names)
            self.assertListEqual(list(expected.features.keys()), list(actual.features.keys()))
            self.assertDictEqual(expected.features, actual.features)
    finally:
        name_parts = ds_name.split("/")
        self._api.delete_repo(
            name_parts[1],
            organization=name_parts[0],
            token=self._token,
            repo_type="dataset",
        )
def test_push_dataset_dict_to_hub_custom_features(self):
    """Push a ``DatasetDict`` with an explicit ``ClassLabel`` feature and compare it with what is loaded back.

    The repository is deleted in the ``finally`` clause regardless of the outcome.
    """
    features = Features({"x": Value("int64"), "y": ClassLabel(names=["neg", "pos"])})
    test_split = Dataset.from_dict({"x": [1, 2, 3], "y": [0, 0, 1]}, features=features)
    local_ds = DatasetDict({"test": test_split})
    ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
    try:
        local_ds.push_to_hub(ds_name, token=self._token)
        hub_ds = load_dataset(ds_name, download_mode="force_redownload")

        self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
        self.assertListEqual(
            list(local_ds["test"].features.keys()),
            list(hub_ds["test"].features.keys()),
        )
        self.assertDictEqual(local_ds["test"].features, hub_ds["test"].features)
    finally:
        name_parts = ds_name.split("/")
        self._api.delete_repo(
            name_parts[1],
            organization=name_parts[0],
            token=self._token,
            repo_type="dataset",
        )
def get_dummy_dataset(self):
    """Build a two-row in-memory dataset with ``id``, ``question`` and ``answers`` columns."""
    columns = {
        "id": ["0", "1"],
        "question": ["foo", "bar"],
        "answers": [["Foo", "Bar"], ["Bar"]],
    }
    return Dataset.from_dict(columns)
def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir):
    """Cast an MP3 audio column to ``Audio(sampling_rate=16000)`` and check item, batch and column access."""
    audio_path = str(shared_datadir / "test_audio_44100.mp3")
    dset = Dataset.from_dict({"audio": [audio_path]}, features=Features({"audio": Audio()}))

    # Before the cast the file's own rate is reported.
    assert dset[0]["audio"]["sampling_rate"] == 44100

    dset = dset.cast_column("audio", Audio(sampling_rate=16000))

    def check_resampled(audio):
        assert audio.keys() == {"path", "array", "sampling_rate"}
        assert audio["path"] == audio_path
        assert audio["array"].shape == (39707,)
        assert audio["sampling_rate"] == 16000

    item = dset[0]
    assert item.keys() == {"audio"}
    check_resampled(item["audio"])

    batch = dset[:1]
    assert batch.keys() == {"audio"}
    assert len(batch["audio"]) == 1
    check_resampled(batch["audio"][0])

    column = dset["audio"]
    assert len(column) == 1
    check_resampled(column[0])
def predict(cls, path_to_model, input):
    """For the input, do the predictions and return them.

    Args:
        path_to_model: Passed to ``cls.get_model`` to obtain the trainer,
            config, data arguments and tokenizer.
        input (a pandas dataframe): The data on which to do the predictions.
            There will be one prediction per row in the dataframe.
            NOTE(review): the value is handed straight to
            ``Dataset.from_dict`` — confirm the expected input type.

    Returns:
        dict: ``"pred"`` holds the decoded predictions when
        ``data_args["task_name"]`` is ``"classif"``, ``"multilabel-classif"``,
        ``"regression"`` or ``"ner"``; ``"proba"`` is added for ``"classif"``
        only. Any other task name yields an empty dict.
    """
    trainer, config, data_args, tokenizer = cls.get_model(path_to_model)
    task_name = data_args["task_name"]

    pred_dataset = Dataset.from_dict(input)
    # TODO: remove the "labels" column before prediction, after checking that
    # it is present in the dataset.
    tokenized_datasets = pred_dataset.map(
        lambda x: preprocess_dataset(
            x,
            tokenizer,
            config.label2id,
            data_args["label_all_tokens"],
            "max_length" if data_args["pad_to_max_length"] else False,
            data_args["use_bbox"],
            data_args["task_name"],
        ),
        batched=True,
        num_proc=data_args["preprocessing_num_workers"],
        load_from_cache_file=not data_args["overwrite_cache"],
    )
    logger.info("Datasets %s", tokenized_datasets)
    logger.info("Column names %s", tokenized_datasets.column_names)
    logger.info("Sample example %s", tokenized_datasets[0])

    # Get predictions
    true_predictions = None
    probas = None
    predictions, labels, _ = trainer.predict(tokenized_datasets, metric_key_prefix="pred")
    if task_name == "classif":
        true_predictions = [config.id2label[p] for p in np.argmax(predictions, axis=1)]
        probas = np.amax(softmax(predictions, axis=1), axis=1).tolist()
    elif task_name == "multilabel-classif":
        predictions = 1 / (1 + np.exp(-predictions))  # sigmoid
        predictions = predictions > 0.5  # threshold
        true_predictions = [[config.id2label[i] for i in np.where(p == 1)[0]] for p in predictions]
    elif task_name == "regression":
        true_predictions = np.squeeze(predictions)
    elif task_name == "ner":
        predictions = np.argmax(predictions, axis=2)
        # Positions whose label is -100 are dropped from the output.
        true_predictions = [
            [config.id2label[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

    logger.info("true_predictions %s", true_predictions)
    result = {}
    if true_predictions is not None:
        result["pred"] = true_predictions
    if probas is not None:
        result["proba"] = probas
    return result
def test_formatted_dataset_with_image_feature(shared_datadir):
    """Check decoded image access (item, batch, column) under the numpy and pandas formats."""
    import PIL.Image

    image_path = str(shared_datadir / "test_image_rgb.jpg")
    dset = Dataset.from_dict(
        {"image": [image_path, image_path]},
        features=Features({"image": Image()}),
    )

    def check_image(img):
        assert os.path.samefile(img.filename, image_path)
        assert img.format == "JPEG"
        assert img.size == (640, 480)
        assert img.mode == "RGB"

    def all_pil_images(images):
        return all(isinstance(img, PIL.Image.Image) for img in images)

    with dset.formatted_as("numpy"):
        item = dset[0]
        assert item.keys() == {"image"}
        assert isinstance(item["image"], PIL.Image.Image)
        check_image(item["image"])

        batch = dset[:1]
        assert batch.keys() == {"image"}
        assert len(batch) == 1
        assert isinstance(batch["image"], list) and all_pil_images(batch["image"])
        check_image(batch["image"][0])

        column = dset["image"]
        assert len(column) == 2
        assert isinstance(column, list) and all_pil_images(column)
        check_image(column[0])

    with dset.formatted_as("pandas"):
        item = dset[0]
        assert item.shape == (1, 1)
        assert item.columns == ["image"]
        assert isinstance(item["image"][0], PIL.Image.Image)
        check_image(item["image"][0])

        batch = dset[:1]
        assert batch.shape == (1, 1)
        assert batch.columns == ["image"]
        assert isinstance(batch["image"], pd.Series) and all_pil_images(batch["image"])
        check_image(batch["image"][0])

        column = dset["image"]
        assert len(column) == 2
        assert isinstance(column, pd.Series) and all_pil_images(column)
        check_image(column[0])
def get_wrong_dataset_TUWS():
    """Return a ``DatasetDict`` whose ``train`` split is the labeled examples plus two hard-coded rows.

    The labeled part comes from ``correct_examples()``; the appended rows
    carry the labels ``-1`` and ``0``.
    """
    labeled, _ = correct_examples()
    unlabeled = Dataset.from_dict({
        'sentence': ['moon what??.', 'I am people'],
        'label': [-1, 0],
    })
    merged_columns = {
        column: labeled[column] + unlabeled[column]
        for column in ('sentence', 'label')
    }
    return DatasetDict({'train': Dataset.from_dict(merged_columns)})
def add_chinese_references(dataset, ref_file):
    """Return a new dataset with the columns of ``dataset`` plus a ``chinese_ref`` column.

    ``ref_file`` is read as UTF-8 with one JSON value per line; empty and
    whitespace-only lines are skipped. The number of parsed values must
    equal ``len(dataset)``.
    """
    with open(ref_file, "r", encoding="utf-8") as f:
        raw_lines = f.read().splitlines()

    refs = []
    for raw in raw_lines:
        if len(raw) > 0 and not raw.isspace():
            refs.append(json.loads(raw))
    assert len(dataset) == len(refs)

    columns = {name: dataset[name] for name in dataset.column_names}
    columns["chinese_ref"] = refs
    return Dataset.from_dict(columns)
def load_data(
    self,
    inputs: List[str],
    targets: Optional[List[Any]] = None,
    target_formatter: Optional[TargetFormatter] = None,
) -> Dataset:
    """Wrap ``inputs`` (and ``targets``, when given) in a ``Dataset`` and delegate to the parent ``load_data``.

    The columns are keyed by ``DataKeys.INPUT`` and ``DataKeys.TARGET``; the
    target column is only created when ``targets`` is not ``None``.
    """
    columns = {DataKeys.INPUT: inputs}
    if targets is not None:
        columns[DataKeys.TARGET] = targets
    hf_dataset = Dataset.from_dict(columns)
    return super().load_data(
        hf_dataset,
        DataKeys.INPUT,
        DataKeys.TARGET,
        target_formatter=target_formatter,
    )
def get_dataset():
    """Build a three-row in-memory dataset with ``repo_name``, ``path`` and ``content`` columns."""
    return Dataset.from_dict({
        "repo_name": ["test_repo1", "test_repo2", "test_repo3"],
        "path": ["test_1.py", "test_2.py", "unit_test.py"],
        "content": ["a " * 20, "a " * 30, "b " * 7],
    })
def test_dataset_with_image_feature_map(shared_datadir):
    """Run ``map`` over a dataset with an ``Image`` column and check the rows obtained by iteration."""
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    undecoded_image = {"path": image_path, "bytes": None}
    dset = Dataset.from_dict(
        {"image": [image_path], "caption": ["cats sleeping"]},
        features=Features({"image": Image(), "caption": Value("string")}),
    )

    for row in dset:
        assert row.keys() == {"image", "caption"}
        assert row == {"image": undecoded_image, "caption": "cats sleeping"}

    # no decoding
    def process_caption(example):
        example["caption"] = "Two " + example["caption"]
        return example

    processed_dset = dset.map(process_caption)
    for row in processed_dset:
        assert row.keys() == {"image", "caption"}
        assert row == {"image": undecoded_image, "caption": "Two cats sleeping"}

    # decoding example
    def process_image_by_example(example):
        example["mode"] = example["image"].mode
        return example

    # decoding batch
    def process_image_by_batch(batch):
        batch["mode"] = [image.mode for image in batch["image"]]
        return batch

    # Both decoding callbacks must yield the same rows.
    scenarios = (
        (process_image_by_example, {}),
        (process_image_by_batch, {"batched": True}),
    )
    for callback, map_kwargs in scenarios:
        decoded_dset = dset.map(callback, **map_kwargs)
        for row in decoded_dset:
            assert row.keys() == {"image", "caption", "mode"}
            assert os.path.samefile(row["image"]["path"], image_path)
            assert row["caption"] == "cats sleeping"
            assert row["mode"] == "RGB"
def test_dataset_concatenate_nested_image_features(shared_datadir):
    """Concatenate two datasets with nested ``Image`` features, one given by path and one by raw bytes."""
    # we use a different data structure between 1 and 2 to make sure they are compatible with each other
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    features = Features({"list_of_structs_of_images": [{"image": Image()}]})

    data1 = {"list_of_structs_of_images": [[{"image": image_path}]]}
    dset1 = Dataset.from_dict(data1, features=features)

    # Read the bytes inside a context manager so the file handle is closed
    # instead of being left to the garbage collector.
    with open(image_path, "rb") as image_file:
        image_bytes = image_file.read()
    data2 = {"list_of_structs_of_images": [[{"image": {"bytes": image_bytes}}]]}
    dset2 = Dataset.from_dict(data2, features=features)

    concatenated_dataset = concatenate_datasets([dset1, dset2])
    assert len(concatenated_dataset) == len(dset1) + len(dset2)
    assert (
        concatenated_dataset[0]["list_of_structs_of_images"][0]["image"]
        == dset1[0]["list_of_structs_of_images"][0]["image"]
    )
    assert (
        concatenated_dataset[1]["list_of_structs_of_images"][0]["image"]
        == dset2[0]["list_of_structs_of_images"][0]["image"]
    )
def test_dataset_cast_to_audio_features(shared_datadir, build_data):
    """Check that both ``cast`` and ``cast_column`` to ``Audio()`` give a decoded first item."""
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    dset = Dataset.from_dict(build_data(audio_path))

    casts = (
        lambda ds: ds.cast(Features({"audio": Audio()})),
        lambda ds: ds.cast_column("audio", Audio()),
    )
    for apply_cast in casts:
        first = apply_cast(dset)[0]
        assert first.keys() == {"audio"}
        assert first["audio"].keys() == {"path", "array", "sampling_rate"}
def load_dataset_from_disk(args):
    """Build a ``DatasetDict`` from the JSON files named in ``args``.

    Splits produced, depending on the flags read from ``args``:

    * ``train`` and ``dev`` when ``args.train_file`` is set and ``args.train``
      is truthy (the file content is split by ``shuffle_data``).
    * ``dev`` only when ``args.eval`` is truthy and ``args.train`` is not.
    * ``predict`` when ``args.predict_file`` is set and ``args.predict`` is
      truthy.

    Returns:
        DatasetDict: the splits that were requested; empty if none applied.
    """

    def _read_json(path):
        # Close the file deterministically instead of leaking the handle.
        with open(path) as handle:
            return json.load(handle)

    datas = DatasetDict()
    if args.train_file is not None and args.train:
        train_json, dev_json = shuffle_data(_read_json(args.train_file), args.split)
        datas["train"] = Dataset.from_dict(convert_data_structure(train_json))
        datas["dev"] = Dataset.from_dict(convert_data_structure(dev_json))
        logger.info(f"Load train data number: {datas['train'].num_rows}")
        logger.info(f"Load dev data number: {datas['dev'].num_rows}")
    if args.eval and not args.train:
        # NOTE(review): unlike the branch above, args.train_file is not
        # checked for None here — confirm callers always supply it.
        train_json, dev_json = shuffle_data(_read_json(args.train_file), args.split)
        datas['dev'] = Dataset.from_dict(convert_data_structure(dev_json))
        logger.info(f"Load dev data number: {datas['dev'].num_rows}")
    if args.predict_file is not None and args.predict:
        predict_json = _read_json(args.predict_file)
        datas["predict"] = Dataset.from_dict(convert_data_structure(predict_json, ispredict=True))
        logger.info(f"Load predict data number: {datas['predict'].num_rows}")
    return datas
def get_dummy_dataset(self):
    """Build a two-row dataset with an ``embeddings`` column and attach a flat inner-product faiss index to it."""
    dim = self.retrieval_vector_size
    columns = {
        "id": ["0", "1"],
        "text": ["foo", "bar"],
        "title": ["Foo", "Bar"],
        "embeddings": [np.ones(dim), 2 * np.ones(dim)],
    }
    dummy = Dataset.from_dict(columns)
    dummy.add_faiss_index("embeddings", string_factory="Flat", metric_type=faiss.METRIC_INNER_PRODUCT)
    return dummy
def load_data(self, data: str, dataset: Optional[Any] = None) -> "datasets.Dataset":
    """Read a question-answering JSON file and return the tokenized dataset for the running stage.

    The file is expected to be laid out as
    ``data -> paragraphs -> qas -> answers``; one row is produced per entry
    in ``qas``, with all of its answers collected into a single
    ``{"text": [...], "answer_start": [...]}`` dict.

    Args:
        data: Path of the JSON file to read.
        dataset: Not used by this method.

    Returns:
        The split keyed by ``self._running_stage.value`` after
        ``self._tokenize_fn`` has been mapped over it in batches and the
        original columns have been removed.
    """
    stage = self._running_stage.value

    with open(Path(data), "rb") as f:
        squad_v_2_dict = json.load(f)

    ids = []
    titles = []
    contexts = []
    questions = []
    answers = []
    for topic in squad_v_2_dict["data"]:
        title = topic["title"]
        for comprehension in topic["paragraphs"]:
            context = comprehension["context"]
            for qa in comprehension["qas"]:
                # Named ``question_id`` so the builtin ``id`` is not shadowed.
                question_id = qa["id"]
                answer_starts = [answer["answer_start"] for answer in qa["answers"]]
                answer_texts = [answer["text"] for answer in qa["answers"]]

                ids.append(question_id)
                titles.append(title)
                contexts.append(context)
                questions.append(qa["question"])
                answers.append(dict(text=answer_texts, answer_start=answer_starts))

    dataset_dict = DatasetDict({
        stage: Dataset.from_dict({
            "id": ids,
            "title": titles,
            "context": contexts,
            "question": questions,
            "answer": answers,
        })
    })

    column_names = dataset_dict[stage].column_names
    dataset_dict = dataset_dict.map(self._tokenize_fn, batched=True, remove_columns=column_names)
    return dataset_dict[stage]
def test_dataset_cast_to_image_features(shared_datadir, build_data):
    """Check that both ``cast`` and ``cast_column`` to ``Image()`` give a PIL image as first item."""
    import PIL.Image

    image_path = str(shared_datadir / "test_image_rgb.jpg")
    dset = Dataset.from_dict(build_data(image_path))

    casts = (
        lambda ds: ds.cast(Features({"image": Image()})),
        lambda ds: ds.cast_column("image", Image()),
    )
    for apply_cast in casts:
        first = apply_cast(dset)[0]
        assert first.keys() == {"image"}
        assert isinstance(first["image"], PIL.Image.Image)
def test_dataset_concatenate_nested_audio_features(shared_datadir):
    """Concatenate two datasets with nested ``Audio`` features, one given by path and one by raw bytes."""
    # we use a different data structure between 1 and 2 to make sure they are compatible with each other
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    features = Features({"list_of_structs_of_audios": [{"audio": Audio()}]})

    data1 = {"list_of_structs_of_audios": [[{"audio": audio_path}]]}
    dset1 = Dataset.from_dict(data1, features=features)

    # Read the bytes inside a context manager so the file handle is closed
    # instead of being left to the garbage collector.
    with open(audio_path, "rb") as audio_file:
        audio_bytes = audio_file.read()
    data2 = {"list_of_structs_of_audios": [[{"audio": {"bytes": audio_bytes}}]]}
    dset2 = Dataset.from_dict(data2, features=features)

    concatenated_dataset = concatenate_datasets([dset1, dset2])
    assert len(concatenated_dataset) == len(dset1) + len(dset2)
    assert (
        concatenated_dataset[0]["list_of_structs_of_audios"][0]["audio"]["array"].shape
        == dset1[0]["list_of_structs_of_audios"][0]["audio"]["array"].shape
    )
    assert (
        concatenated_dataset[1]["list_of_structs_of_audios"][0]["audio"]["array"].shape
        == dset2[0]["list_of_structs_of_audios"][0]["audio"]["array"].shape
    )
def load_data(
    self,
    data: Dict[str, Any],
    question_column_name: str = "question",
    context_column_name: str = "context",
    answer_column_name: str = "answer",
) -> Dataset:
    """Convert ``data`` to a ``Dataset`` and delegate to the parent ``load_data`` with the given column names."""
    hf_dataset = Dataset.from_dict(data)
    column_kwargs = {
        "question_column_name": question_column_name,
        "context_column_name": context_column_name,
        "answer_column_name": answer_column_name,
    }
    return super().load_data(hf_dataset, **column_kwargs)
def test_enable_disable_progress_bar():
    """Check that ``map`` calls the patched ``tqdm.auto.tqdm`` only while progress bars are enabled."""
    dset = Dataset.from_dict({"col_1": [3, 2, 0, 1]})

    def add_col_2(example):
        return {"col_2": example["col_1"] + 1}

    with patch("tqdm.auto.tqdm") as mock_tqdm:
        # Disabled: mapping must not touch tqdm.
        datasets.disable_progress_bar()
        dset.map(add_col_2)
        mock_tqdm.assert_not_called()

        mock_tqdm.reset_mock()

        # Enabled again: mapping must call tqdm.
        datasets.enable_progress_bar()
        dset.map(add_col_2)
        mock_tqdm.assert_called()
def test_push_dataset_to_hub_custom_splits(self):
    """Push a ``Dataset`` under the split name ``random`` and compare it with what is loaded back.

    ``self.cleanup_repo`` is called in the ``finally`` clause regardless of the outcome.
    """
    ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
    ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
    try:
        ds.push_to_hub(ds_name, split="random", token=self._token)
        hub_ds = load_dataset(ds_name, download_mode="force_redownload")

        self.assertListEqual(ds.column_names, hub_ds["random"].column_names)
        self.assertListEqual(
            list(ds.features.keys()),
            list(hub_ds["random"].features.keys()),
        )
        self.assertDictEqual(ds.features, hub_ds["random"].features)
    finally:
        self.cleanup_repo(ds_name)
def load_data(self, data: Any, columns: List[str] = None) -> "datasets.Dataset":
    """Convert ``data`` to a ``Dataset`` and return it tokenized for the running stage.

    ``self._tokenize_fn`` is mapped in batches over a one-split
    ``DatasetDict`` keyed by ``self._running_stage.value``, and the original
    columns are removed. ``columns`` is not used by this method.
    """
    stage = self._running_stage.value
    wrapped = DatasetDict({stage: Dataset.from_dict(data)})
    original_columns = wrapped[stage].column_names
    tokenized = wrapped.map(
        self._tokenize_fn,
        batched=True,
        remove_columns=original_columns,
    )
    return tokenized[stage]
def get_correct_dataset_TUWS(wrong_key=False):
    """Return a ``DatasetDict`` combining the labeled and unlabeled examples from ``correct_examples()``.

    The single split is named ``train`` when ``wrong_key`` is exactly
    ``False`` and ``training_Data`` otherwise.
    """
    labeled, unlabeled = correct_examples()
    merged_columns = {
        column: labeled[column] + unlabeled[column]
        for column in ('sentence', 'label')
    }
    train_split = Dataset.from_dict(merged_columns)
    split_name = 'train' if wrong_key is False else 'training_Data'
    return DatasetDict({split_name: train_split})
def test_dataset_with_image_feature_undecoded(shared_datadir):
    """Check that an ``Image(decode=False)`` column is returned as a path/bytes dict for item, batch and column access."""
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    undecoded = {"path": image_path, "bytes": None}
    dset = Dataset.from_dict(
        {"image": [image_path]},
        features=Features({"image": Image(decode=False)}),
    )

    # Single row.
    row = dset[0]
    assert row.keys() == {"image"}
    assert row["image"] == undecoded

    # One-row slice.
    rows = dset[:1]
    assert rows.keys() == {"image"}
    assert len(rows["image"]) == 1
    assert rows["image"][0] == undecoded

    # Whole column.
    values = dset["image"]
    assert len(values) == 1
    assert values[0] == undecoded
def test_push_dataset_to_hub_custom_features(self):
    """Push a ``Dataset`` with an explicit ``ClassLabel`` feature and compare it with the loaded ``train`` split.

    ``self.cleanup_repo`` is called in the ``finally`` clause regardless of the outcome.
    """
    ds = Dataset.from_dict(
        {"x": [1, 2, 3], "y": [0, 0, 1]},
        features=Features({"x": Value("int64"), "y": ClassLabel(names=["neg", "pos"])}),
    )
    ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
    try:
        ds.push_to_hub(ds_name, token=self._token)
        hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload")

        self.assertListEqual(ds.column_names, hub_ds.column_names)
        self.assertListEqual(list(ds.features.keys()), list(hub_ds.features.keys()))
        self.assertDictEqual(ds.features, hub_ds.features)
        # The row contents must round-trip as well.
        self.assertEqual(ds[:], hub_ds[:])
    finally:
        self.cleanup_repo(ds_name)
def test_dataset_with_image_feature_map_undecoded(shared_datadir):
    """Check that ``map`` callbacks see the path/bytes dict when the column is ``Image(decode=False)``."""
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    undecoded = {"path": image_path, "bytes": None}
    dset = Dataset.from_dict(
        {"image": [image_path]},
        features=Features({"image": Image(decode=False)}),
    )

    def assert_image_example_undecoded(example):
        assert example["image"] == undecoded

    def assert_image_batch_undecoded(batch):
        for image in batch["image"]:
            assert image == undecoded

    dset.map(assert_image_example_undecoded)
    dset.map(assert_image_batch_undecoded, batched=True)