# Imports shared by the tests and fixtures below.
# SAMPLE_QUESTION_ANSWERING_EXTRACTIVE, DATA_DICT_OF_LISTS and check_labels
# are defined elsewhere in the suite.
from copy import deepcopy

import numpy as np
import pytest

from datasets import ClassLabel, Dataset, DatasetDict, DatasetInfo, Features, Sequence, Value
from datasets.tasks import QuestionAnsweringExtractive, TextClassification


def test_dataset_feature_with_none(feature):
    # `feature` is expected to be supplied via pytest parametrization
    data = {"col": [None]}
    features = Features({"col": feature})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"col"}
    assert item["col"] is None
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"col"}
    assert isinstance(batch["col"], list) and all(item is None for item in batch["col"])
    column = dset["col"]
    assert len(column) == 1
    assert isinstance(column, list) and all(item is None for item in column)

    # same checks for the feature nested inside a Sequence
    data = {"col": [[None]]}
    features = Features({"col": Sequence(feature)})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"col"}
    assert all(i is None for i in item["col"])

    # ... and nested inside a plain dict
    data = {"nested": [{"col": None}]}
    features = Features({"nested": {"col": feature}})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"nested"}
    assert item["nested"].keys() == {"col"}
    assert item["nested"]["col"] is None
def test_from_arrow_schema_with_sequence(self):
    data = {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10}
    original_features = Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")})
    dset = Dataset.from_dict(data, features=original_features)
    new_features = dset.features
    new_dset = Dataset.from_dict(data, features=new_features)
    self.assertEqual(original_features.type, new_features.type)
    self.assertDictEqual(dset[0], new_dset[0])
    self.assertDictEqual(dset[:], new_dset[:])
def _create_dummy_dataset(self, multiple_columns=False):
    if multiple_columns:
        data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}
        dset = Dataset.from_dict(data)
    else:
        dset = Dataset.from_dict({"filename": [f"my_name-train_{x:03d}" for x in range(30)]})
    return dset
def test_align_labels_with_mapping(self):
    train_features = Features(
        {
            "input_text": Value("string"),
            "input_labels": ClassLabel(num_classes=3, names=["entailment", "neutral", "contradiction"]),
        }
    )
    test_features = Features(
        {
            "input_text": Value("string"),
            "input_labels": ClassLabel(num_classes=3, names=["entailment", "contradiction", "neutral"]),
        }
    )
    train_data = {"input_text": ["a", "a", "b", "b", "c", "c"], "input_labels": [0, 0, 1, 1, 2, 2]}
    test_data = {"input_text": ["a", "a", "c", "c", "b", "b"], "input_labels": [0, 0, 1, 1, 2, 2]}
    label2id = {"CONTRADICTION": 0, "ENTAILMENT": 2, "NEUTRAL": 1}
    id2label = {v: k for k, v in label2id.items()}
    train_expected_labels = [2, 2, 1, 1, 0, 0]
    test_expected_labels = [2, 2, 0, 0, 1, 1]
    train_expected_label_names = [id2label[idx] for idx in train_expected_labels]
    test_expected_label_names = [id2label[idx] for idx in test_expected_labels]
    dsets = DatasetDict(
        {
            "train": Dataset.from_dict(train_data, features=train_features),
            "test": Dataset.from_dict(test_data, features=test_features),
        }
    )
    dsets = dsets.align_labels_with_mapping(label2id, "input_labels")
    self.assertListEqual(train_expected_labels, dsets["train"]["input_labels"])
    self.assertListEqual(test_expected_labels, dsets["test"]["input_labels"])
    train_aligned_label_names = [
        dsets["train"].features["input_labels"].int2str(idx) for idx in dsets["train"]["input_labels"]
    ]
    test_aligned_label_names = [
        dsets["test"].features["input_labels"].int2str(idx) for idx in dsets["test"]["input_labels"]
    ]
    self.assertListEqual(train_expected_label_names, train_aligned_label_names)
    self.assertListEqual(test_expected_label_names, test_aligned_label_names)
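# A minimal sketch (not part of the original suite) of the same API on a single
# Dataset: align_labels_with_mapping remaps the stored label ids so they agree
# with an external `label2id`, e.g. one taken from a model config.
def test_align_labels_with_mapping_single_dataset():
    ds = Dataset.from_dict(
        {"text": ["good", "bad"], "label": [1, 0]},
        features=Features({"text": Value("string"), "label": ClassLabel(names=["neg", "pos"])}),
    )
    ds = ds.align_labels_with_mapping({"NEG": 1, "POS": 0}, "label")
    assert ds["label"] == [0, 1]  # "pos" -> 0, "neg" -> 1 under the new mapping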
def test_flatten(self):
    dset_split = Dataset.from_dict(
        {"a": [{"b": {"c": ["text"]}}] * 10, "foo": [1] * 10},
        features=Features({"a": {"b": Sequence({"c": Value("string")})}, "foo": Value("int64")}),
    )
    dset = DatasetDict({"train": dset_split, "test": dset_split})
    dset = dset.flatten()
    self.assertDictEqual(dset.column_names, {"train": ["a.b.c", "foo"], "test": ["a.b.c", "foo"]})
    self.assertListEqual(sorted(dset["train"].features.keys()), ["a.b.c", "foo"])
    self.assertDictEqual(
        dset["train"].features, Features({"a.b.c": Sequence(Value("string")), "foo": Value("int64")})
    )
    del dset
@pytest.fixture
def dataset():
    n = 10
    features = Features(
        {
            "tokens": Sequence(Value("string")),
            "labels": Sequence(ClassLabel(names=["negative", "positive"])),
            "answers": Sequence(
                {
                    "text": Value("string"),
                    "answer_start": Value("int32"),
                }
            ),
            "id": Value("int64"),
        }
    )
    dataset = Dataset.from_dict(
        {
            "tokens": [["foo"] * 5] * n,
            "labels": [[1] * 5] * n,
            "answers": [{"answer_start": [97], "text": ["1976"]}] * n,
            "id": list(range(n)),
        },
        features=features,
    )
    return dataset
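# A minimal sketch (hypothetical test name) of how pytest consumes the fixture
# above: the `dataset` argument is resolved to the fixture's return value.
def test_dataset_fixture_is_populated(dataset):
    assert len(dataset) == 10
    assert dataset[0]["id"] == 0
    assert dataset[0]["tokens"] == ["foo"] * 5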
def test_map_on_task_template(self):
    info = DatasetInfo(task_templates=QuestionAnsweringExtractive())
    dataset = Dataset.from_dict({k: [v] for k, v in SAMPLE_QUESTION_ANSWERING_EXTRACTIVE.items()}, info=info)
    assert isinstance(dataset.info.task_templates, list)
    assert len(dataset.info.task_templates) == 1

    def keep_task(x):
        return x

    def dont_keep_task(x):
        out = deepcopy(SAMPLE_QUESTION_ANSWERING_EXTRACTIVE)
        out["answers"]["foobar"] = 0
        return out

    mapped_dataset = dataset.map(keep_task)
    assert mapped_dataset.info.task_templates == dataset.info.task_templates
    # reload from cache
    mapped_dataset = dataset.map(keep_task)
    assert mapped_dataset.info.task_templates == dataset.info.task_templates

    mapped_dataset = dataset.map(dont_keep_task)
    assert mapped_dataset.info.task_templates == []
    # reload from cache
    mapped_dataset = dataset.map(dont_keep_task)
    assert mapped_dataset.info.task_templates == []
def convert_glue_samples_to_hf_dataset(samples, glue_labels, is_test=False):
    idx_list = []
    text_a_list = []
    text_b_list = []
    labels_list = []
    for guid, text_a, text_b, label in tqdm(samples):
        idx_list.append(guid)
        text_a_list.append(text_a)
        text_b_list.append(text_b)
        if not is_test:
            labels_list.append(label)
    data_dict = {
        'idx': idx_list,
        'sentence1': text_a_list,
    }
    # single-sentence tasks leave text_b empty, so only add the column if it
    # actually holds data
    if any(text_b_list):
        data_dict['sentence2'] = text_b_list
    if not is_test:
        check_labels(glue_labels, labels_list)
        data_dict['label'] = labels_list
    dataset = ArrowDataset.from_dict(data_dict)
    return dataset
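# Hypothetical usage (illustrative values only): `samples` are the
# (guid, text_a, text_b, label) tuples produced by this repo's GLUE reader;
# `check_labels` and `ArrowDataset` are defined elsewhere in it.
samples = [
    ("0", "A man is eating.", "Someone is eating.", "entailment"),
    ("1", "A man is sleeping.", "Someone is eating.", "contradiction"),
]
ds = convert_glue_samples_to_hf_dataset(samples, glue_labels=["entailment", "neutral", "contradiction"])
print(ds.column_names)  # expected: ['idx', 'sentence1', 'sentence2', 'label']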
def test_remove_and_map_on_task_template(self):
    features = Features({"text": Value("string"), "label": ClassLabel(names=("pos", "neg"))})
    task_templates = TextClassification(text_column="text", label_column="label")
    info = DatasetInfo(features=features, task_templates=task_templates)
    dataset = Dataset.from_dict({"text": ["A sentence."], "label": ["pos"]}, info=info)

    def process(example):
        return example

    # dropping a column referenced by the task template invalidates it
    modified_dataset = dataset.remove_columns("label")
    mapped_dataset = modified_dataset.map(process)
    assert mapped_dataset.info.task_templates == []
@pytest.fixture
def arrow_path(tmp_path_factory):
    dataset = Dataset.from_dict(DATA_DICT_OF_LISTS)
    path = str(tmp_path_factory.mktemp("data") / "dataset.arrow")
    # map() with no function is an identity pass that writes the dataset
    # to the given cache file
    dataset.map(cache_file_name=path)
    return path
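# A minimal sketch (not part of the original suite): the cache file written by
# the fixture above is a plain Arrow file, so it can be memory-mapped back in
# with Dataset.from_file.
def test_arrow_path_roundtrip(arrow_path):
    reloaded = Dataset.from_file(arrow_path)
    assert reloaded.column_names == list(DATA_DICT_OF_LISTS)
    assert len(reloaded) == len(next(iter(DATA_DICT_OF_LISTS.values())))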
def _create_dummy_dataset(self):
    dset = Dataset.from_dict({"filename": [f"my_name-train_{x}" for x in range(30)]})
    return dset
@pytest.fixture
def dataset():
    features = Features(
        {"tokens": Sequence(Value("string")), "labels": Sequence(ClassLabel(names=["negative", "positive"]))}
    )
    dataset = Dataset.from_dict({"tokens": [["foo"] * 5] * 10, "labels": [[1] * 5] * 10}, features=features)
    return dataset