Example No. 1
    def test_remove_and_map_on_task_template(self):
        features = Features({"text": Value("string"), "label": ClassLabel(names=("pos", "neg"))})
        task_templates = TextClassification(text_column="text", label_column="label")
        info = DatasetInfo(features=features, task_templates=task_templates)
        dataset = Dataset.from_dict({"text": ["A sentence."], "label": ["pos"]}, info=info)

        def process(example):
            return example

        modified_dataset = dataset.remove_columns("label")
        mapped_dataset = modified_dataset.map(process)
        assert mapped_dataset.info.task_templates == []
Example No. 2
 def test_from_dict(self):
     input_schema = Features({"image_file_path": Value("string")})
     label_schema = Features(
         {"labels": ClassLabel(names=tuple(self.labels))})
     template_dict = {
         "image_file_path_column": "input_image_file_path",
         "label_column": "input_label",
         "labels": self.labels,
     }
     task = ImageClassification.from_dict(template_dict)
     self.assertEqual("image-classification", task.task)
     self.assertEqual(input_schema, task.input_schema)
     self.assertEqual(label_schema, task.label_schema)
Example No. 3
 def test_from_dict(self):
     input_schema = Features({"text": Value("string")})
     # Labels are cast to tuple during `TextClassification.__post_init__`, so we do the same here
     label_schema = Features(
         {"labels": ClassLabel(names=tuple(self.labels))})
     template_dict = {
         "text_column": "input_text",
         "label_column": "input_labels",
         "labels": self.labels
     }
     task = TextClassification.from_dict(template_dict)
     self.assertEqual("text-classification", task.task)
     self.assertEqual(input_schema, task.input_schema)
     self.assertEqual(label_schema, task.label_schema)
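Beyond `from_dict`, a template is usually attached to a dataset through `DatasetInfo` and then applied with `prepare_for_task`. The sketch below is a minimal, hedged illustration for datasets versions that still ship task templates; the "review"/"sentiment" column names and the sample row are made up for the example.

from datasets import ClassLabel, Dataset, DatasetInfo, Features, Value
from datasets.tasks import TextClassification

features = Features({"review": Value("string"), "sentiment": ClassLabel(names=["neg", "pos"])})
template = TextClassification(text_column="review", label_column="sentiment")
ds = Dataset.from_dict(
    {"review": ["A sentence."], "sentiment": ["pos"]},
    info=DatasetInfo(features=features, task_templates=[template]),
)
# prepare_for_task casts/renames the columns declared by the template
ds = ds.prepare_for_task("text-classification")
print(ds.column_names)  # expected: ["text", "labels"]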
Example No. 4
def test_interleave_datasets_with_features(dataset: IterableDataset, generate_examples_fn):
    features = Features(
        {
            "id": Value("int64"),
            "label": ClassLabel(names=["negative", "positive"]),
        }
    )
    ex_iterable = ExamplesIterable(generate_examples_fn, {"label": 0})
    dataset_with_features = IterableDataset(ex_iterable, info=DatasetInfo(features=features))

    merged_dataset = interleave_datasets([dataset, dataset_with_features], probabilities=[0, 1])
    assert isinstance(merged_dataset._ex_iterable, CyclingMultiSourcesExamplesIterable)
    assert isinstance(merged_dataset._ex_iterable.ex_iterables[1], TypedExamplesIterable)
    assert merged_dataset._ex_iterable.ex_iterables[1].features == features
    assert next(iter(merged_dataset)) == next(iter(dataset_with_features))
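The same `interleave_datasets` entry point also works on regular (map-style) datasets; a small hedged sketch with made-up data:

from datasets import Dataset, interleave_datasets

d1 = Dataset.from_dict({"id": [0, 1, 2]})
d2 = Dataset.from_dict({"id": [10, 11, 12]})
# sample rows from the two sources according to the given probabilities
mixed = interleave_datasets([d1, d2], probabilities=[0.5, 0.5], seed=42)
print(mixed["id"])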
Example No. 5
def test_encode_batch_with_example_with_empty_first_elem():
    features = Features(
        {
            "x": Sequence(Sequence(ClassLabel(names=["a", "b"]))),
        }
    )
    encoded_batch = features.encode_batch(
        {
            "x": [
                [["a"], ["b"]],
                [[], ["b"]],
            ]
        }
    )
    assert encoded_batch == {"x": [[[0], [1]], [[], [1]]]}
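For a single example rather than a batch, the analogous call is `Features.encode_example`; a minimal sketch of the same nested `Sequence(Sequence(ClassLabel))` layout:

from datasets import ClassLabel, Features, Sequence

features = Features({"x": Sequence(Sequence(ClassLabel(names=["a", "b"])))})
# string class names are encoded to their integer ids, empty inner lists stay empty
assert features.encode_example({"x": [["a", "b"], []]}) == {"x": [[0, 1], []]}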
Example No. 6
def test_write_with_features():
    output = pa.BufferOutputStream()
    features = Features({"labels": ClassLabel(names=["neg", "pos"])})
    with ArrowWriter(stream=output, features=features) as writer:
        writer.write({"labels": 0})
        writer.write({"labels": 1})
        num_examples, num_bytes = writer.finalize()
    assert num_examples == 2
    assert num_bytes > 0
    assert writer._schema == features.arrow_schema
    assert writer._schema.metadata == features.arrow_schema.metadata
    stream = pa.BufferReader(output.getvalue())
    f = pa.ipc.open_stream(stream)
    pa_table: pa.Table = f.read_all()
    schema = pa_table.schema
    assert pa_table.num_rows == 2
    assert schema == features.arrow_schema
    assert schema.metadata == features.arrow_schema.metadata
    assert features == Features.from_arrow_schema(schema)
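The schema metadata checked above is what lets the `ClassLabel` names survive the Arrow round trip; a short self-contained sketch:

from datasets import ClassLabel, Features

features = Features({"labels": ClassLabel(names=["neg", "pos"])})
recovered = Features.from_arrow_schema(features.arrow_schema)
assert recovered["labels"].names == ["neg", "pos"]
assert recovered["labels"].int2str(1) == "pos"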
Example No. 7
 def __init__(self):
     super(BinarySentiment, self).__init__(
         num_classes=2,
         input_schema=Schema(
             features=OrderedDict([
                 ("text", Value(dtype="string")),
             ]),
             grounding_candidates={
                 "text": {"text", "sentence"},
             },
         ),
         output_schema=Schema(
             features=OrderedDict([
                 ("label", ClassLabel(names=["negative", "positive"])),
             ]),
             grounding_candidates={
                 "label": {"label"},
             },
         ),
         identifier=self.__class__.__name__,
     )
Example No. 8
def test_classlabel_init(tmp_path_factory):
    names = ["negative", "positive"]
    names_file = str(tmp_path_factory.mktemp("features") / "labels.txt")
    with open(names_file, "w", encoding="utf-8") as f:
        f.write("\n".join(names))
    classlabel = ClassLabel(names=names)
    assert classlabel.names == names and classlabel.num_classes == len(names)
    classlabel = ClassLabel(names_file=names_file)
    assert classlabel.names == names and classlabel.num_classes == len(names)
    classlabel = ClassLabel(num_classes=len(names), names=names)
    assert classlabel.names == names and classlabel.num_classes == len(names)
    classlabel = ClassLabel(num_classes=len(names))
    assert classlabel.names == [str(i) for i in range(len(names))] and classlabel.num_classes == len(names)
    with pytest.raises(ValueError):
        classlabel = ClassLabel(num_classes=len(names) + 1, names=names)
    with pytest.raises(ValueError):
        classlabel = ClassLabel(names=names, names_file=names_file)
    with pytest.raises(ValueError):
        classlabel = ClassLabel()
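Once constructed, the label/id mapping is available through `str2int`, `int2str`, and `encode_example`; a minimal sketch:

from datasets import ClassLabel

classlabel = ClassLabel(names=["negative", "positive"])
assert classlabel.str2int("positive") == 1
assert classlabel.int2str(0) == "negative"
# encode_example accepts either the class name or an integer id in range
assert classlabel.encode_example("negative") == 0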
Example No. 9
    def test_features(self):
        n_rows = 10
        n_cols = 3

        def get_features(type):
            return Features({str(i): type for i in range(n_cols)})

        with tempfile.TemporaryDirectory() as tmp_dir:
            open(os.path.join(tmp_dir, "table.csv"), "w", encoding="utf-8").write(
                "\n".join(",".join([str(i) for i in range(n_cols)]) for _ in range(n_rows + 1))
            )
            for type in [Value("float64"), Value("int8"), ClassLabel(num_classes=n_cols)]:
                features = get_features(type)
                ds = load_dataset(
                    "csv",
                    data_files=os.path.join(tmp_dir, "table.csv"),
                    cache_dir=tmp_dir,
                    split="train",
                    features=features,
                )
                self.assertEqual(len(ds), n_rows)
                self.assertDictEqual(ds.features, features)
                del ds
Example No. 10
def dataset():
    n = 10
    features = Features(
        {
            "tokens": Sequence(Value("string")),
            "labels": Sequence(ClassLabel(names=["negative", "positive"])),
            "answers": Sequence(
                {
                    "text": Value("string"),
                    "answer_start": Value("int32"),
                }
            ),
        }
    )
    dataset = Dataset.from_dict(
        {
            "tokens": [["foo"] * 5] * n,
            "labels": [[1] * 5] * n,
            "answers": [{"answer_start": [97], "text": ["1976"]}] * 10,
        },
        features=features,
    )
    return dataset
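With `Sequence(ClassLabel(...))` columns like `labels` above, the inner `ClassLabel` sits on the Sequence's `.feature` attribute; a short sketch:

from datasets import ClassLabel, Features, Sequence

features = Features({"labels": Sequence(ClassLabel(names=["negative", "positive"]))})
inner = features["labels"].feature  # the ClassLabel wrapped by the Sequence
assert inner.int2str(1) == "positive"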
Example No. 11
 def __init__(self):
     super(BinaryNaturalLanguageInference, self).__init__(
         num_classes=2,
         input_schema=Schema(
             features=OrderedDict([
                 ("premise", Value(dtype="string")),
                 ("hypothesis", Value(dtype="string")),
             ]),
             grounding_candidates={
                 "premise": {"premise", "sentence1"},
                 "hypothesis": {"hypothesis", "sentence2"},
             },
         ),
         output_schema=Schema(
             features=OrderedDict([
                 ("label",
                  ClassLabel(names=["entailment", "non entailment"])),
             ]),
             grounding_candidates={
                 "label": {"label"},
             },
         ),
         identifier=self.__class__.__name__,
     )

Example No. 12
@pytest.mark.parametrize(
    "features",
    [
        None,
        Features(
            {
                "id": Value("int64"),
                "label": Value("int64"),
            }
        ),
        Features(
            {
                "id": Value("int64"),
                "label": ClassLabel(names=["negative", "positive"]),
            }
        ),
    ],
)
def test_iterable_dataset_features(generate_examples_fn, features):
    ex_iterable = ExamplesIterable(generate_examples_fn, {"label": 0})
    dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features))
    if features:
        expected = [features.encode_example(x) for _, x in ex_iterable]
    else:
        expected = [x for _, x in ex_iterable]
    assert list(dataset) == expected


Example No. 13
def test_encode_batch_with_example_with_empty_first_elem():
    features = Features({
        "x": Sequence(Sequence(ClassLabel(names=["a", "b"]))),
    })
    encoded_batch = features.encode_batch(
        {"x": [
            [["a"], ["b"]],
            [[], ["b"]],
        ]})
    assert encoded_batch == {"x": [[[0], [1]], [[], [1]]]}


@pytest.mark.parametrize(
    "feature",
    [
        Value("int32"),
        ClassLabel(num_classes=2),
        Translation(languages=["en", "fr"]),
        TranslationVariableLanguages(languages=["en", "fr"]),
    ],
)
def test_dataset_feature_with_none(feature):
    data = {"col": [None]}
    features = Features({"col": feature})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"col"}
    assert item["col"] is None
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"col"}
    assert isinstance(batch["col"], list) and all(item is None for item in batch["col"])
Example No. 14
 def test_align_with_features(self):
     task = TextClassification(text_column="input_text", label_column="input_label")
     self.assertEqual(task.label_schema["labels"], ClassLabel)
     task = task.align_with_features(Features({"input_label": ClassLabel(names=self.labels)}))
     self.assertEqual(task.label_schema["labels"], ClassLabel(names=self.labels))
Example No. 15
def dataset():
    features = Features(
        {"tokens": Sequence(Value("string")), "labels": Sequence(ClassLabel(names=["negative", "positive"]))}
    )
    dataset = Dataset.from_dict({"tokens": [["foo"] * 5] * 10, "labels": [[1] * 5] * 10}, features=features)
    return dataset
Example No. 16
    def test_reorder_fields_as(self):
        features = Features(
            {
                "id": Value("string"),
                "document": {
                    "title": Value("string"),
                    "url": Value("string"),
                    "html": Value("string"),
                    "tokens": Sequence({"token": Value("string"), "is_html": Value("bool")}),
                },
                "question": {
                    "text": Value("string"),
                    "tokens": Sequence(Value("string")),
                },
                "annotations": Sequence(
                    {
                        "id": Value("string"),
                        "long_answer": {
                            "start_token": Value("int64"),
                            "end_token": Value("int64"),
                            "start_byte": Value("int64"),
                            "end_byte": Value("int64"),
                        },
                        "short_answers": Sequence(
                            {
                                "start_token": Value("int64"),
                                "end_token": Value("int64"),
                                "start_byte": Value("int64"),
                                "end_byte": Value("int64"),
                                "text": Value("string"),
                            }
                        ),
                        "yes_no_answer": ClassLabel(names=["NO", "YES"]),
                    }
                ),
            }
        )

        other = Features(  # same but with [] instead of sequences, and with a shuffled fields order
            {
                "id": Value("string"),
                "document": {
                    "tokens": Sequence({"token": Value("string"), "is_html": Value("bool")}),
                    "title": Value("string"),
                    "url": Value("string"),
                    "html": Value("string"),
                },
                "question": {
                    "text": Value("string"),
                    "tokens": [Value("string")],
                },
                "annotations": {
                    "yes_no_answer": [ClassLabel(names=["NO", "YES"])],
                    "id": [Value("string")],
                    "long_answer": [
                        {
                            "end_byte": Value("int64"),
                            "start_token": Value("int64"),
                            "end_token": Value("int64"),
                            "start_byte": Value("int64"),
                        }
                    ],
                    "short_answers": [
                        Sequence(
                            {
                                "text": Value("string"),
                                "start_token": Value("int64"),
                                "end_token": Value("int64"),
                                "start_byte": Value("int64"),
                                "end_byte": Value("int64"),
                            }
                        )
                    ],
                },
            }
        )

        expected = Features(
            {
                "id": Value("string"),
                "document": {
                    "tokens": Sequence({"token": Value("string"), "is_html": Value("bool")}),
                    "title": Value("string"),
                    "url": Value("string"),
                    "html": Value("string"),
                },
                "question": {
                    "text": Value("string"),
                    "tokens": Sequence(Value("string")),
                },
                "annotations": Sequence(
                    {
                        "yes_no_answer": ClassLabel(names=["NO", "YES"]),
                        "id": Value("string"),
                        "long_answer": {
                            "end_byte": Value("int64"),
                            "start_token": Value("int64"),
                            "end_token": Value("int64"),
                            "start_byte": Value("int64"),
                        },
                        "short_answers": Sequence(
                            {
                                "text": Value("string"),
                                "start_token": Value("int64"),
                                "end_token": Value("int64"),
                                "start_byte": Value("int64"),
                                "end_byte": Value("int64"),
                            }
                        ),
                    }
                ),
            }
        )

        reordered_features = features.reorder_fields_as(other)
        self.assertDictEqual(reordered_features, expected)
        self.assertEqual(reordered_features.type, other.type)
        self.assertEqual(reordered_features.type, expected.type)
        self.assertNotEqual(reordered_features.type, features.type)
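A much smaller, self-contained sketch of the same `reorder_fields_as` behaviour (the field names here are illustrative):

from datasets import ClassLabel, Features, Value

f1 = Features({"a": Value("string"), "b": ClassLabel(names=["x", "y"])})
f2 = Features({"b": ClassLabel(names=["x", "y"]), "a": Value("string")})
# the returned Features keeps f1's types but follows f2's field order
assert list(f1.reorder_fields_as(f2)) == ["b", "a"]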
Example No. 17
import os

import datasets
from datasets.features import ClassLabel, Features
from transformers import BertTokenizer

# `logger` and `pre_trained_model_name` are defined earlier in the original script;
# only the imports this excerpt needs have been added here.
# pre_trained_model_name = 'roberta-base'
logger.critical("Build pre-trained model {}".format(pre_trained_model_name))
base_pre_trained_model_path = '/home/ubuntu/likun/nlp_pretrained/{}'.format(
    pre_trained_model_name)
# trained_model_path = '/home/ubuntu/likun/nlp_save_kernels/zero-shot-metric-learning-benchmark-topic-small'
# tokenizer = AutoTokenizer.from_pretrained(trained_model_path)
tokenizer = BertTokenizer.from_pretrained(base_pre_trained_model_path)

yahoo_zsl_path = '/home/ubuntu/likun/nlp_data/zsl/BenchmarkingZeroShot/topic_yahoo'
fea = Features({
    "text": datasets.Value("string"),
    # label names are read from the dataset's classes.txt file
    "label": ClassLabel(names_file=os.path.join(yahoo_zsl_path, 'classes.txt')),
})

download_config = datasets.DownloadConfig()
download_config.max_retries = 20
dataset = datasets.load_dataset('csv',
                                data_files={
                                    'train':
                                    os.path.join(yahoo_zsl_path,
                                                 'train_half_v0.csv'),
                                    'test':
                                    os.path.join(yahoo_zsl_path, 'test.csv')
                                },
                                features=fea,
                                download_config=download_config,
                                ignore_verifications=True)
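When no `classes.txt` names file is available, a hedged alternative on reasonably recent `datasets` versions is to load the CSV with plain string labels and call `class_encode_column`, which infers the `ClassLabel` names from the column values (the file path below is a placeholder):

import datasets

ds = datasets.load_dataset("csv", data_files={"train": "train.csv"}, split="train")
ds = ds.class_encode_column("label")  # converts the column to a ClassLabel feature
print(ds.features["label"].names)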