def test_dataset_with_audio_feature_with_none():
    data = {"audio": [None]}
    features = Features({"audio": Audio()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"audio"}
    assert item["audio"] is None
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"audio"}
    assert isinstance(batch["audio"], list) and all(item is None
                                                    for item in batch["audio"])
    column = dset["audio"]
    assert len(column) == 1
    assert isinstance(column, list) and all(item is None for item in column)

    # nested tests

    data = {"audio": [[None]]}
    features = Features({"audio": Sequence(Audio())})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"audio"}
    assert all(i is None for i in item["audio"])

    data = {"nested": [{"audio": None}]}
    features = Features({"nested": {"audio": Audio()}})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"nested"}
    assert item["nested"].keys() == {"audio"}
    assert item["nested"]["audio"] is None
示例#2
0
def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir):
    audio_path = str(shared_datadir / "test_audio_44100.mp3")
    data = {"audio": [audio_path]}
    features = Features({"audio": Audio()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item["audio"]["sampling_rate"] == 44100
    dset = dset.cast_column("audio", Audio(sampling_rate=16000))
    item = dset[0]
    assert item.keys() == {"audio"}
    assert item["audio"].keys() == {"path", "array", "sampling_rate"}
    assert item["audio"]["path"] == audio_path
    assert item["audio"]["array"].shape == (39707,)
    assert item["audio"]["sampling_rate"] == 16000
    batch = dset[:1]
    assert batch.keys() == {"audio"}
    assert len(batch["audio"]) == 1
    assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"}
    assert batch["audio"][0]["path"] == audio_path
    assert batch["audio"][0]["array"].shape == (39707,)
    assert batch["audio"][0]["sampling_rate"] == 16000
    column = dset["audio"]
    assert len(column) == 1
    assert column[0].keys() == {"path", "array", "sampling_rate"}
    assert column[0]["path"] == audio_path
    assert column[0]["array"].shape == (39707,)
    assert column[0]["sampling_rate"] == 16000
示例#3
0
def test_audio_resampling(shared_datadir):
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    audio = Audio(sampling_rate=16000)
    decoded_example = audio.decode_example(audio_path)
    assert decoded_example.keys() == {"path", "array", "sampling_rate"}
    assert decoded_example["path"] == audio_path
    assert decoded_example["array"].shape == (73401,)
    assert decoded_example["sampling_rate"] == 16000
def test_audio_decode_example_opus(shared_datadir):
    audio_path = str(shared_datadir / "test_audio_48000.opus")
    audio = Audio()
    decoded_example = audio.decode_example(audio.encode_example(audio_path))
    assert decoded_example.keys() == {"path", "array", "sampling_rate"}
    assert decoded_example["path"] == audio_path
    assert decoded_example["array"].shape == (48000, )
    assert decoded_example["sampling_rate"] == 48000
示例#5
0
def test_audio_decode_example_mp3(shared_datadir):
    audio_path = str(shared_datadir / "test_audio_44100.mp3")
    audio = Audio()
    decoded_example = audio.decode_example(audio_path)
    assert decoded_example.keys() == {"path", "array", "sampling_rate"}
    assert decoded_example["path"] == audio_path
    assert decoded_example["array"].shape == (109440,)
    assert decoded_example["sampling_rate"] == 44100
def test_audio_feature_encode_example(shared_datadir, build_example):
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    audio = Audio()
    encoded_example = audio.encode_example(build_example(audio_path))
    assert isinstance(encoded_example, dict)
    assert encoded_example.keys() == {"bytes", "path"}
    assert encoded_example["bytes"] is not None or encoded_example[
        "path"] is not None
    decoded_example = audio.decode_example(encoded_example)
    assert decoded_example.keys() == {"path", "array", "sampling_rate"}
def test_dataset_cast_to_audio_features(shared_datadir, build_data):
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    data = build_data(audio_path)
    dset = Dataset.from_dict(data)
    item = dset.cast(Features({"audio": Audio()}))[0]
    assert item.keys() == {"audio"}
    assert item["audio"].keys() == {"path", "array", "sampling_rate"}
    item = dset.cast_column("audio", Audio())[0]
    assert item.keys() == {"audio"}
    assert item["audio"].keys() == {"path", "array", "sampling_rate"}
def test_audio_decode_example(shared_datadir):
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    audio = Audio()
    decoded_example = audio.decode_example(audio.encode_example(audio_path))
    assert decoded_example.keys() == {"path", "array", "sampling_rate"}
    assert decoded_example["path"] == audio_path
    assert decoded_example["array"].shape == (202311, )
    assert decoded_example["sampling_rate"] == 44100

    with pytest.raises(RuntimeError):
        Audio(decode=False).decode_example(audio_path)
def test_dataset_concatenate_audio_features(shared_datadir):
    # we use a different data structure between 1 and 2 to make sure they are compatible with each other
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    data1 = {"audio": [audio_path]}
    dset1 = Dataset.from_dict(data1, features=Features({"audio": Audio()}))
    data2 = {"audio": [{"bytes": open(audio_path, "rb").read()}]}
    dset2 = Dataset.from_dict(data2, features=Features({"audio": Audio()}))
    concatenated_dataset = concatenate_datasets([dset1, dset2])
    assert len(concatenated_dataset) == len(dset1) + len(dset2)
    assert concatenated_dataset[0]["audio"]["array"].shape == dset1[0][
        "audio"]["array"].shape
    assert concatenated_dataset[1]["audio"]["array"].shape == dset2[0][
        "audio"]["array"].shape
示例#10
0
def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path):
    audio_filename = "test_audio_44100.mp3"
    data = {"audio": []}
    for file_path, file_obj in iter_archive(tar_mp3_path):
        data["audio"].append({"path": file_path, "bytes": file_obj.read()})
        break
    features = Features({"audio": Audio()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"audio"}
    assert item["audio"].keys() == {"path", "array", "sampling_rate"}
    assert item["audio"]["path"] == audio_filename
    assert item["audio"]["array"].shape == (109440, )
    assert item["audio"]["sampling_rate"] == 44100
    batch = dset[:1]
    assert batch.keys() == {"audio"}
    assert len(batch["audio"]) == 1
    assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"}
    assert batch["audio"][0]["path"] == audio_filename
    assert batch["audio"][0]["array"].shape == (109440, )
    assert batch["audio"][0]["sampling_rate"] == 44100
    column = dset["audio"]
    assert len(column) == 1
    assert column[0].keys() == {"path", "array", "sampling_rate"}
    assert column[0]["path"] == audio_filename
    assert column[0]["array"].shape == (109440, )
    assert column[0]["sampling_rate"] == 44100
def test_formatted_dataset_with_audio_feature_undecoded(shared_datadir):
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    data = {"audio": [audio_path]}
    features = Features({"audio": Audio(decode=False)})
    dset = Dataset.from_dict(data, features=features)
    with dset.formatted_as("numpy"):
        item = dset[0]
        assert item.keys() == {"audio"}
        assert item["audio"] == {"path": audio_path, "bytes": None}
        batch = dset[:1]
        assert batch.keys() == {"audio"}
        assert len(batch["audio"]) == 1
        assert batch["audio"][0] == {"path": audio_path, "bytes": None}
        column = dset["audio"]
        assert len(column) == 1
        assert column[0] == {"path": audio_path, "bytes": None}

    with dset.formatted_as("pandas"):
        item = dset[0]
        assert item.shape == (1, 1)
        assert item.columns == ["audio"]
        assert item["audio"][0] == {"path": audio_path, "bytes": None}
        batch = dset[:1]
        assert batch.shape == (1, 1)
        assert batch.columns == ["audio"]
        assert batch["audio"][0] == {"path": audio_path, "bytes": None}
        column = dset["audio"]
        assert len(column) == 1
        assert column[0] == {"path": audio_path, "bytes": None}
示例#12
0
def test_dataset_with_audio_feature_map_is_decoded(shared_datadir):
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    data = {"audio": [audio_path], "text": ["Hello"]}
    features = Features({"audio": Audio(), "text": Value("string")})
    dset = Dataset.from_dict(data, features=features)

    def process_audio_sampling_rate_by_example(example):
        example["double_sampling_rate"] = 2 * example["audio"]["sampling_rate"]
        return example

    decoded_dset = dset.map(process_audio_sampling_rate_by_example)
    for item in decoded_dset:
        assert item.keys() == {"audio", "text", "double_sampling_rate"}
        assert item["double_sampling_rate"] == 88200

    def process_audio_sampling_rate_by_batch(batch):
        double_sampling_rates = []
        for audio in batch["audio"]:
            double_sampling_rates.append(2 * audio["sampling_rate"])
        batch["double_sampling_rate"] = double_sampling_rates
        return batch

    decoded_dset = dset.map(process_audio_sampling_rate_by_batch, batched=True)
    for item in decoded_dset:
        assert item.keys() == {"audio", "text", "double_sampling_rate"}
        assert item["double_sampling_rate"] == 88200
示例#13
0
def test_audio_instantiation():
    audio = Audio()
    assert audio.sampling_rate is None
    assert audio.mono is True
    assert audio.id is None
    assert audio.dtype == "dict"
    assert audio.pa_type is None
    assert audio._type == "Audio"
    assert audio._storage_dtype == "string"
示例#14
0
def test_load_dataset_with_audio_feature(streaming, jsonl_audio_dataset_path, shared_datadir):
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    data_files = jsonl_audio_dataset_path
    features = Features({"audio": Audio(), "text": Value("string")})
    dset = load_dataset("json", split="train", data_files=data_files, features=features, streaming=streaming)
    item = dset[0] if not streaming else next(iter(dset))
    assert item.keys() == {"audio", "text"}
    assert item["audio"].keys() == {"path", "array", "sampling_rate"}
    assert item["audio"]["path"] == audio_path
    assert item["audio"]["array"].shape == (202311,)
    assert item["audio"]["sampling_rate"] == 44100
def test_audio_instantiation():
    audio = Audio()
    assert audio.sampling_rate is None
    assert audio.mono is True
    assert audio.id is None
    assert audio.dtype == "dict"
    assert audio.pa_type == pa.struct({
        "bytes": pa.binary(),
        "path": pa.string()
    })
    assert audio._type == "Audio"
 def test_from_dict(self):
     input_schema = Features({"audio": Audio()})
     label_schema = Features({"transcription": Value("string")})
     template_dict = {
         "audio_column": "input_audio",
         "transcription_column": "input_transcription",
     }
     task = AutomaticSpeechRecognition.from_dict(template_dict)
     self.assertEqual("automatic-speech-recognition", task.task)
     self.assertEqual(input_schema, task.input_schema)
     self.assertEqual(label_schema, task.label_schema)
def test_audio_feature_type_to_arrow():
    features = Features({"audio": Audio()})
    assert features.arrow_schema == pa.schema({"audio": Audio().pa_type})
    features = Features({"struct_containing_an_audio": {"audio": Audio()}})
    assert features.arrow_schema == pa.schema(
        {"struct_containing_an_audio": pa.struct({"audio": Audio().pa_type})})
    features = Features({"sequence_of_audios": Sequence(Audio())})
    assert features.arrow_schema == pa.schema(
        {"sequence_of_audios": pa.list_(Audio().pa_type)})
def test_dataset_with_audio_feature_map_undecoded(shared_datadir):
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    data = {"audio": [audio_path]}
    features = Features({"audio": Audio(decode=False)})
    dset = Dataset.from_dict(data, features=features)

    def assert_audio_example_undecoded(example):
        assert example["audio"] == {"path": audio_path, "bytes": None}

    dset.map(assert_audio_example_undecoded)

    def assert_audio_batch_undecoded(batch):
        for audio in batch["audio"]:
            assert audio == {"path": audio_path, "bytes": None}

    dset.map(assert_audio_batch_undecoded, batched=True)
示例#19
0
def test_formatted_dataset_with_audio_feature(shared_datadir):
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    data = {"audio": [audio_path, audio_path]}
    features = Features({"audio": Audio()})
    dset = Dataset.from_dict(data, features=features)
    with dset.formatted_as("numpy"):
        item = dset[0]
        assert item.keys() == {"audio"}
        assert item["audio"].keys() == {"path", "array", "sampling_rate"}
        assert item["audio"]["path"] == audio_path
        assert item["audio"]["array"].shape == (202311,)
        assert item["audio"]["sampling_rate"] == 44100
        batch = dset[:1]
        assert batch.keys() == {"audio"}
        assert len(batch["audio"]) == 1
        assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"}
        assert batch["audio"][0]["path"] == audio_path
        assert batch["audio"][0]["array"].shape == (202311,)
        assert batch["audio"][0]["sampling_rate"] == 44100
        column = dset["audio"]
        assert len(column) == 2
        assert column[0].keys() == {"path", "array", "sampling_rate"}
        assert column[0]["path"] == audio_path
        assert column[0]["array"].shape == (202311,)
        assert column[0]["sampling_rate"] == 44100

    with dset.formatted_as("pandas"):
        item = dset[0]
        assert item.shape == (1, 1)
        assert item.columns == ["audio"]
        assert item["audio"][0].keys() == {"path", "array", "sampling_rate"}
        assert item["audio"][0]["path"] == audio_path
        assert item["audio"][0]["array"].shape == (202311,)
        assert item["audio"][0]["sampling_rate"] == 44100
        item = dset[:1]
        assert item.shape == (1, 1)
        assert item.columns == ["audio"]
        assert item["audio"][0].keys() == {"path", "array", "sampling_rate"}
        assert item["audio"][0]["path"] == audio_path
        assert item["audio"][0]["array"].shape == (202311,)
        assert item["audio"][0]["sampling_rate"] == 44100
        column = dset["audio"]
        assert len(column) == 2
        assert column[0].keys() == {"path", "array", "sampling_rate"}
        assert column[0]["path"] == audio_path
        assert column[0]["array"].shape == (202311,)
        assert column[0]["sampling_rate"] == 44100
示例#20
0
def test_dataset_with_audio_feature_map_is_not_decoded(shared_datadir):
    audio_path = str(shared_datadir / "test_audio_44100.wav")
    data = {"audio": [audio_path], "text": ["Hello"]}
    features = Features({"audio": Audio(), "text": Value("string")})
    dset = Dataset.from_dict(data, features=features)

    for item in dset:
        assert item.keys() == {"audio", "text"}
        assert item == {"audio": audio_path, "text": "Hello"}

    def process_text(example):
        example["text"] = example["text"] + " World!"
        return example

    processed_dset = dset.map(process_text)
    for item in processed_dset:
        assert item.keys() == {"audio", "text"}
        assert item == {"audio": audio_path, "text": "Hello World!"}
def test_audio_resampling_mp3_different_sampling_rates(shared_datadir):
    audio_path = str(shared_datadir / "test_audio_44100.mp3")
    audio_path2 = str(shared_datadir / "test_audio_16000.mp3")
    audio = Audio(sampling_rate=48000)

    decoded_example = audio.decode_example(audio.encode_example(audio_path))
    assert decoded_example.keys() == {"path", "array", "sampling_rate"}
    assert decoded_example["path"] == audio_path
    assert decoded_example["array"].shape == (119119, )
    assert decoded_example["sampling_rate"] == 48000

    decoded_example = audio.decode_example(audio.encode_example(audio_path2))
    assert decoded_example.keys() == {"path", "array", "sampling_rate"}
    assert decoded_example["path"] == audio_path2
    assert decoded_example["array"].shape == (120960, )
    assert decoded_example["sampling_rate"] == 48000
示例#22
0
def test_audio_instantiation():
    audio = Audio()
    assert audio.id is None
    assert audio.dtype == "dict"
    assert audio.pa_type is None
    assert audio._type == "Audio"