def __getitem__(self, idx):
    """Load the pair of utterances stored at position ``idx``.

    Returns:
        A 5-tuple: both waveforms as NumPy arrays, the size of each
        waveform along its leading dimension after ``squeeze(0)``, and
        the first element of the stored label converted to ``int``.
    """
    label, first_path, second_path = self.dataset[idx]

    # Element 0 of the result is the waveform; the second value is unused.
    first_wav = apply_effects_file(first_path, EFFECTS)[0]
    second_wav = apply_effects_file(second_path, EFFECTS)[0]

    first_wav = first_wav.squeeze(0)
    second_wav = second_wav.squeeze(0)

    return (
        first_wav.numpy(),
        second_wav.numpy(),
        first_wav.shape[0],
        second_wav.shape[0],
        int(label[0]),
    )
Exemplo n.º 2
0
    def __getitem__(self, idx):
        """Return both waveforms of pair ``idx`` with their names and label.

        Each name is built from the last three components of the file path,
        joined with "-", with the final suffix removed.

        Returns:
            ``(wav1, wav2, name1, name2, label)`` where the waveforms are
            NumPy arrays and ``label`` is ``int(y_label[0])``.
        """
        label, first_path, second_path = self.dataset[idx]

        names = [
            Path("-".join(Path(wav_path).parts[-3:])).stem
            for wav_path in (first_path, second_path)
        ]

        waveforms = []
        for wav_path in (first_path, second_path):
            waveform, _ = apply_effects_file(wav_path, EFFECTS)
            waveforms.append(waveform.squeeze(0))

        return (
            waveforms[0].numpy(),
            waveforms[1].numpy(),
            names[0],
            names[1],
            int(label[0]),
        )
Exemplo n.º 3
0
def main():
    """Export the rows of a SQLite ``data`` table to a CSV manifest.

    ``--input_sqlite`` and ``--job_id`` are read from the docopt arguments.
    For every row, the ``audio_url`` found in the JSON of column 1 is handed
    to ``audio_url_to_file`` together with ``job_id``, the result goes
    through ``flac2wav``, and the file is decoded once with ``EFFECTS`` as a
    readability check.  The label is the ``type`` of the first entry of the
    JSON in column 2.  Rows that raise are counted, their index is printed,
    and they are left out of the manifest.

    The manifest is written to ``<job_id>.csv`` with the columns
    ``audio_path`` and ``label``.
    """
    args = docopt(__doc__)
    input_sqlite = args["--input_sqlite"]
    job_id = args["--job_id"]
    if not os.path.exists(job_id):
        os.mkdir(job_id)

    # Release the database handle even if the query itself fails.
    conn = sqlite3.connect(input_sqlite)
    try:
        rows = conn.execute("SELECT * FROM data").fetchall()
    finally:
        conn.close()

    base_path = "scripts/"
    input_df = []
    num = 0
    # total= lets the progress bar show completion, which enumerate() hides.
    for i, row in tqdm(enumerate(rows), total=len(rows)):
        try:
            audio_url = json.loads(row[1])["audio_url"]
            intent_label = json.loads(row[2])[0]["type"]
            path = audio_url_to_file(audio_url, job_id)
            path = flac2wav(path)
            # Decoding is only a sanity check; the waveform is discarded.
            apply_effects_file(str(path), EFFECTS)
            # str() keeps the concatenation valid if flac2wav returns a Path.
            input_df.append([base_path + str(path), intent_label])
        except Exception:
            # Best-effort export: skip the bad row, but let KeyboardInterrupt
            # and SystemExit propagate (a bare ``except:`` would eat them).
            num += 1
            print(i)

    input_df = pd.DataFrame(input_df, columns=["audio_path", "label"])
    input_df.to_csv(f"{job_id}.csv", index=False)
    print("total audios skipped :", num)
Exemplo n.º 4
0
    def segment_processing(self):
        """Split every utterance of the test table into fixed-size segments.

        Each entry of ``self.pair_dict['wav_table']`` is indexed as
        ``[label, pair, wav_path]``.  The waveform is loaded with ``EFFECTS``
        and cut into windows of ``self.segment_config["window"]`` taken
        every ``self.segment_config['stride']`` positions.  A waveform
        shorter than the window yields one record covering it entirely.

        Returns:
            A list of records
            ``[label, pair, utterance_id, segment_num, start, end, wav_path]``
            where ``utterance_id`` is the entry's position as a string.
        """
        wav_table = self.pair_dict['wav_table']
        segment_list = []
        print("processing test set to segments")
        for utterance_id, wav_info in enumerate(tqdm.tqdm(wav_table)):
            label_info = wav_info[0]
            pair_info = wav_info[1]
            wav_path = wav_info[2]

            wav = apply_effects_file(wav_path, EFFECTS)[0].squeeze(0)

            window = self.segment_config["window"]
            stride = self.segment_config['stride']
            index_end = len(wav) - window
            segment_num = index_end // stride
            uid = str(utterance_id)

            if index_end >= 0:
                # NOTE(review): when index_end == 0 (waveform exactly one
                # window long) this range is empty and no record is emitted;
                # segment_num is also the floor of index_end / stride, not
                # the number of records produced -- confirm this is intended.
                segment_list.extend(
                    [
                        int(label_info), pair_info,
                        uid, segment_num, start,
                        start + window, wav_path
                    ]
                    for start in range(0, index_end, stride)
                )
            else:
                # Too short for a single window: keep the whole waveform.
                segment_list.append([
                    int(label_info), pair_info,
                    uid, segment_num, 0,
                    len(wav), wav_path
                ])

        return segment_list
Exemplo n.º 5
0
    def test_requests(self, ext, compression):
        """Effects applied to an HTTP response stream match the sox reference.

        An audio file is generated, a reference output is produced with
        ``sox_utils.run_sox_effect``, and the same file is then fetched with
        ``requests`` and its raw stream is passed to ``apply_effects_file``.
        """
        channels_first = True
        rate = 16000
        effect_chain = [['band', '300', '10']]
        # Only mp3 is given an explicit format; any other extension gets None.
        fmt = ext if ext == 'mp3' else None
        filename = f'input.{ext}'

        input_path = self.get_temp_path(filename)
        reference_path = self.get_temp_path('reference.wav')
        sox_utils.gen_audio_file(
            input_path, rate, num_channels=2, compression=compression)
        sox_utils.run_sox_effect(
            input_path, reference_path, effect_chain, output_bitdepth=32)
        expected, expected_sr = load_wav(reference_path)

        with requests.get(self.get_url(filename), stream=True) as response:
            found, sr = sox_effects.apply_effects_file(
                response.raw,
                effect_chain,
                channels_first=channels_first,
                format=fmt)

        # The result is also written to disk before the comparison.
        result_path = self.get_temp_path('result.wav')
        save_wav(result_path, found, sr, channels_first=channels_first)
        assert sr == expected_sr
        self.assertEqual(found, expected)
Exemplo n.º 6
0
def _load_wav_with_speed(wav_file, speed):
    """Load a waveform from file, optionally applying speed perturbation.

    Args:
        wav_file: audio file to load
        speed: speed factor; with 1.0 the file is loaded without any effect

    Returns:
        (waveform, sample rate)
    """
    if speed == 1.0:
        wav, sr = torchaudio.load(wav_file)
        return wav, sr

    file_info = torchaudio.backend.sox_io_backend.info(wav_file)
    sample_rate = file_info.sample_rate

    # The sox effects API depends on the torchaudio (major, minor) version.
    version_parts = torchaudio.__version__.split(".")
    major = int(version_parts[0])
    minor = int(version_parts[1])

    if (major, minor) >= (0, 8):
        # Note: enable in torchaudio>=0.8.0
        # 'rate' is applied after 'speed' with the file's own sample rate.
        effects = [['speed', str(speed)], ['rate', str(sample_rate)]]
        wav, sr = sox_effects.apply_effects_file(wav_file, effects)
    else:
        # Note: deprecated in torchaudio>=0.8.0
        chain = sox_effects.SoxEffectsChain()
        chain.append_effect_to_chain('speed', speed)
        chain.append_effect_to_chain("rate", sample_rate)
        chain.set_input_file(wav_file)
        wav, sr = chain.sox_build_flow_effects()

    return wav, sr
Exemplo n.º 7
0
def main():
    """Build a CSV manifest from a JSON export of labelled audio.

    ``--input_json`` and ``--audio_dir`` are read from the docopt arguments.
    ``audio_dir`` is recreated empty.  For every row, ``data.audio_url`` is
    handed to ``audio_url_to_file`` together with ``audio_dir``, the result
    goes through ``flac2wav``, and the file is decoded once with ``EFFECTS``
    as a readability check.  The transcripts of the first alternatives list
    are joined with ``"</s></s>"``.

    The manifest is written to ``<audio_dir>.csv`` with the columns
    ``audio_path``, ``label`` and ``alternatives``.  Any failure aborts the
    run: no per-row error handling is active.
    """
    import shutil

    args = docopt(__doc__)
    input_json = args["--input_json"]
    audio_dir = args["--audio_dir"]

    # Start from an empty directory.  os.remove() only works on files (and
    # links), so an existing directory has to go through shutil.rmtree().
    if os.path.isdir(audio_dir) and not os.path.islink(audio_dir):
        shutil.rmtree(audio_dir)
    elif os.path.lexists(audio_dir):
        os.remove(audio_dir)
    os.mkdir(audio_dir)

    with open(input_json, 'r') as f:
        rows = json.load(f)

    base_path = "scripts/"
    input_df = []
    # Never incremented: nothing is skipped, so the summary reports 0.
    num = 0
    for row in tqdm(rows):
        audio_url = row["data"]["audio_url"]
        print(audio_url)
        intent_label = row["intent"]
        alternatives = "</s></s>".join(
            [x['transcript'] for x in row["data"]["alternatives"][0]])
        path = audio_url_to_file(audio_url, audio_dir)
        path = flac2wav(path)
        # Decoding is only a sanity check; the waveform is discarded.
        apply_effects_file(str(path), EFFECTS)
        # str() keeps the concatenation valid if flac2wav returns a Path.
        input_df.append([base_path + str(path), intent_label, alternatives])

    input_df = pd.DataFrame(
        input_df, columns=["audio_path", "label", "alternatives"])
    input_df.to_csv(f"{audio_dir}.csv", index=False)
    print("total audios skipped :", num)
Exemplo n.º 8
0
def find_queries(query_dir_path):
    """Load every ``*.wav`` query in ``query_dir_path``, grouped by name.

    Files are loaded with the ``channels 1`` / ``rate 16000`` / ``norm``
    effects and then passed through a ``vad``/``reverse`` chain applied
    twice.  The trimmed version is kept only when it still has at least
    ``sample_rate * 0.5`` frames; otherwise the untrimmed one is used.

    Returns:
        A ``defaultdict(list)`` mapping query name to the list of its
        tensors (dimension 0 squeezed).
    """
    # Strips ".wav" and an optional two-digit "_NN" part, e.g.
    # "sws2013_dev_123.wav" or "sws2013_dev_123_01.wav" -> "sws2013_dev_123"
    suffix_re = re.compile(r"(_[0-9]{2})?\.wav")

    load_effects = [["channels", "1"], ["rate", "16000"], ["norm"]]
    trim_effects = [
        ["vad", "-T", "0.25", "-p", "0.1"],
        ["reverse"],
        ["vad", "-T", "0.25", "-p", "0.1"],
        ["reverse"],
    ]

    query2tensors = defaultdict(list)
    query_files = list(query_dir_path.glob("*.wav"))
    for query_path in tqdm(query_files, ncols=0, desc="Load queries"):
        query_name = suffix_re.sub("", query_path.name)
        wav_tensor, sample_rate = apply_effects_file(
            str(query_path), load_effects)
        trimmed, _ = apply_effects_tensor(
            wav_tensor, sample_rate, trim_effects)

        long_enough = trimmed.size(1) >= (sample_rate * 0.5)
        chosen = trimmed if long_enough else wav_tensor
        query2tensors[query_name].append(chosen.squeeze(0))

    return query2tensors
Exemplo n.º 9
0
    def test_tarfile(self, ext, compression):
        """Effects can be applied to compressed audio read from a tar member.

        The generated file is stored in an archive, extracted as a file-like
        object, and the result is compared with the output of
        ``sox_utils.run_sox_effect`` on the same file.
        """
        channels_first = True
        rate = 16000
        effect_chain = [['band', '300', '10']]
        # Only mp3 is given an explicit format; any other extension gets None.
        fmt = ext if ext == 'mp3' else None
        member_name = f'input.{ext}'

        input_path = self.get_temp_path(member_name)
        reference_path = self.get_temp_path('reference.wav')
        archive_path = self.get_temp_path('archive.tar.gz')

        sox_utils.gen_audio_file(
            input_path,
            rate,
            num_channels=2,
            compression=compression)
        sox_utils.run_sox_effect(
            input_path,
            reference_path,
            effect_chain,
            output_bitdepth=32)
        expected, expected_sr = load_wav(reference_path)

        with tarfile.TarFile(archive_path, 'w') as writer:
            writer.add(input_path, arcname=member_name)
        with tarfile.TarFile(archive_path, 'r') as reader:
            member = reader.extractfile(member_name)
            found, sr = sox_effects.apply_effects_file(
                member,
                effect_chain,
                channels_first=channels_first,
                format=fmt)

        result_path = self.get_temp_path('result.wav')
        save_wav(result_path, found, sr, channels_first=channels_first)
        assert sr == expected_sr
        self.assertEqual(found, expected)
Exemplo n.º 10
0
    def test_apply_effects_file(self, args):
        """A TorchScript-ed file transform matches ``apply_effects_file``.

        ``SoxEffectFileTransform`` is scripted, saved, and loaded back; its
        output on a generated sinusoid wav must equal the output of calling
        ``sox_effects.apply_effects_file`` directly on the same file.

        Args:
            args: mapping with ``effects`` and optionally ``num_channels``
                (default 2) and ``input_sample_rate`` (default 8000).
        """
        effects = args['effects']
        channels_first = True
        num_channels = args.get("num_channels", 2)
        input_sr = args.get("input_sample_rate", 8000)

        trans = SoxEffectFileTransform(effects, channels_first)

        # Round-trip the transform through TorchScript serialization.
        path = self.get_temp_path('sox_effect.zip')
        torch.jit.script(trans).save(path)
        trans = torch.jit.load(path)

        path = self.get_temp_path('input.wav')
        wav = get_sinusoid(frequency=800,
                           sample_rate=input_sr,
                           n_channels=num_channels,
                           dtype='float32',
                           channels_first=channels_first)
        save_wav(path,
                 wav,
                 sample_rate=input_sr,
                 channels_first=channels_first)

        found, sr_found = trans(path)
        # The third positional parameter of apply_effects_file is
        # ``normalize``; pass channels_first by keyword so it reaches the
        # intended parameter.
        expected, sr_expected = sox_effects.apply_effects_file(
            path, effects, channels_first=channels_first)

        assert sr_found == sr_expected
        self.assertEqual(expected, found)
Exemplo n.º 11
0
    def test_apply_effects_path(self):
        """`apply_effects_file` given a ``Path`` object returns the same data as the sox command"""
        channels_first = True
        effect_chain = [["hilbert"]]
        n_channels = 2
        in_rate = 8000
        out_rate = 8000

        input_path = self.get_temp_path('input.wav')
        reference_path = self.get_temp_path('reference.wav')
        save_wav(
            input_path,
            get_wav_data('int32', n_channels, channels_first=channels_first),
            in_rate,
            channels_first=channels_first,
        )
        sox_utils.run_sox_effect(
            input_path, reference_path, effect_chain,
            output_sample_rate=out_rate)

        expected, expected_sr = load_wav(reference_path)
        # The input is wrapped in Path here; everything else uses the str.
        found, sr = sox_effects.apply_effects_file(
            Path(input_path), effect_chain,
            normalize=False, channels_first=channels_first)

        assert sr == expected_sr
        self.assertEqual(found, expected)
Exemplo n.º 12
0
    def test_apply_effects_str(self, args):
        """`apply_effects_file` given a str path returns the same data as the sox command

        ``args`` supplies ``effects`` and optionally ``num_channels``
        (default 2), ``input_sample_rate`` (default 8000) and
        ``output_sample_rate`` (default None).
        """
        channels_first = True
        effect_chain = args['effects']
        n_channels = args.get("num_channels", 2)
        in_rate = args.get("input_sample_rate", 8000)
        out_rate = args.get("output_sample_rate")

        input_path = self.get_temp_path('input.wav')
        reference_path = self.get_temp_path('reference.wav')
        save_wav(
            input_path,
            get_wav_data('int32', n_channels, channels_first=channels_first),
            in_rate,
            channels_first=channels_first,
        )
        sox_utils.run_sox_effect(
            input_path, reference_path, effect_chain,
            output_sample_rate=out_rate)

        expected, expected_sr = load_wav(reference_path)
        found, sr = sox_effects.apply_effects_file(
            input_path, effect_chain,
            normalize=False, channels_first=channels_first)

        assert sr == expected_sr
        self.assertEqual(found, expected)
Exemplo n.º 13
0
def _load_wav_with_speed(wav_file, speed):
    """Load a waveform from file, optionally applying speed perturbation.

    Args:
        wav_file: audio file to load
        speed: speed factor; with 1.0 the result of ``torchaudio.load_wav``
            is returned directly

    Returns:
        (waveform, sample rate); on the perturbed path the waveform is
        multiplied by ``1 << 15``
    """
    if speed == 1.0:
        return torchaudio.load_wav(wav_file)

    signal_info, _ = torchaudio.info(wav_file)

    # The sox effects API depends on the torchaudio (major, minor) version.
    version_parts = torchaudio.__version__.split(".")
    major = int(version_parts[0])
    minor = int(version_parts[1])

    if (major, minor) >= (0, 8):
        # Note: enable in torchaudio>=0.8.0
        effects = [['speed', str(speed)], ['rate', str(signal_info.rate)]]
        wav, sr = sox_effects.apply_effects_file(wav_file, effects)
    else:
        # Note: deprecated in torchaudio>=0.8.0
        chain = sox_effects.SoxEffectsChain()
        chain.append_effect_to_chain('speed', speed)
        chain.append_effect_to_chain("rate", signal_info.rate)
        chain.set_input_file(wav_file)
        wav, sr = chain.sox_build_flow_effects()

    # sox will normalize the waveform, scale to [-32768, 32767]
    return wav * (1 << 15), sr
Exemplo n.º 14
0
    def test_bytesio(self, ext, compression):
        """Effects can be applied to audio held in a ``BytesIO`` object.

        The generated file is read fully into memory and the result is
        compared with the output of ``sox_utils.run_sox_effect`` on the file.
        """
        channels_first = True
        rate = 16000
        effect_chain = [['band', '300', '10']]
        # Only mp3 is given an explicit format; any other extension gets None.
        fmt = ext if ext == 'mp3' else None

        input_path = self.get_temp_path(f'input.{ext}')
        reference_path = self.get_temp_path('reference.wav')
        sox_utils.gen_audio_file(
            input_path, rate, num_channels=2, compression=compression)
        sox_utils.run_sox_effect(
            input_path, reference_path, effect_chain, output_bitdepth=32)
        expected, expected_sr = load_wav(reference_path)

        with open(input_path, 'rb') as source:
            payload = source.read()
        buffer = io.BytesIO(payload)
        found, sr = sox_effects.apply_effects_file(
            buffer,
            effect_chain,
            channels_first=channels_first,
            format=fmt)

        result_path = self.get_temp_path('result.wav')
        save_wav(result_path, found, sr, channels_first=channels_first)
        assert sr == expected_sr
        self.assertEqual(found, expected)
Exemplo n.º 15
0
    def __init__(self, vad_config, key_list, file_path, meta_data, max_timestep=None):
        """Collect every usable utterance under the given dataset roots.

        For each root, the speaker -> wav-paths table is loaded from a pickle
        cache next to this module (``cache_wav_paths/cache_<key>.p``) when it
        exists.  Otherwise every speaker directory is scanned, each wav is
        decoded with ``EFFECTS``, and only those longer than
        ``vad_config['min_sec']`` are kept; the table is then written to the
        cache.

        Args:
            vad_config: mapping read for its ``'min_sec'`` entry.
                NOTE(review): despite the name, the value is compared with
                ``shape[0]`` of the squeezed waveform, not with a duration
                in seconds -- confirm the expected unit.
            key_list: one key per root; selects the cache file name.
            file_path: dataset root directories, indexed in parallel with
                ``key_list``.
            meta_data: not used in this method.
            max_timestep: stored on the instance; not used in this method.
        """
        self.roots = file_path
        self.root_key = key_list
        self.max_timestep = max_timestep
        self.vad_c = vad_config
        self.dataset = []
        self.all_speakers = []

        def to_utterance_id(wav_path):
            # Last three path components joined with "-", ".wav" removed.
            return "/".join(wav_path.split("/")[-3:]).replace(".wav", "").replace("/", "-")

        cache_dir = Path(os.path.dirname(__file__)) / 'cache_wav_paths'

        for index, key in enumerate(self.root_key):
            cache_path = cache_dir / f'cache_{key}.p'
            p = Path(self.roots[index])

            # load cache_path if file exists
            if os.path.isfile(cache_path):
                # cache dict:
                # {
                #   "speaker_id1": ["wav_a_path1", "wav_a_path2", ...],
                #   "speaker_id2": ["wav_b_path1", "wav_b_path2", ...],
                #   ...,
                # }
                # NOTE: pickle executes code on load; the cache file must
                # come from a trusted source (it is written by this method).
                with open(cache_path, "rb") as cache_file:
                    cache_wavs_dict = pickle.load(cache_file)
                self.all_speakers.extend(list(cache_wavs_dict.keys()))
                for speaker_id, speaker_wavs in cache_wavs_dict.items():
                    for wavs in speaker_wavs:
                        wav_path = str(p / speaker_id / wavs)
                        self.dataset.append([wav_path, to_utterance_id(wav_path)])

            else:
                speaker_wav_dict = {}
                # Close the directory iterator as soon as the names are read.
                with os.scandir(self.roots[index]) as entries:
                    speaker_dirs = [f.path.split("/")[-1] for f in entries if f.is_dir()]
                self.all_speakers.extend(speaker_dirs)

                print("search all wavs paths")
                start = time.time()
                for speaker in tqdm.tqdm(speaker_dirs):
                    speaker_dir = p / speaker
                    wav_list = find_files(speaker_dir)
                    speaker_wav_dict[speaker] = []
                    for wav in wav_list:
                        wav_path = str(speaker_dir / wav)
                        wav_sample, _ = apply_effects_file(wav_path, EFFECTS)
                        wav_sample = wav_sample.squeeze(0)
                        length = wav_sample.shape[0]

                        if length > self.vad_c['min_sec']:
                            self.dataset.append([wav_path, to_utterance_id(wav_path)])
                            # The cache stores only the last two components.
                            speaker_wav_dict[speaker].append("/".join(wav.split("/")[-2:]))
                end = time.time()

                print(f"search all wavs paths costs {end-start} seconds")
                print(f"save wav paths to {cache_path}! so we can directly load all_path in next time!")
                # Make sure the cache directory exists so the scan above is
                # not lost to a FileNotFoundError, and close the file so the
                # pickle is fully flushed to disk.
                cache_path.parent.mkdir(parents=True, exist_ok=True)
                with open(cache_path, "wb") as cache_file:
                    pickle.dump(speaker_wav_dict, cache_file)

        self.speaker_num = len(self.all_speakers)
        self.necessary_dict = self.processing()
        self.label_mapping_spk_id = {}
        # speaker id  map to speaker num
        self.build_label_mapping()
        self.label = self.build_label(self.dataset)
Exemplo n.º 16
0
def path2tensor(filepath):
    """Load ``filepath`` with the channels/rate/norm effects and squeeze dim 0."""
    effects = [["channels", "1"], ["rate", "16000"], ["norm"]]
    # Element 0 of the result is the tensor; the second value is unused.
    loaded = apply_effects_file(str(filepath), effects)[0]
    return loaded.squeeze(0)
    def __getitem__(self, idx):
        """Return the waveform, utterance id and label of item ``idx``.

        The waveform is loaded from ``self.dataset[idx][0]`` with ``EFFECTS``.
        When ``self.max_timestep`` is set and the waveform is longer, a
        window of ``max_timestep`` entries is cut at a random start.

        Returns:
            ``(wav, utterance_id, label)`` with ``wav`` as a NumPy array,
            ``utterance_id`` from ``self.dataset[idx][1]`` and ``label``
            from ``self.label[idx]``.
        """
        wav, _ = apply_effects_file(self.dataset[idx][0], EFFECTS)
        wav = wav.squeeze(0)
        length = wav.shape[0]

        # None is compared by identity (PEP 8), not with ``!=``.
        if self.max_timestep is not None and length > self.max_timestep:
            # randint is inclusive on both ends, so the last valid start
            # (length - max_timestep) can be drawn.
            start = random.randint(0, int(length - self.max_timestep))
            wav = wav[start:start + self.max_timestep]

        return wav.numpy(), self.dataset[idx][1], self.label[idx]
Exemplo n.º 18
0
    def test_apply_no_effect(self, dtype, sample_rate, num_channels, channels_first):
        """`apply_effects_file` with an empty effect list returns the input data unchanged"""
        original = get_wav_data(dtype, num_channels, channels_first=channels_first)
        wav_path = self.get_temp_path('input.wav')
        save_wav(wav_path, original, sample_rate, channels_first=channels_first)

        no_effects = []
        found, found_rate = sox_effects.apply_effects_file(
            wav_path,
            no_effects,
            normalize=False,
            channels_first=channels_first,
        )

        assert found_rate == sample_rate
        self.assertEqual(original, found)
Exemplo n.º 19
0
    def __getitem__(self, idx):
        """Return the waveform tensor and second field of item ``idx``.

        The waveform is loaded from ``self.data_list[idx][0]`` with
        ``EFFECTS``.  When ``self.max_timestep`` is set and the waveform is
        longer, a window of ``max_timestep`` entries is cut at a random start.
        """
        entry = self.data_list[idx]
        loaded, _ = apply_effects_file(str(entry[0]), EFFECTS)
        wav = loaded.squeeze(0)
        total = wav.shape[0]

        limit = self.max_timestep
        if limit is not None and total > limit:
            # randint is inclusive on both ends.
            offset = random.randint(0, int(total - limit))
            wav = wav[offset:offset + limit]

        return wav, entry[1]
Exemplo n.º 20
0
 def __getitem__(self, idx):
     """Return the waveform of item ``idx`` and its suffix-less file name.

     The file is loaded with the ``channels 1`` / ``rate 16000`` /
     ``gain -3.0`` effects and returned as a NumPy array.
     """
     effects = [["channels", "1"], ["rate", "16000"], ["gain", "-3.0"]]
     audio_path = self.data[idx]
     # Element 0 of the result is the waveform; the second value is unused.
     waveform = apply_effects_file(str(audio_path), effects)[0]
     waveform = waveform.squeeze(0)
     return waveform.numpy(), audio_path.with_suffix("").name
Exemplo n.º 21
0
    def test_mp3(self):
        """An mp3 file without extension can be read when ``format`` is given

        libsox does not check header for mp3

        https://github.com/pytorch/audio/issues/1040

        The file was generated with the following command
            ffmpeg -f lavfi -i "sine=frequency=1000:duration=5" -ar 16000 -f mp3 test_noext
        """
        asset = get_asset_path("mp3_without_ext")
        band_effect = [['band', '300', '10']]
        result = sox_effects.apply_effects_file(
            asset, band_effect, format="mp3")
        # Only the returned sample rate is checked.
        assert result[1] == 16000
Exemplo n.º 22
0
    def test_apply_effects_file(self, args):
        """`apply_effects_file` runs on a generated int32 wav file

        The outputs are discarded: this only checks that the call completes.
        ``args`` supplies ``effects`` and optionally ``num_channels``
        (default 2) and ``input_sample_rate`` (default 8000).
        """
        channels_first = True
        effect_chain = args['effects']
        n_channels = args.get("num_channels", 2)
        in_rate = args.get("input_sample_rate", 8000)

        input_path = self.get_temp_path('input.wav')
        save_wav(
            input_path,
            get_wav_data('int32', n_channels, channels_first=channels_first),
            in_rate,
            channels_first=channels_first,
        )

        _, _ = sox_effects.apply_effects_file(
            input_path,
            effect_chain,
            normalize=False,
            channels_first=channels_first,
        )
Exemplo n.º 23
0
    def __getitem__(self, idx):
        """Return the waveform, utterance id and speaker label of item ``idx``.

        The waveform is loaded from ``self.dataset[idx]`` with ``EFFECTS``.
        When ``self.max_timestep`` is set and the waveform is longer, a
        window of ``max_timestep`` entries is cut at a random start.

        Returns:
            ``(wav, utterance_id, label)``: ``wav`` is a NumPy array,
            ``utterance_id`` is the last three path components joined with
            "-" with ".wav" removed, and ``label`` is the position of the
            first of those components in ``self.all_speakers``.
        """
        path = self.dataset[idx]
        wav, _ = apply_effects_file(str(path), EFFECTS)
        wav = wav.squeeze(0)
        length = wav.shape[0]

        # None is compared by identity (PEP 8), not with ``!=``.
        if self.max_timestep is not None and length > self.max_timestep:
            # randint is inclusive on both ends, so the last valid start
            # (length - max_timestep) can be drawn.
            start = random.randint(0, int(length - self.max_timestep))
            wav = wav[start:start + self.max_timestep]

        tags = Path(path).parts[-3:]
        utterance_id = "-".join(tags).replace(".wav", "")
        label = self.all_speakers.index(tags[0])
        return wav.numpy(), utterance_id, label
Exemplo n.º 24
0
 def __getitem__(self, idx):
     """Return the sliding-window segments of item ``idx``.

     The file is loaded with a channels/rate/norm chain, a ``vad`` +
     ``reverse`` pair applied twice, and a final ``pad 0 3``.  The squeezed
     waveform is then unfolded into windows of 48000 entries taken every
     12000 entries.

     Returns:
         ``(segments, number of segments, suffix-less file name)``.
     """
     vad_effect = ["vad", "-T", "0.25", "-p", "0.1"]
     effects = [
         ["channels", "1"],
         ["rate", "16000"],
         ["norm"],
         list(vad_effect),
         ["reverse"],
         list(vad_effect),
         ["reverse"],
         ["pad", "0", "3"],
     ]
     audio_path = self.data[idx]
     waveform, _ = apply_effects_file(str(audio_path), effects)

     windows = waveform.squeeze(0).unfold(0, 48000, 12000)
     segments = windows.unbind(0)
     return segments, len(segments), audio_path.with_suffix("").name
Exemplo n.º 25
0
    def test_wav(self, dtype, sample_rate, num_channels):
        """`apply_effects_file` handles the given wav dtype / rate / channel count

        The output is compared with the reference produced by
        ``sox_utils.run_sox_effect`` on the same input file.
        """
        channels_first = True
        effect_chain = [['band', '300', '10']]

        input_path = self.get_temp_path('input.wav')
        reference_path = self.get_temp_path('reference.wav')
        save_wav(
            input_path,
            get_wav_data(dtype, num_channels, channels_first=channels_first),
            sample_rate,
            channels_first=channels_first,
        )
        sox_utils.run_sox_effect(input_path, reference_path, effect_chain)

        expected, expected_sr = load_wav(reference_path)
        found, sr = sox_effects.apply_effects_file(
            input_path,
            effect_chain,
            normalize=False,
            channels_first=channels_first,
        )

        assert sr == expected_sr
        self.assertEqual(found, expected)
Exemplo n.º 26
0
    def test_vorbis(self, sample_rate, num_channels):
        """`apply_effects_file` handles vorbis input at the given rate / channel count

        The output is compared with the reference produced by
        ``sox_utils.run_sox_effect`` on the same input file.
        """
        channels_first = True
        effect_chain = [['band', '300', '10']]

        input_path = self.get_temp_path('input.vorbis')
        reference_path = self.get_temp_path('reference.wav')
        sox_utils.gen_audio_file(input_path, sample_rate, num_channels)
        sox_utils.run_sox_effect(
            input_path,
            reference_path,
            effect_chain,
            output_bitdepth=32,
        )

        expected, expected_sr = load_wav(reference_path)
        found, sr = sox_effects.apply_effects_file(
            input_path,
            effect_chain,
            channels_first=channels_first,
        )
        # The result is also written to disk before the comparison.
        result_path = self.get_temp_path('result.wav')
        save_wav(result_path, found, sr, channels_first=channels_first)

        assert sr == expected_sr
        self.assertEqual(found, expected)
Exemplo n.º 27
0
    def __getitem__(self, idx):
        """Return the segments and system name of wav ``idx``.

        The file ``self.wav_dir / self.wav_list[idx]`` is loaded with the
        channels/rate/norm effects, flattened, and split by
        ``unfold_segments`` using ``self.segments_durations``.  The system
        name is the part of the file name before the first "_".
        """
        effects = [["channels", "1"], ["rate", "16000"], ["norm"]]
        wav_name = self.wav_list[idx]
        loaded, _ = apply_effects_file(str(self.wav_dir / wav_name), effects)

        flat_wav = loaded.view(-1)
        wav_segments = unfold_segments(flat_wav, self.segments_durations)
        system_name = wav_name.name.split("_")[0]

        return wav_segments, system_name
Exemplo n.º 28
0
    def __getitem__(self, idx):
        """Return the segments and rating fields of dataframe row ``idx``.

        The row is unpacked as ``(wav_name, mean, mos, judge_id)``.  The wav
        is loaded from the "Converted_speech_of_submitted_systems" folder
        under ``self.base_path`` with the channels/rate/norm effects,
        flattened, and split by ``unfold_segments``.  The system name is the
        first three characters of ``wav_name`` plus characters ``[-8:-4]``.

        Returns:
            ``(wav_segments, mean, system_name, mos, judge_id)``.
        """
        wav_name, mean, mos, judge_id = self.dataframe.loc[idx]
        submissions_dir = self.base_path / "Converted_speech_of_submitted_systems"
        effects = [["channels", "1"], ["rate", "16000"], ["norm"]]
        loaded, _ = apply_effects_file(str(submissions_dir / wav_name), effects)

        flat_wav = loaded.view(-1)
        wav_segments = unfold_segments(flat_wav, self.segments_durations)
        system_name = wav_name[:3] + wav_name[-8:-4]

        return wav_segments, mean, system_name, mos, judge_id
Exemplo n.º 29
0
def path2segment(filepath, src_dur, tgt_dur, offset):
    """Load a ``tgt_dur``-long piece of ``filepath`` at a randomly shifted start.

    The file is loaded with the channels/rate/norm effects, padded by
    ``tgt_dur`` on both sides, and trimmed to ``tgt_dur`` starting at
    ``tgt_dur + offset + shift``, where ``shift`` is drawn uniformly from
    ``[0, src_dur - tgt_dur]``.

    Returns:
        The tensor returned by ``apply_effects_file`` (no squeeze applied).
    """
    random_shift = random.uniform(0, src_dur - tgt_dur)
    # The leading tgt_dur accounts for the padding added in front.
    trim_start = tgt_dur + offset + random_shift
    effects = [
        ["channels", "1"],
        ["rate", "16000"],
        ["norm"],
        ["pad", f"{tgt_dur}", f"{tgt_dur}"],
        ["trim", f"{trim_start}", f"{tgt_dur}"],
    ]
    segment = apply_effects_file(str(filepath), effects)[0]
    return segment
Exemplo n.º 30
0
 def __getitem__(self, idx):
     """Return the sliding-window segments of item ``idx``.

     Items with ``idx < self.n_queries`` are read from the
     ``<split>_queries`` folder, all others from ``Audio``; the suffix is
     forced to ".wav".  The file is loaded with a channels/rate/norm chain,
     a ``vad`` + ``reverse`` pair applied twice, and a final ``pad 0 3``.
     The squeezed waveform is then unfolded into windows of 48000 entries
     taken every 12000 entries.

     Returns:
         ``(segments, number of segments, audio_name)``.
     """
     audio_name = self.data[idx]
     if idx < self.n_queries:
         audio_path = self.dataset_root / f"{self.split}_queries" / audio_name
     else:
         audio_path = self.dataset_root / "Audio" / audio_name
     audio_path = audio_path.with_suffix(".wav")

     vad_effect = ["vad", "-T", "0.25", "-p", "0.1"]
     effects = [
         ["channels", "1"],
         ["rate", "16000"],
         ["norm"],
         list(vad_effect),
         ["reverse"],
         list(vad_effect),
         ["reverse"],
         ["pad", "0", "3"],
     ]
     waveform, _ = apply_effects_file(str(audio_path), effects)

     windows = waveform.squeeze(0).unfold(0, 48000, 12000)
     segments = windows.unbind(0)
     return segments, len(segments), audio_name