예제 #1
0
def resample_folder(input_folder, output_folder, fs, regex):
    """Resample every audio file under ``input_folder`` matching ``regex``
    to sample rate ``fs``, mirroring the directory tree in ``output_folder``.

    Args:
        input_folder (str): root folder searched for audio files.
        output_folder (str): root folder where resampled files are written.
        fs (int): target sample rate in Hz.
        regex (str): pattern passed to ``get_all_files`` to select files.
    """
    files = get_all_files(input_folder, match_and=[regex])
    torchaudio.initialize_sox()
    try:
        for f in tqdm.tqdm(files):
            # We use sox because torchaudio.Resample uses too much RAM.
            resample = torchaudio.sox_effects.SoxEffectsChain()
            resample.append_effect_to_chain("rate", [fs])
            resample.set_input_file(f)

            # Bug fix: the original assigned the returned rate to ``fs``
            # itself, clobbering the target-rate parameter for later files.
            audio, out_fs = resample.sox_build_flow_effects()

            # Peak-normalize, otherwise you can get an empty .wav file.
            # Guard against an all-zero signal (peak == 0 -> NaNs).
            peak = torch.max(torch.abs(audio), dim=-1, keepdim=True)[0]
            audio = audio / peak.clamp(min=1e-12)

            out_path = os.path.join(output_folder,
                                    Path(f).relative_to(Path(input_folder)))
            os.makedirs(Path(out_path).parent, exist_ok=True)
            torchaudio.save(out_path, audio, out_fs)
    finally:
        # Always release the sox backend, even if one file fails.
        torchaudio.shutdown_sox()
예제 #2
0
def main():
    """Smoke-test the FMA data pipeline: print environment info, then
    stream every batch through the loader, moving tensors to the device."""
    print('PyTorch', setup.torch_version())
    print('CUDA is available:', setup.cuda_is_available())
    print('CUDA device count:', setup.cuda_device_count())

    directory = 'fma_small'
    batch_size = 8
    num_workers = 8
    dataset = FMA(directory)
    loader = setup.load(dataset, batch_size, num_workers)

    device = setup.device()

    model = setup.parallel(Example())
    model.to(device)

    total = len(dataset)  # idiomatic: len() instead of dataset.__len__()
    torchaudio.initialize_sox()
    try:
        count = 0
        for sound, genre in loader:
            # Bug fix: Tensor.to() is NOT in-place -- the original discarded
            # the result, so the data never actually moved to the device.
            sound = sound.to(device)
            genre = genre.to(device)
            count = min(count + batch_size, total)
            print('Loaded', count, '/', total)
        print('Done')
    finally:
        # Release the sox backend even if iteration fails.
        torchaudio.shutdown_sox()
예제 #3
0
    def __call__(self, wav=None, sr=None):
        """Randomly perturb speed and pitch of a mono waveform via sox.

        Args:
            wav (np.ndarray): 1-D waveform, int16 or float32.
            sr (int): sample rate in Hz.

        Returns:
            dict: ``{'wav': ..., 'sr': sr}``. The original wav is returned
            unchanged when augmentation is skipped (probability) or fails.
        """
        assert len(wav.shape) == 1
        _wav = None
        try:
            if random.random() < self.prob:
                speed_alpha = 1.0 + self.speed_limit * random.uniform(-1, 1)
                pitch_alpha = self.pitch_limit * random.uniform(-1, 1) * 100  # in cents
                #  https://github.com/carlthome/python-audio-effects/blob/master/pysndfx/dsp.py#L531
                with NamedTemporaryFile(suffix=".wav",
                                        dir=tempfile_dir) as temp_file:
                    temp_filename = temp_file.name
                    # Always feed int16 to sox.
                    # Bug fix: compare against the dtype *class* np.float32.
                    # The original compared to the scalar np.float32(), which
                    # never equals a dtype, so float input was written to
                    # disk unconverted.
                    if wav.dtype == np.float32:
                        wav_write(temp_filename, sr, float2int(wav))
                    else:
                        wav_write(temp_filename, sr, wav)

                    torchaudio.initialize_sox()
                    try:
                        effects = torchaudio.sox_effects.SoxEffectsChain()
                        effects.append_effect_to_chain('pitch', pitch_alpha)
                        effects.append_effect_to_chain('tempo', [speed_alpha])
                        effects.append_effect_to_chain('rate', sr)
                        effects.set_input_file(temp_filename)
                        _wav, _sr = effects.sox_build_flow_effects()
                    finally:
                        # Always release the backend, even on sox failure.
                        torchaudio.shutdown_sox()
                    _wav = _wav.numpy().squeeze()
                    assert sr == _sr
                    # Always return float output.
                    if _wav.dtype == np.int16:
                        _wav = int2float(_wav)
        except Exception as e:
            # Best-effort augmentation: log and fall through to the raw wav.
            print(str(e))

        if _wav is not None:
            return {'wav': _wav, 'sr': sr}
        else:
            return {'wav': wav, 'sr': sr}
예제 #4
0
    def sox_build_flow_effects(self,
                               out: Optional[Tensor] = None
                               ) -> Tuple[Tensor, int]:
        r"""Build effects chain and flow effects from input file to output tensor

        Args:
            out (Tensor, optional): Where the output will be written to. (Default: ``None``)

        Returns:
            Tuple[Tensor, int]: An output Tensor of size `[C x L]` or `[L x C]` where L is the number
            of audio frames and C is the number of channels. An integer which is the sample rate of the
            audio (as listed in the metadata of the file)
        """
        # initialize output tensor
        if out is not None:
            torchaudio.check_input(out)
        else:
            out = torch.FloatTensor()
        # sox requires at least one effect in the chain, so install a
        # no-op placeholder when the caller added none.
        if not len(self.chain):
            e = SoxEffect()
            e.ename = "no_effects"
            e.eopts = [""]
            self.chain.append(e)

        # print("effect options:", [x.eopts for x in self.chain])

        # Make sure the sox backend is up before flowing effects; the C
        # extension fills `out` in place and returns the file's sample rate.
        torchaudio.initialize_sox()
        import _torch_sox
        sr = _torch_sox.build_flow_effects(self.input_file, out,
                                           self.channels_first,
                                           self.out_siginfo, self.out_encinfo,
                                           self.filetype, self.chain,
                                           self.MAX_EFFECT_OPTS)

        # Apply the normalization policy configured on this chain.
        # NOTE(review): presumably scales `out` in place -- confirm against
        # torchaudio._audio_normalization.
        torchaudio._audio_normalization(out, self.normalization)

        return out, sr
    def process(self):
        """Process the VCC2016 data if it doesn't exist in processed_folder already.

        Extracts the zip archive, runs WORLD feature extraction on every
        audio file, and saves per-speaker chunks of per-frame features
        (spectrum, aperiodicity, f0, energy, speaker label) as torch files.
        """
        import zipfile

        if self._check_exists():
            return

        # Absolute locations of the raw extraction area and processed output.
        raw_abs_dir = os.path.join(self.root, self.raw_folder)
        processed_abs_dir = os.path.join(self.root, self.processed_folder)
        dset_abs_path = os.path.join(self.root, self.raw_folder,
                                     self.dset_path)

        # Create output folders; an already-existing folder is fine.
        try:
            os.makedirs(os.path.join(self.root, self.processed_folder))
            os.makedirs(os.path.join(self.root, self.raw_folder))
        except OSError as e:
            if e.errno == errno.EEXIST:
                pass
            else:
                raise

        # Stage the local zip into the raw folder (only once), then extract.
        zip_path = self.zip_path
        print('Unzipping', zip_path)
        filename = zip_path.rpartition('/')[2]
        file_path = os.path.join(self.root, self.raw_folder, filename)
        if not os.path.isfile(file_path):
            shutil.copy2(zip_path, file_path)

        if not os.path.exists(dset_abs_path):
            with zipfile.ZipFile(file_path) as zip_f:
                zip_f.extractall(raw_abs_dir)
        else:
            print("Using existing raw folder")
        if not self.dev_mode:
            os.unlink(file_path)

        # process and save as torch files
        torchaudio.initialize_sox()
        print('Processing...')
        shutil.copyfile(os.path.join(dset_abs_path, "README"),
                        os.path.join(processed_abs_dir, "VCC2016_README"))
        audios = make_manifest(dset_abs_path)
        self.ids = load_ids(dset_abs_path)
        print("Found {} audio files".format(len(audios)))

        print('Extracting WORLD features.')
        # Per-frame feature accumulators for the chunk currently being built.
        spectras = []
        aperiodicities = []
        f0s = []
        energies = []
        labels = []
        chunk_id = 0
        samples = 0  # running count of frames across all speakers
        self.speaker_offset_idx = {}  # speaker id -> first frame index
        self.chunk_indices = {}  # chunk id -> (first, last) frame index
        current_chunk_start_idx = 0
        prev_speaker = -1  # sentinel: no speaker seen yet
        for f in audios:
            # Speaker name is the parent directory of the audio file.
            speaker = f.split("/", -1)[-2]
            spectra, aperiodicity, f0_, energy = read_audio_and_extract_features(
                f, trim_silence=self.trim_silence)

            # New speaker, save current chunk and start a fresh chunk
            # NOTE(review): the first speaker never gets an entry in
            # speaker_offset_idx (its offset is implicitly 0) -- confirm
            # downstream readers treat a missing key as offset 0.
            if prev_speaker != -1 and speaker != prev_speaker:
                self.speaker_offset_idx[self.ids[speaker]] = samples
                print('Speaker {}: start idx: {}'.format(speaker, samples))
                self.chunk_indices[chunk_id] = (current_chunk_start_idx,
                                                samples - 1)
                prev_speaker = speaker
                current_chunk_start_idx = samples

                self.save_WORLD_chunk(chunk_id, spectras, aperiodicities, f0s,
                                      energies, labels)
                chunk_id += 1
                spectras = []
                aperiodicities = []
                f0s = []
                energies = []
                labels = []
            elif prev_speaker == -1:
                prev_speaker = speaker

            # Add each spectral frame as a separate datapoint
            for i in range(spectra.shape[0]):
                sp = torch.tensor(spectra[i]).unsqueeze(0).float()
                ap = torch.tensor(aperiodicity[i]).unsqueeze(0).float()
                f0 = torch.tensor(f0_[i]).float()
                en = torch.tensor(energy[i]).float()

                spectras.append(sp)
                aperiodicities.append(ap)
                f0s.append(f0)
                energies.append(en)
                labels.append(self.ids[speaker])

                samples += 1

        # Flush the trailing chunk (frames of the final speaker).
        if len(spectras) > 0:
            self.chunk_indices[chunk_id] = (current_chunk_start_idx,
                                            samples - 1)
            self.save_WORLD_chunk(chunk_id, spectras, aperiodicities, f0s,
                                  energies, labels)

        self._write_info(samples)

        # Compute each speaker statistics and add to the info file
        self.extract_dataset_max_min_and_speaker_profiles()

        if not self.dev_mode:
            shutil.rmtree(raw_abs_dir, ignore_errors=True)
        torchaudio.shutdown_sox()
        print('Done!')
예제 #6
0
파일: vctk.py 프로젝트: tjadamlee/audio
    def download(self):
        """Download the VCTK data and cache it as chunked ``.pt`` files.

        Skips work already done: an existing archive is not re-downloaded,
        an existing raw folder is not re-extracted, and the whole method is
        a no-op when processed data is present.
        """
        from six.moves import urllib
        import tarfile

        if self._check_exists():
            return

        raw_abs_dir = os.path.join(self.root, self.raw_folder)
        processed_abs_dir = os.path.join(self.root, self.processed_folder)
        dset_abs_path = os.path.join(self.root, self.raw_folder,
                                     self.dset_path)

        # Create target folders; tolerate their pre-existence.
        try:
            os.makedirs(os.path.join(self.root, self.processed_folder))
            os.makedirs(os.path.join(self.root, self.raw_folder))
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

        url = self.url
        print('Downloading ' + url)
        filename = url.rpartition('/')[2]
        file_path = os.path.join(self.root, self.raw_folder, filename)
        if not os.path.isfile(file_path):
            urllib.request.urlretrieve(url, file_path)
        if not os.path.exists(dset_abs_path):
            with tarfile.open(file_path) as tar_f:
                # NOTE(review): extractall on an untrusted archive can write
                # outside raw_abs_dir on older Pythons; consider the
                # `filter='data'` argument where available.
                tar_f.extractall(raw_abs_dir)
        else:
            print("Using existing raw folder")
        if not self.dev_mode:
            os.unlink(file_path)

        # Process and save as torch files.
        torchaudio.initialize_sox()
        try:
            print('Processing...')
            shutil.copyfile(os.path.join(dset_abs_path, "COPYING"),
                            os.path.join(processed_abs_dir, "VCTK_COPYING"))
            audios = make_manifest(dset_abs_path)
            utterences = load_txts(dset_abs_path)
            self.max_len = 0
            print("Found {} audio files and {} utterences".format(
                len(audios), len(utterences)))
            for n in range(len(audios) // self.chunk_size + 1):
                tensors = []
                labels = []
                lengths = []
                st_idx = n * self.chunk_size
                end_idx = st_idx + self.chunk_size
                for i, f in enumerate(audios[st_idx:end_idx]):
                    txt_dir = os.path.dirname(f).replace("wav48", "txt")
                    if os.path.exists(txt_dir):
                        f_rel_no_ext = os.path.basename(f).rsplit(".", 1)[0]
                        sig = read_audio(f, downsample=self.downsample)[0]
                        tensors.append(sig)
                        lengths.append(sig.size(1))
                        labels.append(utterences[f_rel_no_ext])
                        self.max_len = max(self.max_len, sig.size(1))
                # Bug fix: when len(audios) is an exact multiple of
                # chunk_size the final chunk is empty and the zip(*...)
                # unpack below raised ValueError; skip empty chunks.
                if not tensors:
                    continue
                # Sort sigs/labels: longest -> shortest.
                tensors, labels = zip(
                    *[(b, c) for (a, b, c) in sorted(zip(lengths, tensors,
                                                         labels),
                                                     key=lambda x: x[0],
                                                     reverse=True)])
                data = (tensors, labels)
                torch.save(
                    data,
                    os.path.join(self.root, self.processed_folder,
                                 "vctk_{:04d}.pt".format(n)))
            # Bug fix: the original wrote (n * chunk_size) + i + 1 using
            # stale loop variables; for a non-empty final chunk that equals
            # len(audios), so record the total explicitly and robustly.
            self._write_info(len(audios))
            if not self.dev_mode:
                shutil.rmtree(raw_abs_dir, ignore_errors=True)
        finally:
            # Always release the sox backend, even if processing fails.
            torchaudio.shutdown_sox()
        print('Done!')
예제 #7
0
 def setUpClass(cls):
     """Initialize the sox backend once before any test in this class runs."""
     # NOTE(review): presumably a matching torchaudio.shutdown_sox() lives
     # in tearDownClass -- confirm, or the backend stays initialized.
     torchaudio.initialize_sox()
    def process(self):
        """Process the VCTK data if it doesn't exist in processed_folder already.

        Extracts the zip archive, reads every audio file, and saves the
        corpus in fixed-size chunks of (signals, speaker labels) tuples as
        torch files, each chunk sorted longest-to-shortest.
        """
        import zipfile

        if self._check_exists():
            return

        raw_abs_dir = os.path.join(self.root, self.raw_folder)
        processed_abs_dir = os.path.join(self.root, self.processed_folder)
        dset_abs_path = os.path.join(self.root, self.raw_folder,
                                     self.dset_path)

        # Create output folders; an already-existing folder is fine.
        try:
            os.makedirs(os.path.join(self.root, self.processed_folder))
            os.makedirs(os.path.join(self.root, self.raw_folder))
        except OSError as e:
            if e.errno == errno.EEXIST:
                pass
            else:
                raise

        # Stage the local archive into the raw folder, then extract it.
        zip_path = self.zip_path
        print('Unzipping', zip_path)
        filename = zip_path.rpartition('/')[2]
        file_path = os.path.join(self.root, self.raw_folder, filename)
        if not os.path.isfile(file_path):
            shutil.copy2(zip_path, file_path)

        if not os.path.exists(dset_abs_path):
            with zipfile.ZipFile(file_path) as zip_f:
                zip_f.extractall(raw_abs_dir)
        else:
            print("Using existing raw folder")
        if not self.dev_mode:
            os.unlink(file_path)

        # process and save as torch files
        torchaudio.initialize_sox()
        print('Processing...')
        shutil.copyfile(os.path.join(dset_abs_path, "COPYING"),
                        os.path.join(processed_abs_dir, "VCTK_COPYING"))
        audios = make_manifest(dset_abs_path, self.shuffle_order)
        self.ids = load_ids(dset_abs_path)
        self.max_len = 0
        all_lengths = []  # every signal length, for mean/std stats below
        print("Found {} audio files".format(len(audios)))
        # NOTE(review): when len(audios) is an exact multiple of chunk_size
        # the final iteration has an empty chunk and the zip(*...) unpack
        # below raises ValueError -- confirm this cannot occur here.
        for n in range(len(audios) // self.chunk_size + 1):
            tensors = []
            labels = []
            lengths = []
            st_idx = n * self.chunk_size
            end_idx = st_idx + self.chunk_size
            for i, f in enumerate(audios[st_idx:end_idx]):
                f_rel_no_ext = os.path.basename(f).rsplit(".", 1)[0]
                sig = read_audio(f,
                                 downsample=self.downsample,
                                 trim_silence=self.trim_silence)[0]
                tensors.append(sig)
                lengths.append(sig.size(1))
                # Speaker id is the file-name prefix before the first '_'.
                labels.append(self.ids[f_rel_no_ext.split('_')[0]])
                self.max_len = sig.size(
                    1) if sig.size(1) > self.max_len else self.max_len
                all_lengths.append(sig.size(1))
            # sort sigs/labels: longest -> shortest
            tensors, labels = zip(
                *[(b, c) for (a, b, c) in sorted(zip(lengths, tensors, labels),
                                                 key=lambda x: x[0],
                                                 reverse=True)])
            data = (tensors, labels)
            torch.save(
                data,
                os.path.join(self.root, self.processed_folder,
                             "vctk_{:04d}.pt".format(n)))
        self.mean_len = np.mean(all_lengths)
        self.std_len = np.std(all_lengths, ddof=1)
        # NOTE(review): reuses stale loop variables n and i; this equals
        # len(audios) only when the final chunk was non-empty.
        self._write_info((n * self.chunk_size) + i + 1)
        if not self.dev_mode:
            shutil.rmtree(raw_abs_dir, ignore_errors=True)
        torchaudio.shutdown_sox()
        print('Done!')
예제 #9
0
    def __call__(self,
                 audio_path,
                 sample_rate,
                 normalize=True,
                 defaults=dict(pitch=[-300, 300],
                               tempo=[0.8, 1.2],
                               gain=[-10, 10]),
                 tmpdir='/dev/shm'):
        """Load an audio file, optionally applying one random sox effect.

        Decodes via one of three code paths selected by ``self.bug``
        ('SoxEffectsChain', 'as_tensor', or the numpy fallback) and returns
        ``(signal, sample_rate)``.
        """
        # NOTE(review): `defaults` is a mutable default argument; it is only
        # read here, but callers must never mutate it.
        effect = None
        # A transform given as a bare string is paired with its default range.
        tuple_if_str = lambda t: (t, defaults.get(t)) if isinstance(t, str
                                                                    ) else t
        if self.transforms and random.random() < self.prob:
            transform = tuple_if_str(random.choice(self.transforms))
            # effect becomes [name, sampled_value] for effects with a range
            # in `defaults`, the bare name (e.g. 'transcode_mp3') for other
            # strings, or [] when `transform` is not str/tuple (it is then
            # applied directly at the bottom of this method).
            effect = (
                [transform[0], fixed_or_choice(transform[1])]
                if transform[0] in defaults else transform[0]) if isinstance(
                    transform, tuple) else []

        tmp_audio_path = []
        if effect and isinstance(effect,
                                 str) and effect.startswith('transcode'):
            # Round-trip the audio through the codec named after the
            # underscore using the sox CLI, writing into tmpdir.
            codec = effect.split('_')[1]
            tmp_audio_path = [
                tempfile.mkstemp(suffix='.' + codec, dir=tmpdir)[1],
                tempfile.mkstemp(suffix='.wav', dir=tmpdir)[1]
            ]
            subprocess.check_call([
                'sox', '-V0', audio_path, '-t', codec, '-r',
                str(sample_rate), tmp_audio_path[0]
            ])
            if self.bug == 'SoxEffectsChain':
                # The chain path needs a wav input, so transcode back.
                subprocess.check_call([
                    'sox', '-V0', tmp_audio_path[0], '-t', 'wav',
                    tmp_audio_path[1]
                ])
                audio_path = tmp_audio_path[1]
            else:
                audio_path = tmp_audio_path[0]
            effect = None

        if self.bug == 'SoxEffectsChain':
            # Decode via torchaudio's (legacy) sox effects chain.
            torchaudio.initialize_sox()
            sox = torchaudio.sox_effects.SoxEffectsChain()
            if effect:
                sox.append_effect_to_chain(*effect)
            sox.append_effect_to_chain('channels', 1)
            sox.append_effect_to_chain('rate', sample_rate)
            sox.set_input_file(audio_path)
            signal, sample_rate_ = sox.sox_build_flow_effects()
            signal = signal[0]
            sox.clear_chain()
            torchaudio.shutdown_sox()

        elif self.bug == 'as_tensor':
            # Decode raw 16-bit mono PCM from the sox CLI into int16.
            signal, sample_rate_ = torch.as_tensor(
                bytearray(
                    subprocess.check_output([
                        'sox', '-V0', audio_path, '-b', '16', '-e', 'signed',
                        '--endian', 'little', '-r',
                        str(sample_rate), '-c', '1', '-t', 'raw', '-'
                    ] + ([effect[0], str(effect[1])] if effect else []))),
                dtype=torch.int16), sample_rate

        else:
            # Same sox CLI decode, but via numpy into a float32 tensor.
            signal, sample_rate_ = torch.from_numpy(
                np.frombuffer(subprocess.check_output([
                    'sox', '-V0', audio_path, '-b', '16', '-e', 'signed',
                    '--endian', 'little', '-r',
                    str(sample_rate), '-c', '1', '-t', 'raw', '-'
                ] + ([effect[0], str(effect[1])] if effect else [])),
                              dtype=np.int16)).to(torch.float32), sample_rate

        # Clean up any transcode temp files.
        for audio_path in tmp_audio_path:
            os.remove(audio_path)
        if sample_rate is not None and sample_rate_ != sample_rate:
            signal, sample_rate_ = dataset.resample(signal, sample_rate_,
                                                    sample_rate)
        if normalize:
            signal = models.normalize_signal(signal)
        # effect == [] marks a non-str/tuple transform: apply it directly.
        if effect == []:
            signal, sample_rate = transform(signal, sample_rate)
        return signal, sample_rate
예제 #10
0
def initialize_sox():
    """Initialize the sox backend, guarding against repeat initialization."""
    global _IS_SOX_INITIALIZED
    if _IS_SOX_INITIALIZED:
        return
    torchaudio.initialize_sox()
    _IS_SOX_INITIALIZED = True
예제 #11
0
    def process(self):
        """Process the VCC2016 data if it doesn't exist in processed_folder already.

        Extracts the zip archive, splits every audio file into equal-length
        segments of ``self.sample_length``, and saves all segments plus
        speaker labels as a single torch chunk file.
        """
        import zipfile

        if self._check_exists():
            return

        raw_abs_dir = os.path.join(self.root, self.raw_folder)
        processed_abs_dir = os.path.join(self.root, self.processed_folder)
        dset_abs_path = os.path.join(self.root, self.raw_folder,
                                     self.dset_path)

        # Create output folders; an already-existing folder is fine.
        try:
            os.makedirs(os.path.join(self.root, self.processed_folder))
            os.makedirs(os.path.join(self.root, self.raw_folder))
        except OSError as e:
            if e.errno == errno.EEXIST:
                pass
            else:
                raise

        # Stage the local archive into the raw folder, then extract it.
        zip_path = self.zip_path
        print('Unzipping', zip_path)
        filename = zip_path.rpartition('/')[2]
        file_path = os.path.join(self.root, self.raw_folder, filename)
        if not os.path.isfile(file_path):
            shutil.copy2(zip_path, file_path)

        if not os.path.exists(dset_abs_path):
            with zipfile.ZipFile(file_path) as zip_f:
                zip_f.extractall(raw_abs_dir)
        else:
            print("Using existing raw folder")
        if not self.dev_mode:
            os.unlink(file_path)

        # process and save as torch files
        torchaudio.initialize_sox()
        print('Processing...')
        shutil.copyfile(os.path.join(dset_abs_path, "README"),
                        os.path.join(processed_abs_dir, "VCC2016_README"))
        audios = make_manifest(dset_abs_path)
        self.ids = load_ids(dset_abs_path)
        print("Found {} audio files".format(len(audios)))

        print('Splitting samples to length {}'.format(self.sample_length))
        tensors = []
        labels = []
        chunk_id = 0
        samples = 0  # total number of fixed-length segments produced
        self.speaker_offset_idx = {}  # speaker id -> first segment index
        prev_speaker = -1  # sentinel: no speaker seen yet
        for f in audios:
            # Speaker name is the parent directory of the audio file.
            speaker = f.split("/", -1)[-2]
            sig, _ = read_audio(f, trim_silence=self.trim_silence)

            # New speaker, save current chunk and start a fresh chunk
            if prev_speaker == -1 or speaker != prev_speaker:
                self.speaker_offset_idx[self.ids[speaker]] = samples
                print('Speaker {}: start idx: {}'.format(speaker, samples))
                prev_speaker = speaker

            length = sig.size(1)
            # Cut the end of the sample if its too long to be equally split.
            if length % self.sample_length > 0:
                sig = sig[:, :length - (length % self.sample_length)]

            # Split samples
            sigs = sig.view(-1, self.sample_length)
            for sig in sigs:
                sig = sig.unsqueeze(0)
                tensors.append(sig)
                labels.append(self.ids[speaker])
                # NOTE(review): self.max_len is read before it is ever
                # assigned in this method -- it must be initialized elsewhere
                # (e.g. in __init__) or this raises AttributeError.
                self.max_len = sig.size(
                    1) if sig.size(1) > self.max_len else self.max_len
                samples += 1
                # Save to chunk-file
                # if len(tensors) == self.chunk_size:
                #     self.save_chunk(chunk_id, lengths, tensors, labels)
                #     chunk_id += 1
                #     tensors = []
                #     labels = []

        # Save all to one chunk-file
        if len(tensors) > 0:
            self.save_raw_chunk(chunk_id, tensors, labels)

        self._write_info(samples)

        if not self.dev_mode:
            shutil.rmtree(raw_abs_dir, ignore_errors=True)
        torchaudio.shutdown_sox()
        print('Done!')
예제 #12
0
    parser.add_argument('--backend', default='nccl', type=str)
    parser.add_argument('--init-method', default='env://', type=str)
    parser.add_argument('--local-rank', '--local_rank', '--gpu', default=0, type=int)
    parser.add_argument('--sync-bn', action='store_true', default=False)

    # F16 training
    parser.add_argument('--opt-level', default='O0', type=str, choices=['O0', 'O1', 'O2', 'O3'])
    parser.add_argument('--keep-batchnorm-fp32', default=None, action='store_true')
    parser.add_argument('--loss-scale', type=str, default=None)

    parser.add_argument('--verbosity', '-v', action='count', default=0)

    args = parser.parse_args()

    # Initialize sox
    torchaudio.initialize_sox()

    args.world_size = 1

    # Pin GPU to be used to process local rank (one GPU per process)
    torch.cuda.set_device(args.local_rank)

    if args.deterministic:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed: