def parse_audio(audio_path: str, audio_extension: str = 'pcm') -> Tensor:
    """Load an audio file and return its dB-scaled mel spectrogram.

    Args:
        audio_path: Path passed to ``load_audio``.
        audio_extension: Forwarded to ``load_audio`` as ``extension``.

    Returns:
        A ``torch.FloatTensor`` built from the dB-scaled mel spectrogram
        (80 mel bands, FFT size 320, hop length 160, sample rate 16000).
    """
    sound = load_audio(audio_path, extension=audio_extension)

    # The signal is passed as ``y=``: librosa >= 0.10 made the arguments of
    # ``melspectrogram`` keyword-only, so the positional form raises
    # TypeError there. The keyword form also works on older releases.
    melspectrogram = librosa.feature.melspectrogram(y=sound,
                                                    sr=16000,
                                                    n_mels=80,
                                                    n_fft=320,
                                                    hop_length=160)
    # NOTE(review): melspectrogram yields a power spectrogram by default, for
    # which librosa offers power_to_db. amplitude_to_db is kept on purpose so
    # the feature scaling stays what existing models saw -- confirm intent.
    log_melspectrogram = librosa.amplitude_to_db(melspectrogram)

    return torch.FloatTensor(log_melspectrogram)
Exemplo n.º 2
0
 def __init__(self, *args, **kwargs):
     """
     Dataset variant that groups its utterances into length buckets.

     After the parent initialiser has run, every audio file referenced by
     ``self.ids`` is loaded once to measure its length. Bucket boundaries
     come from numpy's histogram (``bins="auto"``), and each sample index is
     filed under its bucket in ``self.bins_to_samples`` so that a sampler
     (BucketingSampler) can draw utterances from the same bin.
     """
     super(SpectrogramDatasetWithLength, self).__init__(*args, **kwargs)

     # One length per (path, _) entry; only the path is needed here.
     lengths = [len(load_audio(path)) for path, _ in self.ids]

     # Only the bin edges are used; the histogram counts are discarded.
     _, edges = np.histogram(lengths, bins="auto")
     bucket_of_sample = np.digitize(lengths, bins=edges)

     buckets = defaultdict(list)
     for sample_idx, bucket in enumerate(bucket_of_sample):
         buckets[bucket].append(sample_idx)
     self.bins_to_samples = buckets
Exemplo n.º 3
0
""" 
This file injects noise into the training data to increase robustness.

"""


import argparse

import torch
import torchaudio

from data.data_loader import load_audio, NoiseInjection

# Command-line options for mixing a noise recording into an input recording.
parser = argparse.ArgumentParser()
parser.add_argument('--input-path', default='input.wav', help='The input audio to inject noise into')
parser.add_argument('--noise-path', default='noise.wav', help='The noise file to mix in')
parser.add_argument('--output-path', default='output.wav', help='Where to save the mixed audio')
# type=int: without it, a rate given on the command line stays a string and
# is handed to torchaudio.save as text instead of a number.
parser.add_argument('--sample-rate', type=int, default=16000, help='Sample rate to save output as')
parser.add_argument('--noise-level', type=float, default=1.0,
                    help='The Signal to Noise ratio (higher means more noise)')
args = parser.parse_args()

# Load the input, mix the noise file into it, and save the result.
noise_injector = NoiseInjection()
data = load_audio(args.input_path)
mixed_data = noise_injector.inject_noise_sample(data, args.noise_path, args.noise_level)
mixed_data = torch.tensor(mixed_data, dtype=torch.float).unsqueeze(1)  # Add channels dim
torchaudio.save(args.output_path, mixed_data, args.sample_rate)
print('Saved mixed file to %s' % args.output_path)
Exemplo n.º 4
0
import argparse

import torch
import torchaudio

from data.data_loader import load_audio, NoiseInjection

# Command-line options for mixing a noise recording into an input recording.
parser = argparse.ArgumentParser()
parser.add_argument('--input-path', default='input.wav', help='The input audio to inject noise into')
parser.add_argument('--noise-path', default='noise.wav', help='The noise file to mix in')
parser.add_argument('--output-path', default='output.wav', help='Where to save the mixed audio')
# type=int: without it, a rate given on the command line stays a string and
# is handed to torchaudio.save as text instead of a number.
parser.add_argument('--sample-rate', type=int, default=16000, help='Sample rate to save output as')
parser.add_argument('--noise-level', type=float, default=1.0,
                    help='The Signal to Noise ratio (higher means more noise)')
args = parser.parse_args()

# Load the input, mix the noise file into it, and save the result.
noise_injector = NoiseInjection()
data = load_audio(args.input_path)
mixed_data = noise_injector.inject_noise_sample(data, args.noise_path, args.noise_level)
mixed_data = torch.tensor(mixed_data, dtype=torch.float).unsqueeze(1)  # Add channels dim
torchaudio.save(args.output_path, mixed_data, args.sample_rate)
print('Saved mixed file to %s' % args.output_path)
Exemplo n.º 5
0
def _process_score(score_path, samples, args, labels, logger):
    """Turn one kern score into audio/label chunks and count the failures.

    The score is cleaned, saved, split into chunks, and each chunk is run
    through ``tiefix``, optionally synthesized to FLAC (hum2mid ->
    fluidsynth -> ffmpeg), loaded, and label-encoded. Chunks that pass every
    step are appended to ``samples`` as ``[audio_path, seq_path, duration]``.

    Args:
        score_path: Score location relative to ``args.data_dir``; must offer
            ``.parent`` and ``.name`` (presumably a ``pathlib.Path``).
        samples: List-like collector that receives the accepted chunks.
        args: Options object; reads ``data_dir``, ``out_dir``,
            ``remove_splits``, ``instruments``, ``chunk_sizes``,
            ``train_stride``, ``tempo_scaling``, ``resynthesize``,
            ``sample_rate``, ``soundfont``, ``bit_rate``, ``max_duration``
            and ``min_duration_symbol``.
        labels: Encoder providing ``encode`` and ``ctclen``.
        logger: Logger used for all diagnostics.

    Returns:
        dict mapping each error-counter name to the number of failures of
        that kind seen for this score.
    """
    # Errors counters, for debug.
    errors = {
        # Per file:
        'clean_errors': 0,
        'split_errors': 0,
        # Per chunk:
        'tiefix_errors': 0,
        'hum2mid_errors': 0,
        'audio_errors': 0,
        'krnseq_errors': 0,
        'double_symbol_errors': 0,
        'encoding_errors': 0,
        'length_errors': 0,
    }

    # Remove grace notes, ornaments, etc...
    kern = Kern(Path(args.data_dir) / score_path,
                remove_splits=args.remove_splits)
    kern.spines.override_instruments(args.instruments)
    try:
        if not kern.clean():
            logger.error(f'Cannot clean kern {score_path}')
            errors['clean_errors'] += 1
            return errors
    except Exception as e:
        logger.exception(
            f"Exception while cleaning {score_path} audio. Reason: {e}")
        errors['clean_errors'] += 1
        return errors

    root_path = Path(args.out_dir) / score_path.parent
    root_path.mkdir(parents=True, exist_ok=True)

    krn_path = Path(args.out_dir) / score_path
    krn_path_clean = krn_path.with_suffix('.clean.krn')
    kern.save(krn_path_clean)

    # Set seed to ensure same chunk sizes and tempo scaling
    np.random.seed(bytearray(score_path.name, 'utf-8'))

    try:
        kern_chunks = kern.split(args.chunk_sizes, args.train_stride)
    except Exception as e:
        logger.exception(f'Exception {e} while splitting {score_path}')
        errors['split_errors'] += 1
        return errors

    # random scale between +ts and -ts
    ts = 1 + args.tempo_scaling * (2 * np.random.rand(len(kern_chunks)) - 1)
    for i, kern in enumerate(kern_chunks):
        chunk_path = krn_path.with_suffix(f'.{i:03d}.krn')
        kern.save(chunk_path)

        # Fix ties with tiefix command
        process = subprocess.run(['tiefix', chunk_path],
                                 capture_output=True,
                                 encoding='iso-8859-1')
        if (process.returncode != 0):
            logger.error(
                f"tiefix error={process.returncode} on {chunk_path}")
            logger.error(process.stdout)
            errors['tiefix_errors'] += 1
            continue

        # Re-parse the tiefix output and overwrite the chunk file with it.
        kern = Kern(data=process.stdout, remove_splits=args.remove_splits)
        kern.save(chunk_path)

        audio_path = chunk_path.with_suffix('.flac')

        if args.resynthesize or not audio_path.exists():
            mid_path = chunk_path.with_suffix('.mid')
            # Tempo and instrumment extracted from *MM and *I indications
            status = os.system(
                f'hum2mid {str(chunk_path)} -C -v 100 -t {ts[i]} -o {str(mid_path)} >/dev/null 2>&1'  # noqa E501
            )
            if (os.WEXITSTATUS(status) != 0):
                logger.error(f"hum2mid error={status} on {chunk_path}")
                errors['hum2mid_errors'] += 1
                continue

            # The exit status of this pipeline is not inspected; a failed
            # synthesis surfaces below when the audio file cannot be loaded.
            status = os.system(
                f'fluidsynth --sample-rate={args.sample_rate} -O s16 -T raw -i -l -F - {args.soundfont} {str(mid_path)} | '  # noqa E501
                f'ffmpeg -y -f s16le -ar {args.sample_rate} -ac 2 -i pipe: '  # noqa E501
                f'-ar {args.sample_rate} -ac 1 -ab {args.bit_rate} -strict -2 {str(audio_path)} 2>/dev/null'  # noqa E501
            )

        try:
            y = load_audio(str(audio_path))
        except Exception as e:
            logger.exception(
                f"Exception while loading {chunk_path} audio. Reason: {e}")
            errors['audio_errors'] += 1
            continue

        duration = len(y) / args.sample_rate

        try:
            krnseq = kern.tosequence()
        except Exception as e:
            logger.exception(f"Discarded {chunk_path} due to error in kern"
                             f" sequence conversion. Reason {e}")
            errors['krnseq_errors'] += 1
            continue

        if krnseq is None:
            logger.warning(
                f"Discarded {chunk_path} for double dots/sharps/flats")
            errors['double_symbol_errors'] += 1
            continue

        try:
            seq = labels.encode(krnseq)
        except Exception as e:
            logger.warning(f"Discarded {chunk_path} during label encoding."
                           f" Reason: {e}")
            errors['encoding_errors'] += 1
            continue

        seqlen = labels.ctclen(seq)

        krnseq_path = chunk_path.with_suffix('.krnseq')
        krnseq_path.write_text(krnseq)

        seq_path = chunk_path.with_suffix('.seq')
        with seq_path.open(mode="wb") as f:
            f.write(pickle.dumps(seq))

        # The .krnseq/.seq files are written before this check, so they
        # remain on disk even for chunks rejected here.
        if duration > args.max_duration or \
                duration < seqlen * args.min_duration_symbol:
            logger.warning(f"Sequence too long in {chunk_path} "
                           f"len={seqlen} duration={duration:.2f}")
            errors['length_errors'] += 1
            continue

        samples.append([str(audio_path), str(seq_path), duration])

    return errors


def process_sample(q, samples, args, labels, invalid_counters):
    """Worker loop: process score paths from ``q`` until a ``None`` arrives.

    For every score taken from the queue, the accepted chunks are appended
    to ``samples`` and one value per error kind is appended to the matching
    list in ``invalid_counters``.

    Counters are recorded for every score, including those rejected while
    cleaning or splitting. Previously those early exits skipped the
    recording step, so ``clean_errors`` and ``split_errors`` were never
    reported as non-zero.

    Args:
        q: Queue of score paths; ``None`` is the stop sentinel.
        samples: List-like collector of ``[audio_path, seq_path, duration]``.
        args: Options object forwarded to the per-score processing.
        labels: Encoder providing ``encode`` and ``ctclen``.
        invalid_counters: Mapping from counter name to a list that supports
            ``append``; it must contain every counter name used here.
    """
    logger = logging.getLogger('data_prep')
    while True:
        score_path = q.get()
        if score_path is None:
            break

        errors = _process_score(score_path, samples, args, labels, logger)
        for name, count in errors.items():
            invalid_counters[name].append(count)
Exemplo n.º 6
0
 def get_signal(self, path):
     """Load the audio at ``path`` and wrap it in a gradient-tracking Variable."""
     return Variable(load_audio(path), requires_grad=True)
Exemplo n.º 7
0
from data.data_loader import load_audio, NoiseInjection

# Command-line options for mixing a noise recording into an input recording.
parser = argparse.ArgumentParser()
parser.add_argument('--input-path',
                    default='input.wav',
                    help='The input audio to inject noise into')
parser.add_argument('--noise-path',
                    default='noise.wav',
                    help='The noise file to mix in')
parser.add_argument('--output-path',
                    default='output.wav',
                    help='Where to save the mixed audio')
# type=int: without it, a rate given on the command line stays a string, so
# the comparison with the loaded file's sample rate below could never match.
parser.add_argument('--sample-rate',
                    type=int,
                    default=16000,
                    help='Sample rate to save output as')
parser.add_argument('--noise-level',
                    type=float,
                    default=1.0,
                    help='The Signal to Noise ratio (higher means more noise)')
args = parser.parse_args()

noise_injector = NoiseInjection()
data, sample_rate_ = load_audio(args.input_path)
# Explicit check instead of ``assert`` so it still runs under ``python -O``.
if sample_rate_ != args.sample_rate:
    raise ValueError(
        f'Sample rate of {args.input_path} is {sample_rate_}, '
        f'expected {args.sample_rate}')
mixed_data = noise_injector.inject_noise_sample(data, args.noise_path,
                                                args.noise_level)
mixed_data = torch.tensor(mixed_data,
                          dtype=torch.float).unsqueeze(1)  # Add channels dim
torchaudio.save(args.output_path, mixed_data, args.sample_rate)
print('Saved mixed file to %s' % args.output_path)
Exemplo n.º 8
0
def combine_datasets(data_dir, dataset_config, sample_rate=22050):
    """Merge the per-dataset CSV partitions under ``data_dir`` into one each.

    One labels file matching ``*{dataset_config}/labels*.json`` is copied to
    ``labels_{group}_{dataset_config}.json``. For each of the ``test``,
    ``train`` and ``val`` partitions, every matching CSV is concatenated,
    each audio file is loaded to compute its duration, rows are sorted by
    duration, and the result is written to
    ``{partition}_{group}_{dataset_config}.csv`` inside ``data_dir``.
    ``group`` is the name of ``data_dir``.

    Args:
        data_dir: Root directory (``str`` or ``Path``) searched recursively.
        dataset_config: Configuration name used in the glob patterns and in
            the output file names.
        sample_rate: Divisor turning ``len(load_audio(...))`` into seconds.

    Raises:
        IndexError: If no labels file matches the pattern.
    """
    logger = config_logger('combine_datasets', console_level='INFO')

    if not isinstance(data_dir, Path):
        data_dir = Path(data_dir)

    group = data_dir.name

    # Label file must be the same. Just make a copy and rename.
    labels_path = [
        i for i in data_dir.rglob(f'*{dataset_config}/labels*.json')
    ][0]
    outpath = data_dir / f'labels_{group}_{dataset_config}.json'
    logger.info(f'Saving encoder labels to {outpath}')
    outpath.write_text(labels_path.read_text())

    dataset_partitions = ['test', 'train', 'val']
    extension = 'csv'
    for dataset_partition in dataset_partitions:
        logger.info(
            f'Looking for partition {dataset_partition} from configuration {dataset_config}.'
        )
        all_filenames = [
            i for i in data_dir.rglob(
                f'*{dataset_config}/{dataset_partition}*.{extension}')
        ]
        logger.info(f'Found {len(all_filenames)} files.')

        # combine all files in the list
        combined_csv = pd.concat([
            pd.read_csv(f, header=None).assign(filename=f.name)
            for f in all_filenames
        ]).rename(columns={
            0: "audio",
            1: "seq"
        })
        logger.info(combined_csv.groupby('filename')['audio'].count())
        logger.info(
            f"Combined partition {dataset_partition}: {combined_csv['audio'].count()} samples."
        )

        durations = []
        audio_errors = 0
        for audio_file in combined_csv['audio'].tolist():
            try:
                y = load_audio(str(audio_file))
                duration = len(y) / sample_rate
            except Exception as e:
                logger.exception(
                    f"Exception while loading {audio_file} audio. Reason: {e}")
                audio_errors += 1
                # Record 0 for files that fail to load so ``durations`` keeps
                # one entry per row. Skipping the append made the list
                # shorter than the frame and broke ``assign`` below.
                duration = 0
            durations.append(duration)

        combined_csv = combined_csv.assign(duration=durations)
        # SortaGrad
        combined_csv = combined_csv.sort_values(by='duration')

        total_duration = combined_csv['duration'].sum()
        logger.info(f'Total duration: {total_duration/60/60} hours.')
        logger.info(f'Found {audio_errors} errors during loading.')

        # export to csv
        outpath = data_dir / f'{dataset_partition}_{group}_{dataset_config}.{extension}'
        logger.info(f'Saving to {outpath}')
        # The helper columns are dropped so the output keeps the input layout.
        combined_csv.drop(['filename', 'duration'],
                          axis=1).to_csv(outpath,
                                         index=False,
                                         header=False,
                                         encoding='iso-8859-1')
Exemplo n.º 9
0
def process_sample(q, samples, args, labels):
    """Worker loop: turn kern scores from ``q`` into audio/label chunks.

    Score paths are taken from ``q`` until ``None`` is received. Each score
    is cleaned and split into chunks; every chunk is passed through
    ``tiefix``, optionally synthesized to FLAC (hum2mid -> fluidsynth ->
    ffmpeg), loaded, and label-encoded. Chunks that pass every step are
    appended to ``samples`` as ``[audio_path, seq_path, duration]``.
    Failures are printed and the offending score or chunk is skipped.

    Args:
        q: Queue of score paths relative to ``args.data_dir``; entries must
            offer ``.parent`` and ``.name`` (presumably ``pathlib.Path``).
            ``None`` is the stop sentinel.
        samples: List-like collector that receives the accepted chunks.
        args: Options object; reads ``data_dir``, ``out_dir``,
            ``instruments``, ``chunk_sizes``, ``train_stride``,
            ``tempo_scaling``, ``resynthesize``, ``sample_rate``,
            ``soundfont``, ``bit_rate``, ``max_duration`` and
            ``min_duration_symbol``.
        labels: Encoder providing ``encode`` and ``ctclen``.
    """
    while True:
        score_path = q.get()
        if score_path is None:
            break

        # Remove grace notes, ornaments, etc...
        kern = Kern(Path(args.data_dir) / score_path)
        kern.spines.override_instruments(args.instruments)
        try:
            if not kern.clean():
                print(f'Cannot clean kern {score_path}')
                continue
        except Exception as e:
            print(f"Exception while cleaning {score_path} audio. Reason: {e}")
            continue

        root_path = Path(args.out_dir) / score_path.parent
        root_path.mkdir(parents=True, exist_ok=True)

        krn_path = Path(args.out_dir) / score_path

        # Set seed to ensure same chunk sizes and tempo scaling
        np.random.seed(bytearray(score_path.name, 'utf-8'))

        try:
            kern_chunks = kern.split(args.chunk_sizes, args.train_stride)
        except Exception as e:
            print(f'Exception {e} while splitting {score_path}')
            continue

        # random scale between +ts and -ts
        ts = 1 + args.tempo_scaling * (2 * np.random.rand(len(kern_chunks)) -
                                       1)
        for i, kern in enumerate(kern_chunks):
            chunk_path = krn_path.with_suffix(f'.{i:03d}.krn')
            kern.save(chunk_path)

            # Fix ties with tiefix command
            process = subprocess.run(['tiefix', chunk_path],
                                     encoding='iso-8859-1',
                                     stdout=subprocess.PIPE)
            if (process.returncode != 0):
                print(f"tiefix error={process.returncode} on {chunk_path}")
                print(process.stdout)
                continue

            # Re-parse the tiefix output and overwrite the chunk file with it.
            kern = Kern(data=process.stdout)
            kern.save(chunk_path)

            audio_path = chunk_path.with_suffix('.flac')

            if args.resynthesize or not audio_path.exists():
                mid_path = chunk_path.with_suffix('.mid')
                # Tempo and instrumment extracted from *MM and *I indications
                status = os.system(
                    f'hum2mid {str(chunk_path)} -C -v 100 -t {ts[i]} -o {str(mid_path)} >/dev/null 2>&1'
                )
                if (os.WEXITSTATUS(status) != 0):
                    # Report the chunk that failed rather than the whole
                    # score path, so the message identifies the bad file.
                    print(f"hum2mid error={status} on {chunk_path}")
                    continue

                # The exit status of this pipeline is not inspected; a failed
                # synthesis surfaces below when the audio cannot be loaded.
                status = os.system(
                    f'fluidsynth --sample-rate={args.sample_rate} -O s16 -T raw -i -l -F - {args.soundfont} {str(mid_path)} | '
                    f'ffmpeg -y -f s16le -ar {args.sample_rate} -ac 2 -i pipe: '
                    f'-ar {args.sample_rate} -ac 1 -ab {args.bit_rate} -strict -2 {str(audio_path)} 2>/dev/null'
                )

            try:
                y = load_audio(str(audio_path))
            except Exception as e:
                print(
                    f"Exception while loading {chunk_path} audio. Reason: {e}")
                continue

            duration = len(y) / args.sample_rate

            krnseq = kern.tosequence()

            if krnseq is None:
                #print(f"Discarded {chunk_path} for double dots/sharps/flats")
                continue

            try:
                seq = labels.encode(krnseq)
            except Exception as e:
                print(f"Discarded {chunk_path}. Reason: {e}")
                continue

            seqlen = labels.ctclen(seq)

            krnseq_path = chunk_path.with_suffix('.krnseq')
            krnseq_path.write_text(krnseq)

            seq_path = chunk_path.with_suffix('.seq')
            with seq_path.open(mode="wb") as f:
                f.write(pickle.dumps(seq))

            # The .krnseq/.seq files are written before this check, so they
            # remain on disk even for chunks rejected here.
            if duration > args.max_duration or duration < seqlen * args.min_duration_symbol:
                #print(f"Sequence too long in {chunk_path} len={seqlen} duration={duration:.2f}")
                continue

            samples.append([str(audio_path), str(seq_path), duration])