Example #1
 def _inner(ai: AudioTensor) -> AudioTensor:
     "Split signal at points of silence greater than 2*pad_ms"
     if remove_type is None: return ai
     padding = int(pad_ms / 1000 * ai.sr)
     if (padding > ai.nsamples): return ai
     splits = split(ai.numpy(), top_db=threshold, hop_length=padding)
     if remove_type == "split":
         sig = [
             ai[:, (max(a - padding, 0)):(min(b + padding, ai.nsamples))]
             for (a, b) in _merge_splits(splits, padding)
         ]
     elif remove_type == "trim":
         sig = [
             ai[:,
                (max(splits[0, 0] - padding, 0)):splits[-1, -1] + padding]
         ]
     elif remove_type == "all":
         sig = [
             torch.cat([
                 ai[:,
                    (max(a - padding, 0)):(min(b + padding, ai.nsamples))]
                 for (a, b) in _merge_splits(splits, padding)
             ],
                       dim=1)
         ]
     else:
         raise ValueError(
             f"Valid options for silence removal are None, 'split', 'trim', 'all' not '{remove_type}'."
         )
     ai.data = torch.cat(sig, dim=-1)
     return ai
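Note: this closure (like the variants in Examples #5 and #7) depends on a _merge_splits helper that is not shown in the snippet. Purely as an assumption about its behavior, a minimal interval-merging sketch might look like the following; it is not the library's actual implementation.

import numpy as np

def _merge_splits_sketch(splits, padding):
    """Hypothetical stand-in for _merge_splits: fuse adjacent non-silent
    intervals whose silent gap is shorter than 2*padding samples."""
    if len(splits) == 0:
        return splits
    merged = []
    start, end = splits[0]
    for a, b in splits[1:]:
        if a - end < 2 * padding:
            end = b  # gap too short to count as silence; extend the current interval
        else:
            merged.append((start, end))
            start, end = a, b
    merged.append((start, end))
    return np.array(merged)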
Example #2
def tfm_remove_silence(signal, rate, remove_type, threshold=20, pad_ms=200):
    '''Split signal at points of silence greater than 2*pad_ms '''
    actual = signal.clone().squeeze()
    padding = int(pad_ms / 1000 * rate)
    if (padding > len(actual)): return [actual]
    splits = split(actual.numpy(), top_db=threshold, hop_length=padding)
    if remove_type == "split":
        return [
            actual[(max(a - padding, 0)):(min(b + padding, len(actual)))]
            for (a, b) in splits
        ]
    elif remove_type == "trim":
        return [
            actual[(max(splits[0, 0] - padding, 0)):splits[-1, -1] +
                   padding].unsqueeze(0)
        ]
    elif remove_type == "all":
        return [
            torch.cat([
                actual[(max(a - padding, 0)):(min(b + padding, len(actual)))]
                for (a, b) in splits
            ])
        ]
    else:
        raise ValueError(
            f"Valid options for silence removal are None, 'split', 'trim', 'all' not {remove_type}."
        )
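For context, a minimal usage sketch of tfm_remove_silence (the file name and parameter values are assumptions, and a mono recording is assumed so that squeeze() yields a 1-D tensor):

import torchaudio

signal, rate = torchaudio.load("speech.wav")  # assumed mono file, shape (1, n_samples)
chunks = tfm_remove_silence(signal, rate, remove_type="split", threshold=20, pad_ms=200)
print("kept {} non-silent chunk(s)".format(len(chunks)))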
Example #3
def main_():
    ap = argparse.ArgumentParser()
    ap.add_argument("--hop_size", type=int, default=110)
    ap.add_argument("--fft_size", type=int, default=2048)
    ap.add_argument("wav_list", type=str)
    ap.add_argument("ali_rspecifier", type=str)

    args = ap.parse_args()

    wavlist_fp = open(args.wav_list,
                      mode='rt') if args.wav_list != "-" else sys.stdin

    with kaldi_io.open_or_fd(args.ali_rspecifier, mode="wb") as ali_writer:
        for line in wavlist_fp:
            s = line.strip().replace("\t", " ").split(" ")
            assert len(s) == 2
            utt, wav_file = s

            rate, wav = read(wav_file)
            wav = wav.astype("float")

            voiced = np.zeros_like(wav, dtype=np.int32)
            ans = split(wav,
                        frame_length=args.fft_size,
                        hop_length=args.hop_size)
            for a in ans:
                sidx, eidx = a
                voiced[sidx:eidx] = 1
            kaldi_io.write_vec_int(ali_writer, voiced, key=utt)

    wavlist_fp.close()
Example #4
def tfm_chop_silence(signal, rate, threshold=20, pad_ms=200):
    '''Split signal at points of silence greater than 2*pad_ms '''
    actual = signal.clone().squeeze()
    padding = int(pad_ms/1000*rate)
    if(padding > len(actual)): return [actual]
    splits = split(actual.numpy(), top_db=threshold, hop_length=padding)
    return [actual[max(a - padding, 0):min(b + padding, len(actual))] for (a, b) in splits]
Example #5
 def _inner(ai: AudioItem) -> AudioItem:
     '''Split signal at points of silence greater than 2*pad_ms '''
     if remove_type is None: return ai
     padding = int(pad_ms / 1000 * ai.sr)
     if (padding > ai.nsamples): return ai
     actual = ai.sig.clone()
     splits = split(actual.numpy(), top_db=threshold, hop_length=padding)
     if remove_type == "split":
         sig = [
             actual[:,
                    (max(a - padding, 0)):(min(b + padding, ai.nsamples))]
             for (a, b) in _merge_splits(splits, padding)
         ]
     elif remove_type == "trim":
         sig = [
             actual[:, (max(splits[0, 0] - padding, 0)):splits[-1, -1] +
                    padding]
         ]
     elif remove_type == "all":
         sig = [
             torch.cat([
                 actual[:,
                        (max(a - padding, 0)):(min(b +
                                                   padding, ai.nsamples))]
                 for (a, b) in _merge_splits(splits, padding)
             ],
                       dim=1)
         ]
     else:
         raise ValueError(
             f"Valid options for silence removal are None, 'split', 'trim', 'all' not '{remove_type}'."
         )
     return AudioItem((*sig, ai.sr, ai.path))
Example #6
def trim_pauses_and_split_into_chunks(audio,
                                      top_db=30,
                                      chunk_duration=4,
                                      sampling_rate=22050):
    """
    Trims all the pauses in the file and cut it into chunks of a given sample_duration.
    For example, if a 50 seconds file is given then it will remove silence first (40 seconds remaining)
    and then will return 10 arrays with each representing a 4 second sample.
    All chunks are cut fully coherently so their stacking gives the trimmed file.
    :param audio: numpy array representing an audio signal
    :param top_db: The threshold (in decibels) below reference to consider as silence
    :param chunk_duration: cut into chunks of 'sample_duration' length
    :param sampling_rate: number of discrete samples in every second, not be confused with sample size we cut the audio into
    :return: list of np.ndarray [shape=(m, chunk_duration*sampling rate)]. m = ceil(len(audio) / chunk_duration)
    """
    intervals = effects.split(audio, top_db=top_db)

    # trim pauses
    trimmed_audio = np.empty(0)
    for i in range(len(intervals)):
        trimmed_audio = np.concatenate(
            (trimmed_audio, audio[intervals[i][0]:intervals[i][1]]))

    # split into little chunks
    chunks = []
    step = chunk_duration * sampling_rate
    for i in range(0, len(trimmed_audio), step):
        chunks.append(trimmed_audio[i:i + step])

    return chunks
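A possible invocation of the helper above, assuming a hypothetical file name:

import librosa

audio, sr = librosa.load("recording.wav", sr=22050)
chunks = trim_pauses_and_split_into_chunks(audio, top_db=30, chunk_duration=4, sampling_rate=sr)
print(len(chunks), "chunks of at most", 4 * sr, "samples each")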
Example #7
 def encodes(self, ai: AudioTensor) -> AudioTensor:
     if self.remove_type is None:
         return ai
     padding = int(self.pad_ms / 1000 * ai.sr)
     if padding > ai.nsamples:
         return ai
     splits = split(ai.numpy(), top_db=self.threshold, hop_length=padding)
     if self.remove_type == RemoveType.Split:
         sig = [
             ai[:, (max(a - padding, 0)):(min(b + padding, ai.nsamples))]
             for (a, b) in _merge_splits(splits, padding)
         ]
     elif self.remove_type == RemoveType.Trim:
         sig = [
             ai[:,
                (max(splits[0, 0] - padding, 0)):splits[-1, -1] + padding]
         ]
     elif self.remove_type == RemoveType.All:
         sig = [
             torch.cat(
                 [
                     ai[:,
                        (max(a - padding, 0)):(min(b +
                                                   padding, ai.nsamples))]
                     for (a, b) in _merge_splits(splits, padding)
                 ],
                 dim=1,
             )
         ]
     else:
         raise ValueError(f"""Valid options for silence removal are
             None, RemoveType.Split, RemoveType.Trim, RemoveType.All,
             but not '{self.remove_type}'.""")
     ai.data = torch.cat(sig, dim=-1)
     return ai
Example #8
def clear_from_silence(wave):
    sounded_ints = effects.split(wave,
                                 top_db=top_decibells,
                                 frame_length=win_len,
                                 hop_length=hop_len)
    sounded_wave = [wave[inter[0]:inter[1]] for inter in sounded_ints]
    return np.concatenate(sounded_wave)
Example #9
def main_():
    ap = argparse.ArgumentParser()
    ap.add_argument("--hop_size", type=int, default=110)
    ap.add_argument("--fft_size", type=int, default=2048)
    ap.add_argument("wav_list", type=str)
    ap.add_argument("wav_outdir", type=str)

    args = ap.parse_args()

    wavlist_fp = open(args.wav_list,
                      mode='rt') if args.wav_list != "-" else sys.stdin

    for line in wavlist_fp:
        s = line.strip().replace("\t", " ").split(" ")
        assert len(s) == 2
        utt, wav_file = s

        rate, wav = read(wav_file)
        wav = wav.astype("float")

        ans = split(wav, frame_length=args.fft_size, hop_length=args.hop_size)
        voiced_wav = [wav[a[0]:a[1]] for a in ans]
        voiced_wav = np.concatenate(voiced_wav).astype(np.int16)
        write(join(args.wav_outdir, "{}.wav".format(utt)), rate, voiced_wav)

    wavlist_fp.close()
Example #10
def split_sound_files(sound_waves, sound_types):

    test_sound = np.float64(sound_waves[0][0][1])
    splits = split(test_sound, 10, frame_length=10)

    [
        sci_wav.write(
            "/export/home/amatskev/testlibrosa/second_test{}.wav".format(idx),
            16000, np.uint8(test_sound[splits[idx][0]:splits[idx][1]]))
        for idx, intervall in enumerate(splits)
    ]

    print(1 + 2)
    print(1 + 2)
Example #11
    def truncate_silence(self, signal):

        nonsilent_indices = split(y=signal,
                                  top_db=self.top_db,
                                  frame_length=self.frame_length,
                                  hop_length=self.frame_skip)

        # Only keep nonsilent intervals of signal
        signal_intervals = []
        for index in nonsilent_indices:
            signal_interval = signal[index[0]:index[1]]
            signal_intervals.append(signal_interval)

        # Return flattened array
        return np.concatenate(signal_intervals, axis=0)
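The method above reads top_db, frame_length, and frame_skip from self; a minimal sketch of an owning class (the class name and default values are assumptions):

class SilenceTruncator:
    # Hypothetical wrapper; the attribute defaults below are assumptions.
    def __init__(self, top_db=30, frame_length=2048, frame_skip=512):
        self.top_db = top_db
        self.frame_length = frame_length
        self.frame_skip = frame_skip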
Example #12
def tfm_trim_silence(signal, rate, threshold=20, pad_ms=200):
    '''Remove silence from start and end of audio'''
    actual = signal.clone().squeeze()
    padding = int(pad_ms/1000*rate)
    splits = split(actual.numpy(), top_db=threshold)
    return actual[splits[0, 0]-padding:splits[-1, -1]+padding].unsqueeze(0)
Example #13
def preprocess_track(track_id):
    iad_path = os.path.join(source_path, "%s_SOURCEID.lab" % track_id)

    # the directory looks like "J_Q_T_reduced"
    spec_path = os.path.join(
        homedir, "MedleyDB/processed/%d_%d_%d_reduced_new/" % (J, Q, T))

    input_path = os.path.join(spec_path, "input")
    label_path = os.path.join(spec_path, "labels")

    if not (os.path.exists(input_path)):
        os.makedirs(input_path)
    if not (os.path.exists(label_path)):
        os.makedirs(label_path)

    # load audio list

    # y, _ = librosa.load(
    #     os.path.join(audiodir, audio_file), sr=sr, res_type="kaiser_fast"
    # )

    meta_file = ("/home/laura/medleydb/medleydb/data/Metadata/" + track_id +
                 "_METADATA.yaml")

    with open(meta_file) as f:
        data = yaml.safe_load(f)

    stems_files = [
        stem["filename"] for stem in data["stems"].values()
        if stem["instrument"] in MEDLEYDB_INSTRUMENTS
    ]

    if stems_files == []:
        return track_id

    y = np.mean(
        np.stack([
            librosa.load(
                os.path.join(medleydir, track_id, track_id + "_STEMS",
                             audio_file),
                sr=sr,
                res_type="kaiser_fast",
            )[0] for audio_file in stems_files
        ], ),
        axis=0,
    )

    # get the non-silent intervals
    intervals = split(y,
                      top_db=10,
                      frame_length=samples_per_snippet,
                      hop_length=samples_per_snippet)

    for start_i, end_i in intervals:
        for i in tqdm(range(start_i, end_i, samples_per_snippet), unit="clip"):
            sound_bite = y[i:i + samples_per_snippet]
            # get all sound_bites except the last one (which is not 6s long)
            if len(sound_bite) == samples_per_snippet:
                sound_bite = preprocess(sound_bite)

                S_dict = get_scattering_coefficients(sound_bite,
                                                     order1_indices,
                                                     order2_indices,
                                                     scattering.forward)

                label = preprocess_label(iad_path, i, i + samples_per_snippet)

                np.save(
                    os.path.join(input_path,
                                 "%s_%d.npy" % (track_id, int(i / sr))),
                    S_dict,
                )
                np.save(
                    os.path.join(label_path,
                                 "%s_%d.npy" % (track_id, int(i / sr))),
                    label,
                )

    return track_id
Example #14
def trim(signal, top_db=20):
    from librosa.effects import split

    intervals = split(signal, top_db=top_db)

    return signal[intervals[0][0]:intervals[-1][-1]]
Example #15
def extract_log_mel_feats(set_type, path_to_csv, path_to_files, out_path, sr,
                          fft_size, hop, n_mels):
    """
    Extract features from given files and store them in binary format.

    :param set_type:
    :param path_to_csv: path to loaded csv
    :param path_to_files: path to loaded data
    :param out_path: path to store extracted features
    :param sr: input files sample rate
    :param fft_size: size of fft window
    :param hop: hop size
    :param n_mels: number of mel band

    :return:

    """
    set_type = set_type.lower()
    if set_type not in ['train', 'test']:
        raise Exception('Unsupported set type: {}'.format(set_type))

    feats = []

    if set_type == 'train':
        meta = pd.read_csv(path_to_csv, sep='\t', header=None)
        meta.columns = ['file', 'unk1', 'unk2', 'duration', 'type']

        file_names = list(meta['file'])
        n_files = len(file_names)
        labels = list(meta['type'])

        uniq_labels = np.sort(np.unique(labels))
        label_to_id = {label: i for i, label in enumerate(uniq_labels)}

        print('Total files:', n_files)

        for i, (file_name, label) in tqdm(enumerate(zip(file_names, labels))):
            wav_data, sr = load_wav(os.path.join(path_to_files, file_name),
                                    sr=sr)
            for part in split(wav_data, top_db=30):
                start, end = part
                # skip ultra short parts
                if (end - start) < fft_size:
                    continue
                wav_part = wav_data[start:end]
                mel_spec = melspectrogram(wav_part,
                                          n_fft=fft_size,
                                          hop_length=hop,
                                          n_mels=n_mels,
                                          fmax=sr // 2)
                log_mel_spec = power_to_db(mel_spec, ref=np.max)
                feats.append({
                    'fname': file_name,
                    'feature': log_mel_spec,
                    'label_id': label_to_id[label]
                })
        pickle.dump(feats, open(out_path, 'wb'))
        return label_to_id
    else:
        for i, file_name in tqdm(enumerate(os.listdir(path_to_files))):
            wav_data, sr = load_wav(os.path.join(path_to_files, file_name),
                                    sr=sr)
            if len(wav_data) == 0:
                # print('Empty file:', file_name)
                wav_data = np.zeros(sr)
            mel_spec = melspectrogram(wav_data,
                                      n_fft=fft_size,
                                      n_mels=n_mels,
                                      fmax=sr // 2)
            log_mel_spec = power_to_db(mel_spec, ref=np.max)
            feats.append({
                'fname': file_name,
                'feature': log_mel_spec,
            })

    pickle.dump(feats, open(out_path, 'wb'))
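A hedged example call for the training branch above (all paths and parameter values here are assumptions; load_wav, melspectrogram, power_to_db, split, pd, tqdm, np, os, and pickle are expected to be imported by the surrounding module):

label_to_id = extract_log_mel_feats(set_type='train',
                                    path_to_csv='meta/train.tsv',
                                    path_to_files='audio/train',
                                    out_path='feats/train_logmel.pkl',
                                    sr=16000,
                                    fft_size=1024,
                                    hop=512,
                                    n_mels=64)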