def __init__(self, roots, sample_rate, max_time, target_level=-25,
             noise_proportion=0, noise_type='gaussian', snrs=[3], eps=1e-8, **kwargs):
    self.sample_rate = sample_rate
    self.max_time = max_time
    self.target_level = target_level
    self.eps = eps
    self.filepths = []
    for root in roots:
        self.filepths += find_files(root)
    assert len(self.filepths) > 0, 'No audio file detected'
    self.noise_proportion = noise_proportion
    self.snrs = snrs
    if noise_type == 'gaussian':
        self.noise_sampler = torch.distributions.Normal(0, 1)
    else:
        self.noise_wavpths = find_files(noise_type)
def SaveSTFT_Arg(pitch_shift, time_stretch, argtime):
    targetlist = find_files(C.target_path, ext="wav")
    noiselist = find_files(C.noise_path, ext="wav")
    noise_num = len(noiselist)
    target_index = 0
    skip_count = 0
    for targetfile in tqdm(targetlist):
        target_mag, _ = LoadAudio_Arg(targetfile)
        norm = target_mag.max()
        # Patches are sliced along the time axis (axis 1); skip clips shorter than one patch.
        if target_mag.shape[1] > C.PATCH_LENGTH:
            step = target_mag.shape[1] // C.PATCH_LENGTH
            for i in tqdm(range(step), leave=False):
                target_mag_p = target_mag[:, i * C.PATCH_LENGTH:(i + 1) * C.PATCH_LENGTH]
                target_mag_p /= norm
                noise_file = noiselist[random.randint(0, noise_num - 1)]
                noise_mag, _ = LoadAudio(noise_file)
                noise_mag = LengthAdjuster(noise_mag)
                noise_mag /= norm
                addnoise_mag = target_mag_p + noise_mag
                addnoise_mag /= norm
                fname = str(target_index) + "_" + str(i) + str(argtime)
                np.savez(os.path.join(C.PATH_FFT, fname + "_arg.npz"),
                         speech=target_mag_p, addnoise=addnoise_mag)
        else:
            skip_count += 1
        target_index += 1  # advance per target file so output filenames stay unique
    print("SKIP:", skip_count)
def get_ann_audio(directory):
    '''Get a list of annotations and audio files from a directory.

    This also validates that the lengths match and are paired properly.

    Parameters
    ----------
    directory : str
        The directory to search

    Returns
    -------
    pairs : list of tuples (audio_file, annotation_file)
    '''
    audio = find_files(directory)
    annos = find_files(directory, ext=['jams', 'jamz'])

    paired = list(zip(audio, annos))

    if len(audio) != len(annos) or any(
            [base(aud) != base(ann) for aud, ann in paired]):
        raise DataError(
            'Unmatched audio/annotation data in {}'.format(directory))

    return paired
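# Usage sketch for get_ann_audio above (the path is a stand-in for any directory
# holding matching foo.wav / foo.jams pairs; unmatched data raises DataError):
for audio_file, annotation_file in get_ann_audio("dataset/annotated_audio"):
    print(audio_file, "->", annotation_file)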
def sample_wavs_and_dump_txt(root, dev_ids, numbers, meta_data_name):
    wav_list = []
    count_positive = 0
    for _ in range(numbers):
        prob = random.random()
        if prob > 0.5:
            # sample 2 wavs from different speakers
            dev_id_pair = random.sample(dev_ids, 2)
            sample1 = "/".join(random.choice(find_files(os.path.join(root, dev_id_pair[0]))).split("/")[-3:])
            sample2 = "/".join(random.choice(find_files(os.path.join(root, dev_id_pair[1]))).split("/")[-3:])
            label = "0"
            wav_list.append(" ".join([label, sample1, sample2]))
        else:
            # sample 2 wavs from the same speaker
            dev_id_pair = random.sample(dev_ids, 1)
            sample1 = "/".join(random.choice(find_files(os.path.join(root, dev_id_pair[0]))).split("/")[-3:])
            sample2 = "/".join(random.choice(find_files(os.path.join(root, dev_id_pair[0]))).split("/")[-3:])
            label = "1"
            count_positive += 1
            wav_list.append(" ".join([label, sample1, sample2]))

    with open(meta_data_name, "w") as f:
        for data in wav_list:
            f.write(data + "\n")

    return wav_list
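# Usage sketch for sample_wavs_and_dump_txt (root path and trial count are
# hypothetical; any root whose sub-folders are speaker IDs works):
dev_root = "VoxCeleb1/dev/wav"
dev_ids = [d for d in os.listdir(dev_root) if os.path.isdir(os.path.join(dev_root, d))]
trials = sample_wavs_and_dump_txt(dev_root, dev_ids, 10000, "dev_meta_data.txt")
print("wrote %d trial pairs" % len(trials))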
def makeCQTexOvertone():
    TORIDASUBASHO = PATH_pwCQT
    HOZONBASYO = PATH_pwov2CQT  # directory for saving the n-th overtone CQTs
    OVTONE = 2
    audiolist = find_files(PATH_AUDIO, ext="wav")
    cqtlist = find_files(TORIDASUBASHO, ext="npy")
    itemlist = len(audiolist)
    i_counter = 1
    for audiofile, cqtfile in zip(audiolist, cqtlist):
        print("{}/{}".format(i_counter, itemlist))
        filename = audiofile.split('/')[-1]
        albname = audiofile.split('/')[-2]
        # foldname = audiofile.split('/')[-3]
        cqt_filename = cqtfile.split("/")[-1]
        if filename.split(".")[0] != cqt_filename.split(".")[0]:
            print("file_not_match", filename, cqt_filename)
        # make sure the output directory exists
        if not os.path.exists(HOZONBASYO + '/' + albname):
            os.makedirs(HOZONBASYO + '/' + albname)
        if not os.path.exists(HOZONBASYO + '/' + albname + '/' + filename + '.npy'):
            wav, sr = load(audiofile, sr=SR)
            cqt = np.load(cqtfile)
            excqt = exOvertone(cqt, wav, overtone=OVTONE)
            np.save(HOZONBASYO + '/' + albname + '/' + filename + '.npy',
                    np.array(excqt, dtype="float32"))
        i_counter += 1
def TrainConvnetExtractorDeepChroma(trainidx, epoch=20, saveas="convnet.model"):
    cqtfilelist = np.array(find_files(const.PATH_HCQT, ext="npy"))[trainidx]
    labfilelist = np.array(
        find_files(const.PATH_CHORDLAB, ext=["lab", "chords"]))[trainidx]
    #midifilelist = find_files(const.PATH_MIDI,ext="mid")[:filecnt]
    config.train = True
    config.enable_backprop = True
    convnet = networks.ConvnetFeatExtractor()
    model = networks.ConvnetPredictor(convnet)
    model.to_gpu(0)
    opt = optimizers.MomentumSGD()
    opt.setup(model)
    print("DeepChroma Convnet Training...")
    print("start epochs...")
    S = []
    T = []
    for cqtfile, labfile in zip(cqtfilelist, labfilelist):
        cqt = np.load(cqtfile)
        spec = utils.PreprocessSpec(cqt[:const.CQT_H, :, :])
        targ = voc.LoadChromaTarget(labfile)
        minlen = min([cqt.shape[1], targ.shape[0]])
        S.append(spec[:, :minlen, :])
        T.append(targ[:minlen, :])
    S = np.concatenate(S, axis=1)
    T = np.concatenate(T, axis=0)
    assert S.shape[1] == T.shape[0]
    for ep in range(epoch):
        sum_loss = 0
        randidx = np.random.randint(0, S.shape[1] - const.CONV_TRAIN_SEQLEN - 1,
                                    S.shape[1] // const.CONV_TRAIN_SEQLEN * 4)
        for i in range(0, randidx.size - const.CONV_TRAIN_BATCH,
                       const.CONV_TRAIN_BATCH):
            x_batch = np.stack([
                S[:, randidx[j]:randidx[j] + const.CONV_TRAIN_SEQLEN, :]
                for j in range(i, i + const.CONV_TRAIN_BATCH)
            ])
            t_batch = np.stack([
                T[randidx[j]:randidx[j] + const.CONV_TRAIN_SEQLEN, :]
                for j in range(i, i + const.CONV_TRAIN_BATCH)
            ])
            x_in = cp.asarray(x_batch)
            t_in = cp.asarray(t_batch)
            model.cleargrads()
            loss = model(x_in, t_in)
            loss.backward()
            opt.update()
            sum_loss += loss.data
        convnet.save(saveas)
        print("epoch: %d/%d loss:%.04f" %
              (ep + 1, epoch, sum_loss / const.CONV_TRAIN_BATCH))
    convnet.save(saveas)
def EvaluateChord(idx, verbose=True, sonify=False, cross=False):
    lablist = np.array(find_files(const.PATH_CHORDLAB, ext=["lab", "chords"]))[idx]
    est_lablist = np.array(find_files(
        const.PATH_ESTIMATE_CROSS, ext="lab"))[idx] if cross else find_files(
            const.PATH_ESTIMATE, ext="lab")
    scorelist_majmin = np.array([])
    scorelist_sevenths = np.array([])
    scorelist_majmininv = np.array([])
    scorelist_seventhinv = np.array([])
    durations = np.array([])
    confmatrix = np.zeros((const.N_CHORDS, const.N_CHORDS))
    song_durations = np.array([])
    for labfile, estfile in zip(lablist, est_lablist):
        (ref_intervals, ref_labels) = mir_eval.io.load_labeled_intervals(labfile)
        (est_intervals, est_labels) = mir_eval.io.load_labeled_intervals(estfile)
        est_intervals, est_labels = mir_eval.util.adjust_intervals(
            est_intervals, est_labels, ref_intervals.min(), ref_intervals.max(),
            mir_eval.chord.NO_CHORD, mir_eval.chord.NO_CHORD)
        (intervals, ref_labels, est_labels) = mir_eval.util.merge_labeled_intervals(
            ref_intervals, ref_labels, est_intervals, est_labels)
        durations = mir_eval.util.intervals_to_durations(intervals)
        comparisons_sevenths = mir_eval.chord.sevenths(ref_labels, est_labels)
        comparisons_majmininv = mir_eval.chord.majmin_inv(ref_labels, est_labels)
        comparisons_seventhinv = mir_eval.chord.sevenths_inv(ref_labels, est_labels)
        comparisons_majmin = mir_eval.chord.majmin(ref_labels, est_labels)
        score_majmin = mir_eval.chord.weighted_accuracy(comparisons_majmin, durations)
        scorelist_majmin = np.append(scorelist_majmin, score_majmin)
        score_sevenths = mir_eval.chord.weighted_accuracy(comparisons_sevenths, durations)
        scorelist_sevenths = np.append(scorelist_sevenths, score_sevenths)
        score_majmininv = mir_eval.chord.weighted_accuracy(comparisons_majmininv, durations)
        scorelist_majmininv = np.append(scorelist_majmininv, score_majmininv)
        score_seventhinv = mir_eval.chord.weighted_accuracy(comparisons_seventhinv, durations)
        scorelist_seventhinv = np.append(scorelist_seventhinv, score_seventhinv)
        if verbose:
            print("%s --- %.3f" % (labfile.split('/')[-1], score_majmin))
        for i in range(len(ref_labels)):
            confmatrix[voc.GetChordIDSign(ref_labels[i]),
                       voc.GetChordIDSign(est_labels[i])] += durations[i]
        song_durations = np.append(song_durations, np.sum(durations))
    return scorelist_majmin, scorelist_sevenths, scorelist_majmininv, \
        scorelist_seventhinv, confmatrix, song_durations
def __init__(self, idx, rand_shift=False):
    self.list_labfile = np.array(find_files(C.PATH_CHORDLAB, ext=["lab", "chords"]))[idx]
    #self.list_cqtfile = np.array(find_files(C.PATH_CQT,ext="npy"))[idx]
    self.list_featfile = np.array(find_files(C.PATH_FEAT, ext="npy"))[idx]
    #self.list_mfccfile = np.array(find_files(C.PATH_MFCC,ext="npy"))[idx]
    #self.list_cqt = [U.normalize_spec(np.load(f)) for f in self.list_cqtfile]
    self.list_feat = [np.load(f)[:, :C.N_DIMS_FEAT] for f in self.list_featfile]
    self.labs_list, self.lab_intervals_list = LoadLabelSet(self.list_labfile, C.label_shifts[idx])
    #self.list_mfcc = [np.load(f).T for f in self.list_mfccfile]
    self.rand_shift = rand_shift
def __init__(self, root):
    seed = random.randint(1, 1000)
    random.seed(seed)

    wav_root = Path(root) / "wav48"
    wav_files = []
    metadata = defaultdict(list)
    speaker_dirs = [
        speaker_dir for speaker_dir in wav_root.iterdir() if speaker_dir.is_dir()
    ]
    for speaker_dir in speaker_dirs:
        if speaker_dir.stem in self._except_folder:
            continue
        for wav_file in find_files(speaker_dir):
            wav_file = str(PurePosixPath(wav_file).relative_to(root))
            wav_files.append(wav_file)
            speaker_id = self.get_speaker(wav_file)
            metadata[speaker_id].append(wav_file)

    self.root = root
    self.seed = seed
    self.wav_files = wav_files
    self.metadata = metadata
def CCMixter():
    '''
    mix      : original wav file
    source_1 : inst wav file
    source_2 : vocal wav file
    '''
    Audiolist = os.listdir('./data')
    spec_dir = "./Spectrogram"
    if os.path.exists(spec_dir) is False:
        os.mkdir(spec_dir)

    for audio in Audiolist:
        try:
            audio_path = os.path.join('./data/' + audio)
            print("Song : %s" % audio)
            if os.path.exists(os.path.join(spec_dir, audio + '.npz')):
                print("Already exists!! Skipping....")
                continue
            aud = find_files(audio_path, ext="wav")
            mix, _ = load(aud[0], sr=None)
            inst, _ = load(aud[1], sr=None)
            vocal, _ = load(aud[2], sr=None)
            print("Saving...")
            SaveSpectrogram(mix, inst, vocal, audio)
        except IndexError:
            print("Wrong Directory")
def EstimateChord(idx, dnnmodel, todir=False):
    #dnn = networks.FeatureDNN()
    #dnn = networks.ConvnetFeatExtractor()
    dnn = networks.FullCNNFeatExtractor()
    #dnn = networks.NoOperation()
    dnn.load(dnnmodel)
    dnn.to_gpu(0)
    decoder = networks.NBLSTMCRF()
    decoder.load()
    decoder.to_gpu(0)
    cqtfilelist = np.array(find_files(const.PATH_HCQT, ext="npy"))[idx]
    i = 0
    chainer.config.train = False
    chainer.config.enable_backprop = False
    for cqtfile in cqtfilelist:
        cqt = utils.Embed(utils.PreprocessSpec(np.load(cqtfile)[:, :, :]), 1)
        chroma = dnn.GetFeature(cp.asarray(cqt)).data
        path = decoder.argmax(chroma)
        feat = cp.asnumpy(chroma)
        if todir:
            fname = cqtfile.split("/")[-1] + ".lab"
            alb = cqtfile.split("/")[-2]
            utils.SaveEstimatedLabelsFramewise(
                path, const.PATH_ESTIMATE_CROSS + alb + "/" + fname, feat)
        else:
            utils.SaveEstimatedLabelsFramewise(
                path, const.PATH_ESTIMATE + "%03d.lab" % i, feat)
        i += 1
def visualize(data_dirs, wav2mel_path, checkpoint_path, output_path):
    """Visualize high-dimensional embeddings using t-SNE."""

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    wav2mel = torch.jit.load(wav2mel_path)
    dvector = torch.jit.load(checkpoint_path).eval().to(device)
    print("[INFO] model loaded.")

    n_spkrs = 0
    paths, spkr_names, mels = [], [], []

    for data_dir in data_dirs:
        data_dir_path = Path(data_dir)
        for spkr_dir in [x for x in data_dir_path.iterdir() if x.is_dir()]:
            n_spkrs += 1
            audio_paths = find_files(spkr_dir)
            spkr_name = spkr_dir.name
            for audio_path in audio_paths:
                paths.append(audio_path)
                spkr_names.append(spkr_name)

    for audio_path in tqdm(paths, ncols=0, desc="Preprocess"):
        wav_tensor, sample_rate = torchaudio.load(audio_path)
        with torch.no_grad():
            mel_tensor = wav2mel(wav_tensor, sample_rate)
        mels.append(mel_tensor)

    embs = []
    for mel in tqdm(mels, ncols=0, desc="Embed"):
        with torch.no_grad():
            emb = dvector.embed_utterance(mel.to(device))
        emb = emb.detach().cpu().numpy()
        embs.append(emb)

    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
    transformed = tsne.fit_transform(embs)
    print("[INFO] embeddings transformed.")

    data = {
        "dim-1": transformed[:, 0],
        "dim-2": transformed[:, 1],
        "label": spkr_names,
    }

    plt.figure()
    sns.scatterplot(
        x="dim-1",
        y="dim-2",
        hue="label",
        palette=sns.color_palette(n_colors=n_spkrs),
        data=data,
        legend="full",
    )
    plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    plt.tight_layout()
    plt.savefig(output_path)
def ConfMatrix_Allclass(idx):
    lab_list = np.array(find_files(C.PATH_CHORDLAB, ext=["lab", "chords"]))[idx]
    estimated_lab_list = [
        os.path.join(C.PATH_ESTIMATE_CROSS, p.split("/")[-2], p.split("/")[-1])
        for p in lab_list
    ]
    durations = np.array([])
    confmatrix = np.zeros((C.N_VOCABULARY_TRIADS, C.N_VOCABULARY_TRIADS))
    for labfile, estfile in zip(lab_list, estimated_lab_list):
        (ref_intervals, ref_labels) = mir_eval.io.load_labeled_intervals(labfile)
        (est_intervals, est_labels) = mir_eval.io.load_labeled_intervals(estfile)
        est_intervals, est_labels = mir_eval.util.adjust_intervals(
            est_intervals, est_labels, ref_intervals.min(), ref_intervals.max(),
            mir_eval.chord.NO_CHORD, mir_eval.chord.NO_CHORD)
        (intervals, ref_labels, est_labels) = mir_eval.util.merge_labeled_intervals(
            ref_intervals, ref_labels, est_intervals, est_labels)
        durations = mir_eval.util.intervals_to_durations(intervals)
        ref_labels_id = encode_chordseq_hierarchical(ref_labels)
        est_labels_id = encode_chordseq_hierarchical(est_labels)
        for i in range(len(ref_labels)):
            confmatrix[ref_labels_id[i, 0], est_labels_id[i, 0]] += durations[i]
    confmatrix /= np.sum(confmatrix, axis=1, keepdims=True)
    return confmatrix
def makeCQTData():
    SAVEDIR = PATH_pwCQT
    audiolist = find_files(PATH_AUDIO, ext="wav")
    itemlist = len(audiolist)
    for i, audiofile in enumerate(audiolist):
        print("{}/{}".format(i + 1, itemlist))
        wav, sr = load(audiofile, sr=SR)
        filename = audiofile.split('/')[-1]
        albname = audiofile.split('/')[-2]
        # foldname = audiofile.split('/')[-3]
        # make sure the output directory exists
        if not os.path.exists(SAVEDIR + '/' + albname):
            os.makedirs(SAVEDIR + '/' + albname)
        if not os.path.exists(SAVEDIR + '/' + albname + '/' + filename + '.npy'):
            cqt_spec, freqs = cqt(wav, sr, fmin="C1")
            # cqt_power = np.abs(cqt_spec)
            # use the sqrt-magnitude mode  # 2019.12.22
            cqt_power = np.array(
                [np.sqrt(c.real**2 + c.imag**2) for c in cqt_spec],
                dtype="float32")
            cqt_power = np.power(cqt_power, 2)  # power spectrum, step 1
            cqt_power *= 2                      # power spectrum, step 2
            cqt_power = cqt_power.reshape(1, cqt_power.shape[0], cqt_power.shape[1])
            np.save(SAVEDIR + '/' + albname + '/' + filename + '.npy',
                    np.array(cqt_power, dtype="float32"))
def load_npz(target=None, first=None):
    npz_files = find_files('../DSD100_Npz/Dev', ext="npz")[:first]
    # npz_files = find_files('../numpy', ext="npz")[:first]
    for file in npz_files:
        npz = np.load(file)
        assert npz["mix"].shape == npz[target].shape
        yield npz['mix'], npz[target]
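# Usage sketch for the load_npz generator above (the target key name, "vocals"
# here, is an assumption and must match how the DSD100 .npz files were written):
for mix_spec, vocal_spec in load_npz(target="vocals", first=10):
    print(mix_spec.shape, vocal_spec.shape)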
def __init__(self, file_path, meta_data, max_timestep=None):
    self.roots = file_path
    self.root_key = list(self.roots.keys())
    self.max_timestep = max_timestep

    # extract dev speakers and store them in self.black_list_speakers
    with open(meta_data, "r") as f:
        self.black_list_speakers = f.read().splitlines()

    # gather all speaker directories; black-listed (dev) speakers can be filtered out
    self.all_speakers = \
        [f.path for key in self.root_key for f in os.scandir(self.roots[key]) if f.is_dir()]
    self.speaker_num = len(self.all_speakers)
    self.necessary_dict = self.processing()
    self.label_mapping_spk_id = {}  # map speaker id to speaker index
    self.build_label_mapping()

    print("searching all wav paths")
    start = time.time()
    self.dataset = []
    for speaker in tqdm.tqdm(self.all_speakers):
        wav_list = find_files(speaker)
        self.dataset.extend(wav_list)
    end = time.time()
    print(f"searching all wav paths took {end - start} seconds")

    self.label = self.build_label(self.dataset)
def __init__(self, root):
    seed = random.randint(1, 1000)
    random.seed(seed)

    if (Path(root) / "by_book").exists():
        _root = Path(root) / "by_book"
    else:
        _root = Path(root)

    wav_files = []
    metadata = defaultdict(list)
    speaker_dirs = [
        speaker_dir for speaker_dir in (_root / "female").iterdir()
        if speaker_dir.is_dir()
    ]
    speaker_dirs += [
        speaker_dir for speaker_dir in (_root / "male").iterdir()
        if speaker_dir.is_dir()
    ]
    for speaker_dir in speaker_dirs:
        for wav_file in find_files(speaker_dir):
            wav_file = str(PurePosixPath(wav_file).relative_to(root))
            wav_files.append(wav_file)
            speaker_id = self.get_speaker(wav_file)
            metadata[speaker_id].append(wav_file)

    self.root = root
    self.seed = seed
    self.wav_files = wav_files
    self.metadata = metadata
def __init__(self, vad_config, key_list, file_path, meta_data, max_timestep=None):
    self.roots = file_path
    self.root_key = key_list
    self.max_timestep = max_timestep
    self.vad_c = vad_config
    self.dataset = []
    self.all_speakers = []

    for index in range(len(self.root_key)):
        cache_path = Path(os.path.dirname(__file__)) / 'cache_wav_paths' / f'cache_{self.root_key[index]}.p'
        p = Path(self.roots[index])

        # load cache_path if the file exists
        if os.path.isfile(cache_path):
            # cache dict:
            # {
            #     "speaker_id1": ["wav_a_path1", "wav_a_path2", ...],
            #     "speaker_id2": ["wav_b_path1", "wav_b_path2", ...],
            #     ...,
            # }
            cache_wavs_dict = pickle.load(open(cache_path, "rb"))
            self.all_speakers.extend(list(cache_wavs_dict.keys()))
            for speaker_id in list(cache_wavs_dict.keys()):
                for wavs in cache_wavs_dict[speaker_id]:
                    utterance_id = "/".join(str(p / speaker_id / wavs).split("/")[-3:]).replace(".wav", "").replace("/", "-")
                    self.dataset.append([str(p / speaker_id / wavs), utterance_id])
        else:
            speaker_wav_dict = {}
            speaker_dirs = [f.path.split("/")[-1] for f in os.scandir(self.roots[index]) if f.is_dir()]
            self.all_speakers.extend(speaker_dirs)
            print("searching all wav paths")
            start = time.time()
            for speaker in tqdm.tqdm(speaker_dirs):
                speaker_dir = p / speaker
                wav_list = find_files(speaker_dir)
                speaker_wav_dict[speaker] = []
                for wav in wav_list:
                    wav_sample, _ = apply_effects_file(str(speaker_dir / wav), EFFECTS)
                    wav_sample = wav_sample.squeeze(0)
                    length = wav_sample.shape[0]
                    if length > self.vad_c['min_sec']:
                        utterance_id = "/".join(str(speaker_dir / wav).split("/")[-3:]).replace(".wav", "").replace("/", "-")
                        self.dataset.append([str(speaker_dir / wav), utterance_id])
                        speaker_wav_dict[speaker].append("/".join(wav.split("/")[-2:]))
            end = time.time()
            print(f"searching all wav paths took {end - start} seconds")
            print(f"saving wav paths to {cache_path} so they can be loaded directly next time")
            pickle.dump(speaker_wav_dict, open(cache_path, "wb"))

    self.speaker_num = len(self.all_speakers)
    self.necessary_dict = self.processing()
    self.label_mapping_spk_id = {}  # map speaker id to speaker index
    self.build_label_mapping()
    self.label = self.build_label(self.dataset)
def main(data_dirs, out_dir, n_workers, audio_processor_path):
    """Preprocess audio files into features for training."""

    audio_paths = chain.from_iterable([find_files(data_dir) for data_dir in data_dirs])

    audio_processor_path = Path(audio_processor_path) / "audioprocessor"
    audio_processor_path = str(audio_processor_path).replace("/", ".")
    audioprocessor = getattr(
        importlib.import_module(audio_processor_path), "AudioProcessor"
    )

    save_dir = Path(out_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    executor = ProcessPoolExecutor(max_workers=n_workers)
    futures = []
    for audio_path in audio_paths:
        futures.append(
            executor.submit(load_process_save, audioprocessor, audio_path, save_dir)
        )

    infos = {
        "sample_rate": audioprocessor.sample_rate,
        "hop_len": audioprocessor.hop_len,
        "n_mels": audioprocessor.n_mels,
        "utterances": [future.result() for future in tqdm(futures, ncols=0)],
    }

    with open(save_dir / "metadata.json", "w") as f:
        json.dump(infos, f, indent=2)
def SearchErrorExample(idx, ref, est):
    lab_list = np.array(find_files(C.PATH_CHORDLAB, ext=["lab", "chords"]))[idx]
    estimated_lab_list = [
        os.path.join(C.PATH_ESTIMATE_CROSS, p.split("/")[-2], p.split("/")[-1])
        for p in lab_list
    ]
    list_examples = []
    for labfile, estfile in zip(lab_list, estimated_lab_list):
        (ref_intervals, ref_labels) = mir_eval.io.load_labeled_intervals(labfile)
        (est_intervals, est_labels) = mir_eval.io.load_labeled_intervals(estfile)
        est_intervals, est_labels = mir_eval.util.adjust_intervals(
            est_intervals, est_labels, ref_intervals.min(), ref_intervals.max(),
            mir_eval.chord.NO_CHORD, mir_eval.chord.NO_CHORD)
        (intervals, ref_labels, est_labels) = mir_eval.util.merge_labeled_intervals(
            ref_intervals, ref_labels, est_intervals, est_labels)
        durations = mir_eval.util.intervals_to_durations(intervals)
        ref_labels_id = encode_chordseq_hierarchical(ref_labels)
        est_labels_id = encode_chordseq_hierarchical(est_labels)
        for i in range(len(ref_labels)):
            if durations[i] > 0.5 and ref_labels_id[i, 0] == ref and est_labels_id[i, 0] == est:
                list_examples.append((labfile, ref, est, intervals[i]))
    return list_examples
def LoadSpectrogram_batch(batch_size=10):
    filelist = find_files('./Spectrogram', ext="npz")
    batch_list = []
    for i in range(0, len(filelist), batch_size):
        file_list = filelist[i:i + batch_size]
        batch_list.append(file_list)
    return batch_list
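# Usage sketch: iterate over the batched .npz file lists returned above, loading
# one batch at a time (the "mix" key follows the LoadSpectrogram convention used
# elsewhere in this collection):
for batch in LoadSpectrogram_batch(batch_size=10):
    mixes = [np.load(f)["mix"] for f in batch]
    print(len(mixes), "spectrograms in this batch")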
def separate(PATH_INPUT, PATH_OUTPUT, MODEL, SR=16000, FFT_SIZE=1024, H=512):
    if os.path.isdir(PATH_INPUT):
        # if the input is a directory, build a file list
        filelist_mixdown = find_files(PATH_INPUT, ext="wav", case_sensitive=True)
    else:
        # if the input is a single file
        filelist_mixdown = [PATH_INPUT]
    print('number of mixdown file', len(filelist_mixdown))

    # create the output directory if it does not exist
    _, path_output_ext = os.path.splitext(PATH_OUTPUT)
    print('path_output_ext', path_output_ext)
    if len(path_output_ext) == 0 and not os.path.exists(PATH_OUTPUT):
        os.mkdir(PATH_OUTPUT)

    # load the model
    unet = train.UNet()
    chainer.serializers.load_npz(MODEL, unet)
    config.train = False
    config.enable_backprop = False

    # load each mixdown and try to separate the vocal (speech) part
    for fmixdown in filelist_mixdown:
        # if audioread raises an error, fall back to scipy
        try:
            y_mixdown, _ = load(fmixdown, sr=SR, mono=True)
        except:
            sr_mixdown, y_mixdown = read(fmixdown)
            if not sr_mixdown == SR:
                y_mixdown = resample(y_mixdown, sr_mixdown, SR)

        # compute the short-time spectrum of the input and normalize it
        spec = stft(y_mixdown, n_fft=FFT_SIZE, hop_length=H, win_length=FFT_SIZE)
        mag = np.abs(spec)
        mag /= np.max(mag)
        phase = np.exp(1.j * np.angle(spec))
        print('mag.shape', mag.shape)
        start = 0
        # choose a value no larger than the number of input frames that fits the network definition
        end = 128 * (mag.shape[1] // 128)

        # estimate the mask for separating speech (vocal)
        mask = unet(mag[:, start:end][np.newaxis, np.newaxis, 1:, :]).data[0, 0, :, :]
        mask = np.vstack((np.zeros(mask.shape[1], dtype="float32"), mask))

        # apply the mask to the input spectrum and resynthesize the waveform with the inverse STFT
        mag2 = mag[:, start:end] * mask
        phase2 = phase[:, start:end]
        y = istft(mag2 * phase2, hop_length=H, win_length=FFT_SIZE)

        # save the separated speech (vocal) to an output file
        if len(path_output_ext) == 0:
            # output into the directory
            foutname, _ = os.path.splitext(os.path.basename(fmixdown))
            fname = os.path.join(PATH_OUTPUT, (foutname + '.wav'))
        else:
            # output to the specified file
            fname = PATH_OUTPUT
        print('saving... ', fname)
        write_wav(fname, y, SR, norm=True)
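# Usage sketch for separate() above (paths and the model filename are
# hypothetical; the .npz weights must match the train.UNet architecture it loads):
separate("mixdown_wavs/", "separated_vocals/", "unet_vocal.model", SR=16000)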
def __init__(self, file_path, meta_data, max_timestep=None):
    self.roots = file_path
    self.root_key = list(self.roots.keys())
    self.max_timestep = max_timestep
    self.dataset = []
    self.all_speakers = []

    for key in self.root_key:
        cache_path = f"./downstream/voxceleb2_amsoftmax/cache_wav_paths/cache_{key}.p"
        p = Path(self.roots[key])

        # load cache_path if the file exists
        if os.path.isfile(cache_path):
            # cache dict:
            # {
            #     "speaker_id1": ["wav_a_path1", "wav_a_path2", ...],
            #     "speaker_id2": ["wav_b_path1", "wav_b_path2", ...],
            #     ...,
            # }
            cache_wavs_dict = pickle.load(open(cache_path, "rb"))
            self.all_speakers.extend(list(cache_wavs_dict.keys()))
            for speaker_id in list(cache_wavs_dict.keys()):
                for wavs in cache_wavs_dict[speaker_id]:
                    self.dataset.append(str(p / speaker_id / wavs))
        else:
            speaker_wav_dict = {}
            # gather all speaker directories; black-listed (dev) speakers can be filtered out
            speaker_dirs = [
                f.path.split("/")[-1] for f in os.scandir(self.roots[key]) if f.is_dir()
            ]
            self.all_speakers.extend(speaker_dirs)
            print("searching all wav paths")
            start = time.time()
            for speaker in tqdm.tqdm(speaker_dirs):
                speaker_dir = p / speaker
                wav_list = find_files(speaker_dir)
                speaker_wav_dict[speaker] = []
                for wav in wav_list:
                    self.dataset.append(str(speaker_dir / wav))
                    speaker_wav_dict[speaker].append("/".join(wav.split("/")[-2:]))
            end = time.time()
            print(f"searching all wav paths took {end - start} seconds")
            print(f"saving wav paths to {cache_path} so they can be loaded directly next time")
            pickle.dump(speaker_wav_dict, open(cache_path, "wb"))

    self.speaker_num = len(self.all_speakers)
    self.necessary_dict = self.processing()
    self.label_mapping_spk_id = {}  # map speaker id to speaker index
    self.build_label_mapping()
    self.label = self.build_label(self.dataset)
def TrainConvnetExtractor(trainidx, epoch=20, saveas="convnet.model"):
    cqtfilelist = np.array(find_files(const.PATH_MIDIHCQT, ext="npz"))[trainidx]
    #midifilelist = find_files(const.PATH_MIDI,ext="mid")[:filecnt]
    config.train = True
    config.enable_backprop = True
    convnet = networks.FullCNNFeatExtractor()
    model = networks.ConvnetPredictor(convnet)
    model.to_gpu(0)
    opt = optimizers.AdaDelta()
    opt.setup(model)
    print("train set length: %d" % trainidx.size)
    print("start epochs...")
    S = []
    T = []
    for cqtfile in cqtfilelist:
        dat = np.load(cqtfile)
        spec = utils.PreprocessSpec(dat["spec"])[:const.CQT_H, :, :]
        targ = GetConvnetTargetFromPianoroll(dat["target"]).astype(np.int32)
        assert spec.shape[1] == targ.shape[0]
        S.append(spec)
        T.append(targ)
    S = np.concatenate(S, axis=1)
    T = np.concatenate(T, axis=0)
    for ep in range(epoch):
        sum_loss = 0
        assert S.shape[1] == T.shape[0]
        randidx = np.random.randint(0, S.shape[1] - const.CONV_TRAIN_SEQLEN - 1,
                                    S.shape[1] // const.CONV_TRAIN_SEQLEN * 4)
        for i in range(0, randidx.size - const.CONV_TRAIN_BATCH,
                       const.CONV_TRAIN_BATCH):
            x_batch = np.stack([
                S[:, randidx[j]:randidx[j] + const.CONV_TRAIN_SEQLEN, :]
                for j in range(i, i + const.CONV_TRAIN_BATCH)
            ])
            t_batch = np.stack([
                T[randidx[j]:randidx[j] + const.CONV_TRAIN_SEQLEN, :]
                for j in range(i, i + const.CONV_TRAIN_BATCH)
            ])
            x_in = cp.asarray(x_batch)
            t_in = cp.asarray(t_batch)
            model.cleargrads()
            loss = model(x_in, t_in)
            loss.backward()
            opt.update()
            sum_loss += loss.data
        convnet.save(saveas)
        print("epoch: %d/%d loss:%.04f" %
              (ep + 1, epoch, sum_loss / const.CONV_TRAIN_BATCH))
    convnet.save(saveas)
def getNoteTemplates(path_notes):
    list_templates = []
    list_noteaudio = find_files(path_notes, ext="wav")
    for noteaudio in list_noteaudio:
        S_mag = U.LoadAudio(noteaudio)
        init_H = np.ones((1, S_mag.shape[1]))
        template, activate = NMF.nmf_sklearn(S_mag, k=1, H=init_H, verbose=False)
        list_templates.append(template[:, 0] / np.max(template))
    templates = np.stack(list_templates)
    return templates
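# Usage sketch for getNoteTemplates (directory is hypothetical): one normalized
# spectral template per isolated note recording, stacked into an array of shape
# (n_notes, n_frequency_bins).
templates = getNoteTemplates("data/isolated_notes")
print(templates.shape)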
def reload_data(path_to_features, part):
    matfiles = find_files(path_to_mat + part + '/', ext='mat')
    for i in range(len(matfiles)):
        if matfiles[i][len(path_to_mat) + len(part) + 1:].startswith('LFCC'):
            key = matfiles[i][len(path_to_mat) + len(part) + 6:-4]
            lfcc = sio.loadmat(matfiles[i], verify_compressed_data_integrity=False)['x']
            with open(path_to_features + part + '/' + key + 'LFCC.pkl', 'wb') as handle2:
                pickle.dump(lfcc, handle2, protocol=pickle.HIGHEST_PROTOCOL)
def get_filenames(dir):
    files = find_files(dir, ext='npy')
    filenames = []
    for f in files:
        f = os.path.basename(f)[:-4]  # strip the ".npy" extension
        filenames.append(f)
    return filenames
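# Usage sketch (hypothetical directory): list the stems of every .npy feature file.
for name in get_filenames("features/train"):
    print(name)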
def shuffleCQT_baka():
    TORIDASUBASHO = PATH_hrCQT
    HOZONBASYO = PATH_SAME2
    HOZONBASYO2 = PATH_SAME3
    OVTONE = 2
    audiolist = find_files(PATH_AUDIO, ext="wav")
    cqtlist = find_files(TORIDASUBASHO, ext="npy")
    itemlist = len(audiolist)
    i_counter = 1
    for audiofile, cqtfile in zip(audiolist, cqtlist):
        print("{}/{}".format(i_counter, itemlist))
        filename = audiofile.split('/')[-1]
        albname = audiofile.split('/')[-2]
        # foldname = audiofile.split('/')[-3]
        cqt_filename = cqtfile.split("/")[-1]
        if filename.split(".")[0] != cqt_filename.split(".")[0]:
            print("file_not_match", filename, cqt_filename)
        # make sure the output directories exist
        if not os.path.exists(HOZONBASYO + '/' + albname):
            os.makedirs(HOZONBASYO + '/' + albname)
        if not os.path.exists(HOZONBASYO + '/' + albname + '/' + filename + '.npy'):
            cqt = np.load(cqtfile)
            excqt = np.vstack((cqt, cqt))
            np.save(HOZONBASYO + '/' + albname + '/' + filename + '.npy',
                    np.array(excqt, dtype="float32"))
        if not os.path.exists(HOZONBASYO2 + '/' + albname):
            os.makedirs(HOZONBASYO2 + '/' + albname)
        if not os.path.exists(HOZONBASYO2 + '/' + albname + '/' + filename + '.npy'):
            cqt = np.load(cqtfile)
            excqt = np.vstack((cqt, cqt, cqt))
            np.save(HOZONBASYO2 + '/' + albname + '/' + filename + '.npy',
                    np.array(excqt, dtype="float32"))
        i_counter += 1
def __init__(self, roots, sample_rate, max_time, target_level,
             noise_proportion, snrs, **kwargs):
    self.sample_rate = sample_rate
    self.max_time = max_time
    self.target_level = target_level
    self.noise_proportion = noise_proportion
    self.snrs = snrs
    self.filepths = []
    for root in roots:
        self.filepths += find_files(root)
    assert len(self.filepths) > 0, 'No audio file detected'
    self.noise_sampler = torch.distributions.Normal(0, 1)
def LoadSpectrogram(target="vocal"):
    filelist = find_files('./Spectrogram', ext="npz")
    x_list = []
    y_list = []
    for file in filelist:
        data = np.load(file)
        x_list.append(data['mix'])
        if target == "vocal":
            y_list.append(data['vocal'])
        else:
            y_list.append(data['inst'])
    return x_list, y_list
def LoadDataset(target="vocal"):
    filelist_fft = find_files(C.PATH_FFT, ext="npz")[:200]
    Xlist = []
    Ylist = []
    for file_fft in filelist_fft:
        dat = np.load(file_fft)
        Xlist.append(dat["mix"])
        if target == "vocal":
            assert dat["mix"].shape == dat["vocal"].shape
            Ylist.append(dat["vocal"])
        else:
            assert dat["mix"].shape == dat["inst"].shape
            Ylist.append(dat["inst"])
    return Xlist, Ylist
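# Usage sketch for LoadDataset above: load up to 200 mix/vocal spectrogram pairs;
# the per-file asserts already guarantee matching shapes.
Xlist, Ylist = LoadDataset(target="vocal")
print(len(Xlist), "examples; first spectrogram shape:", Xlist[0].shape)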
# -*- coding: utf-8 -*-
"""
Created on Sat May 7 16:45:29 2016

@author: parallels
"""
import functions
from librosa.util import find_files
from librosa.core import load

audiofilelist = find_files("database/audios/", ext="wav")
print("saving peaks....")
for audiofile in audiofilelist:
    y, sr = load(audiofile)
    filename = audiofile.split("/")[-1] + ".npy"  # -1 takes the last path component
    functions.save_maximum_array(y, filename)
    print("saved:" + filename)
""" Created on Fri Nov 3 10:59:08 2017 @author: wuyiming """ import os from librosa.core import load from librosa.util import find_files import yaml import util PATH_MENDLEY = "MedleyDB/Audio" metadatalist = find_files(PATH_MENDLEY, ext="yaml") all_voctracks = [] all_insttracks = [] for metafile in metadatalist: print("YAML file: %s" % metafile) songname = metafile.split("/")[-2] print("song: %s" % songname) with open(metafile, "r+") as f: data = yaml.load(f) if data["instrumental"] != "no": print("Instrumental track. Skipped.") continue