import librosa
from librosa import core, decompose


def time_stretch_hpss(audio, f):
    # Hybrid time stretch: phase-vocode the harmonic component and
    # WSOLA-stretch the percussive component, then sum. `time_stretch_sola`
    # is assumed to be defined elsewhere in this codebase.
    if f == 1.0:
        return audio
    stft = core.stft(audio)
    # Perform HPSS
    stft_harm, stft_perc = decompose.hpss(stft, kernel_size=31)  # original kernel size 31
    # OLA the percussive part; make sure the signals properly overlap
    y_perc = librosa.util.fix_length(core.istft(stft_perc, dtype=audio.dtype), len(audio))
    y_perc = time_stretch_sola(y_perc, f, wsola=True)
    # Phase-vocode the harmonic part
    stft_stretch = core.phase_vocoder(stft_harm, 1.0 / f)
    # Inverse STFT of the harmonic part
    y_harm = librosa.util.fix_length(core.istft(stft_stretch, dtype=y_perc.dtype), len(y_perc))
    # Add them together
    return y_harm + y_perc
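
# A minimal, self-contained sketch of the same idea using only librosa's
# public API: split with HPSS, phase-vocode each component, and sum. The
# function above instead routes the percussive part through an external
# WSOLA routine; this simplified variant phase-vocodes both components,
# which smears transients more. Parameter defaults here are assumptions.
import librosa

def time_stretch_pv_only(y, f, n_fft=2048, hop=512):
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop)
    D_harm, D_perc = librosa.decompose.hpss(D, kernel_size=31)
    y_out = 0.0
    for D_part in (D_harm, D_perc):
        # rate = 1/f, so f > 1 lengthens the output
        D_stretch = librosa.phase_vocoder(D_part, rate=1.0 / f, hop_length=hop)
        y_out = y_out + librosa.istft(D_stretch, hop_length=hop)
    return y_out
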
def get_data(self, train=True):
    # Assembles one batch of randomly cropped, optionally augmented source
    # clips. Relies on module-level names: train_data_wav, test_data_wav,
    # config, pitch_shift_list, stretch_rate_list, pitch_shift, to_stft,
    # from_polar, phase_vocoder, random_crop, and true_wp.
    batch_out = []
    interval = int(self.clip_sec * self.samplerate) * 2
    for batch_idx in range(self.batch_size):
        data_wav = train_data_wav if train else test_data_wav
        rec_idx = np.random.randint(len(data_wav))
        crop_idx = np.random.randint(len(data_wav[rec_idx][0]) - interval)
        sources = [i[crop_idx:crop_idx + interval] for i in data_wav[rec_idx]]
        if config.pitch_aug and train:
            n_steps = pitch_shift_list[np.random.randint(len(pitch_shift_list))]
            if n_steps != 0:
                sources = [pitch_shift(i, self.samplerate, n_steps=n_steps) for i in sources]
        # Convert every source to a real/imag STFT representation
        sources = [from_polar(to_stft(i, self.nfft)) for i in sources]
        if config.bpm_aug and train:
            rate = stretch_rate_list[np.random.randint(len(stretch_rate_list))]
            if rate != 1.0:
                for i in range(len(sources)):
                    # Rebuild the complex STFT, phase-vocode it, split it again
                    augmented = phase_vocoder(sources[i][:, :, 0] + 1j * sources[i][:, :, 1], rate=rate)
                    sources[i] = np.array([np.real(augmented), np.imag(augmented)]).transpose(1, 2, 0)
        if config.amp_aug and train:
            # Random gain in [0.75, 1.25)
            sources = [i * (0.75 + np.random.random() * 0.5) for i in sources]
        sources = random_crop(sources, self.ydim)
        batch_out.append(sources)
    batch_out = np.array(batch_out).transpose(1, 0, 2, 3, 4)  # (batch, source, ...) -> (source, batch, ...)
    if train and true_wp(config.shuffle_sources_aug_prob) == 1.0:
        # Shuffle each source independently across the batch to decouple sources
        for source_i in range(self.num_sources):
            np.random.shuffle(batch_out[source_i])
    return batch_out
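
# Sketch of the BPM augmentation above in isolation: the loader stores each
# STFT as a (freq, frames, 2) real/imag array, so the complex matrix is
# rebuilt before phase-vocoding and split apart again afterwards. The
# synthetic tone, n_fft, and hop length are assumptions for illustration.
import numpy as np
import librosa

sr, hop = 22050, 256
y = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)  # 1 s tone
stft = librosa.stft(y, n_fft=1024, hop_length=hop)
source = np.stack([stft.real, stft.imag], axis=-1)     # (freq, frames, 2)

complex_stft = source[:, :, 0] + 1j * source[:, :, 1]  # back to complex
stretched = librosa.phase_vocoder(complex_stft, rate=1.2, hop_length=hop)
source = np.array([np.real(stretched), np.imag(stretched)]).transpose(1, 2, 0)
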
def eval(net1, net2, speech_file_loc, melody_file_loc):
    # Evaluates the result of net1, net2 on a given speech file and melody file.
    # speech_file_loc and melody_file_loc are paths to the respective audio files.
    network1, network2 = net1.eval(), net2.eval()

    # Read input audio
    orig_speech = core.load(speech_file_loc, sr)[0]
    inp_speech = DL.remove_silent_frames(orig_speech)
    stft_inp = core.stft(inp_speech, n_fft=nfft, hop_length=hop, win_length=wlen)

    # Extract melody and create its image
    melody = utils.MelodyExt.melody_extraction(melody_file_loc, 'runtime_folder/ref_melody')[0]
    ref_pc = melody[:, 1]
    ref_time = melody[:, 0]
    const = hop * 1.0 / sr
    new_sampling_time = np.arange(const, ref_time[-1], const)
    interp_melody = np.interp(new_sampling_time, ref_time, ref_pc)
    n_frames = new_sampling_time.shape[0]
    idx1 = (1.0 * interp_melody * nfft / sr).astype(int)
    idx2 = np.arange(n_frames)
    pc = np.zeros([1 + nfft // 2, n_frames])  # floor division: the bin count must be an int
    pc[idx1, idx2] = 1
    pc[-1] = 1 * pc[0]  # move bin-0 entries (unvoiced frames) to the last row
    pc[0] = 0 * pc[0]

    # Complete input preprocessing
    rate = stft_inp.shape[1] * 1.0 / n_frames
    stft_inp = core.phase_vocoder(stft_inp, rate, hop)  # stretch input speech to target length
    n_frames += 8 - n_frames % 8  # pad frame count to a multiple of 8 for the network
    stft_inp = np.concatenate(
        [stft_inp, np.zeros([stft_inp.shape[0], n_frames - stft_inp.shape[1]])], axis=1)
    pc = np.concatenate([pc, np.zeros([pc.shape[0], n_frames - pc.shape[1]])], axis=1)
    stft_inp = np.log(1 + np.abs(stft_inp))
    # Make tensors
    stft_inp = torch.from_numpy(stft_inp).float().unsqueeze(0)
    pc = torch.from_numpy(pc).float().unsqueeze(0)

    # Extract output
    encode2 = network2(Variable(pc.to(device)))
    pred, encode1 = network1(Variable(stft_inp.to(device)), encode2)
    pred = pred[0].cpu().data.numpy()
    pred[pred < 0] = 0
    pred = np.exp(pred) - 1  # undo the log(1 + x) compression
    # The 3.0 multiplier increases loudness
    time_pred = 3.0 * utils.gl_rec(pred, hop, wlen, core.istft(pred, hop, wlen))
    return time_pred
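
# The melody "image" built above is a binary (bins x frames) piano-roll-like
# matrix: each frame gets a 1 at the STFT bin nearest its interpolated f0.
# A minimal sketch with made-up contour values; sr, nfft, and hop are
# assumptions:
import numpy as np

sr, nfft, hop = 16000, 1024, 160
ref_time = np.array([0.00, 0.25, 0.50, 0.75])    # melody timestamps (s)
ref_pc = np.array([220.0, 233.0, 247.0, 262.0])  # f0 contour (Hz)

frame_times = np.arange(hop / sr, ref_time[-1], hop / sr)
f0 = np.interp(frame_times, ref_time, ref_pc)  # resample f0 to frame times

pc = np.zeros((1 + nfft // 2, len(frame_times)))
pc[(f0 * nfft / sr).astype(int), np.arange(len(frame_times))] = 1
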
import numpy as np
from librosa import core


def change_speed(input_signal, rate):
    """Change the playback speed of an audio signal.

    Parameters
    ----------
    input_signal : numpy.ndarray
        Input array; must have a numerical type.
    rate : numeric
        Desired rate of change to the speed. To increase the speed, pass a
        value greater than 1.0. To decrease the speed, pass a value between
        0.0 and 1.0.

    Returns
    -------
    numpy.ndarray representing the audio signal with changed speed.
    """
    if input_signal.dtype.kind not in 'iuf':
        raise TypeError("'input_signal' must be an array of integers or floats")
    if rate <= 0:
        raise ValueError('rate must be a positive number')

    # Convert an integer signal to floats in [-1.0, 1.0]
    if input_signal.dtype.kind in 'iu':
        i = np.iinfo(input_signal.dtype)  # iinfo of the input's integer type, not 'float32'
        abs_max = 2 ** (i.bits - 1)
        offset = i.min + abs_max
        input_signal = (input_signal.astype('float32') - offset) / abs_max

    # Transform the signal to the frequency domain
    frequency_domain_signal = core.stft(input_signal)
    # Change speed with the phase-vocoder method
    fds_changed_speed = core.phase_vocoder(frequency_domain_signal, rate)
    # Transform the frequency-domain signal back to the time domain
    output_signal = core.istft(fds_changed_speed, dtype=input_signal.dtype)
    return output_signal
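
# Usage sketch: speeding a synthetic tone up by 50%. Runs against the
# function above; the tone is just a stand-in for real audio.
import numpy as np

sr = 22050
tone = np.sin(2 * np.pi * 440 * np.arange(2 * sr) / sr).astype(np.float32)
faster = change_speed(tone, 1.5)  # roughly 2/3 of the original duration
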
def __getitem__(self, samp_info):
    usr = samp_info[0]   # Which user
    snum = samp_info[1]  # Which song of the user
    inp_start = float(samp_info[4]) * self.sr  # Start index of the time-domain signal
    inp_end = float(samp_info[5]) * self.sr    # End index of the time-domain signal
    lines_read = samp_info[6]
    lines_sung = samp_info[7]

    # Read (spoken) input
    file_path = self.root_dir + str(usr) + '/read/' + str(snum) + '.wav'
    inp_audio = self.all_audio[file_path][int(inp_start):int(inp_end)]
    inp_audio = remove_silent_frames(inp_audio)
    rps = np.random.uniform(-1.0, 1.0)  # random pitch shift in semitones
    inp_rps = librosa.effects.pitch_shift(inp_audio, self.sr, n_steps=rps)
    stft_inp = core.stft(inp_audio, n_fft=self.nfft, hop_length=self.hop, win_length=self.wlen)
    stft_rps = core.stft(inp_rps, n_fft=self.nfft, hop_length=self.hop, win_length=self.wlen)

    # Sung output
    out_start = float(samp_info[2]) * self.sr  # Start index of the time signal
    out_end = float(samp_info[3]) * self.sr    # End index of the time signal
    file_path = self.root_dir + str(usr) + '/sing/' + str(snum) + '.wav'
    out_audio = self.all_audio[file_path][int(out_start):int(out_end)]
    out_rps = librosa.effects.pitch_shift(out_audio, self.sr, n_steps=rps)
    stft_out = core.stft(out_audio, n_fft=self.nfft, hop_length=self.hop, win_length=self.wlen)
    stft_rps_out = core.stft(out_rps, n_fft=self.nfft, hop_length=self.hop, win_length=self.wlen)

    # Phase-vocode the input so its frame count matches the sung target
    rate = stft_inp.shape[1] * 1.0 / stft_out.shape[1]
    stft_inp_orig = 1 * stft_inp
    stft_inp = core.phase_vocoder(stft_inp, rate, self.hop)[:, :stft_out.shape[1]]
    stft_rps = core.phase_vocoder(stft_rps, rate, self.hop)[:, :stft_out.shape[1]]

    # Frame-level phoneme labels for the sung segment
    phn_matrix = np.zeros(stft_out.shape[1]).astype(int)
    hop_time = self.hop * 1.0 / self.sr
    start_time = float(samp_info[2])
    for idx in range(len(lines_read)):
        phn_start, phn_end = extract_time(lines_sung[idx])
        # Parse the phone symbol from the end of the annotation line
        if lines_sung[idx][-3] == ' ':
            cur_phn = lines_sung[idx][-2:-1]
        elif lines_sung[idx][-4] == ' ':
            cur_phn = lines_sung[idx][-3:-1]
        if cur_phn[-1] == ' ':
            cur_phn = cur_phn[0]
        # Only keep phones that sustain for more than a few milliseconds
        if phn_end - phn_start > 0.005:
            start_idx = int((phn_start - start_time) / hop_time)
            end_idx = int((phn_end - start_time) / hop_time)
            phn_matrix[start_idx:end_idx] = int(phn_dict[cur_phn])

    return [
        np.abs(stft_inp), np.abs(stft_out),
        pitch_max(np.abs(stft_inp)),
        pitch_pyin(int(out_start), usr, snum, stft_out.shape),
        rate, stft_inp_orig, stft_inp, stft_out,
        np.abs(stft_rps),
        pitch_max(np.abs(stft_rps)),
        np.abs(stft_rps_out),
        self.fld.index(usr), phn_matrix
    ]
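
# The alignment trick shared by these loaders, in isolation: phase-vocode
# the read-speech STFT by the frame-count ratio so it lines up with the
# sung STFT, then trim to the exact frame count. The signals, n_fft, and
# hop length below are synthetic/assumed for illustration.
import numpy as np
import librosa

sr, hop = 16000, 160
read = np.random.randn(3 * sr).astype(np.float32)  # 3 s of "read" speech
sung = np.random.randn(4 * sr).astype(np.float32)  # 4 s of "sung" speech

stft_read = librosa.stft(read, n_fft=1024, hop_length=hop)
stft_sung = librosa.stft(sung, n_fft=1024, hop_length=hop)

rate = stft_read.shape[1] / stft_sung.shape[1]  # < 1 stretches the read STFT
aligned = librosa.phase_vocoder(stft_read, rate=rate, hop_length=hop)
aligned = aligned[:, :stft_sung.shape[1]]  # trim to the target frame count
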
def __getitem__(self, samp_info):
    usr = samp_info[0]   # Which user
    snum = samp_info[1]  # Which song out of the user's 4 songs
    inp_start = float(samp_info[4]) * self.sr  # Start index of the time-domain signal
    inp_end = float(samp_info[5]) * self.sr    # End index of the time-domain signal
    lines_read = samp_info[6]
    lines_sung = samp_info[7]

    # Warp each read phone to the duration of its sung counterpart, then
    # concatenate the stretched pieces
    inp_audio = np.array([])
    file_path = self.root_dir + str(usr) + '/read/' + str(snum) + '.wav'
    inp_full = self.all_audio[file_path]
    for idx in range(len(lines_read)):
        r_start, r_end = extract_time(lines_read[idx])
        s_start, s_end = extract_time(lines_sung[idx])
        stretch_rate = (r_end - r_start) / (1e-3 + s_end - s_start)
        inp_phn = inp_full[int(r_start * self.sr):int(r_end * self.sr)]
        inp_phn_stretch = librosa.effects.time_stretch(inp_phn, stretch_rate)
        inp_audio = np.append(inp_audio, inp_phn_stretch)

    rps = np.random.uniform(-1.0, 1.0)  # random pitch shift in semitones
    inp_rps = librosa.effects.pitch_shift(inp_audio, self.sr, n_steps=rps)
    stft_inp = core.stft(inp_audio, n_fft=self.nfft, hop_length=self.hop, win_length=self.wlen)
    stft_rps = core.stft(inp_rps, n_fft=self.nfft, hop_length=self.hop, win_length=self.wlen)

    out_start = float(samp_info[2]) * self.sr  # Start index of the time signal
    out_end = float(samp_info[3]) * self.sr    # End index of the time signal
    file_path = self.root_dir + str(usr) + '/sing/' + str(snum) + '.wav'
    out_audio = self.all_audio[file_path][int(out_start):int(out_end)]
    out_rps = librosa.effects.pitch_shift(out_audio, self.sr, n_steps=rps)
    stft_out = core.stft(out_audio, n_fft=self.nfft, hop_length=self.hop, win_length=self.wlen)
    stft_rps_out = core.stft(out_rps, n_fft=self.nfft, hop_length=self.hop, win_length=self.wlen)

    # Match the input length to the target (comment out the next lines to disable)
    rate = stft_inp.shape[1] * 1.0 / stft_out.shape[1]
    stft_inp_orig = 1 * stft_inp
    stft_inp = core.phase_vocoder(stft_inp, rate, self.hop)[:, :stft_out.shape[1]]
    stft_rps = core.phase_vocoder(stft_rps, rate, self.hop)[:, :stft_out.shape[1]]

    # Frame-level phoneme labels for the sung segment
    phn_matrix = np.zeros(stft_out.shape[1]).astype(int)
    hop_time = self.hop * 1.0 / self.sr
    start_time = float(samp_info[2])
    for idx in range(len(lines_read)):
        phn_start, phn_end = extract_time(lines_sung[idx])
        if lines_sung[idx][-3] == ' ':
            cur_phn = lines_sung[idx][-2:-1]
        elif lines_sung[idx][-4] == ' ':
            cur_phn = lines_sung[idx][-3:-1]
        if cur_phn[-1] == ' ':
            cur_phn = cur_phn[0]
        # Only keep phones that sustain for more than a few milliseconds
        if phn_end - phn_start > 0.005:
            start_idx = int((phn_start - start_time) / hop_time)
            end_idx = int((phn_end - start_time) / hop_time)
            phn_matrix[start_idx:end_idx] = int(phn_dict[cur_phn])

    # Input, output, original input-output length ratio, etc.
    return [
        np.abs(stft_inp), np.abs(stft_out),
        pitch_max(np.abs(stft_inp)),
        pitch_pyin(int(out_start), usr, snum, stft_out.shape),
        rate, stft_inp_orig, stft_inp, stft_out,
        np.abs(stft_rps),
        pitch_max(np.abs(stft_rps)),
        np.abs(stft_rps_out),
        self.fld.index(usr), phn_matrix
    ]
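
# Sketch of the per-phone warping above: each read segment is time-stretched
# by its duration ratio to the matching sung segment, then the pieces are
# concatenated. The boundaries below are made-up illustrative values (s):
import numpy as np
import librosa

sr = 16000
read = np.random.randn(2 * sr).astype(np.float32)
read_bounds = [(0.0, 0.4), (0.4, 1.1), (1.1, 2.0)]
sung_bounds = [(0.0, 0.9), (0.9, 2.3), (2.3, 3.1)]

pieces = []
for (r0, r1), (s0, s1) in zip(read_bounds, sung_bounds):
    seg = read[int(r0 * sr):int(r1 * sr)]
    rate = (r1 - r0) / (1e-3 + s1 - s0)  # > 1 shortens, < 1 lengthens
    pieces.append(librosa.effects.time_stretch(seg, rate=rate))
warped = np.concatenate(pieces)
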
def __getitem__(self, index):
    tf = self.id_list[index]
    read_file = self.get_read(tf)
    read_audio = f'{read_file[:-4]}.wav'
    song_audio = f'{tf[:-4]}.wav'
    with open(tf, 'rb') as f:
        song_txt = f.read().splitlines()
    with open(read_file, 'rb') as f:
        read_txt = f.read().splitlines()
    melody_name = "_".join([tf.split('/')[-3], tf.split('/')[-1]])[:-4]
    melody = np.load(f'../sp2si-code/melody_contour/{melody_name}.npy')

    # Pick a window of annotation lines, then grow/shrink it until the sung
    # duration falls in [5, 8] seconds
    index_begin = randint(0, len(read_txt) - 40) + 1
    index_end = index_begin + 10
    song_begin = float(song_txt[index_begin].split()[0])
    song_end = float(song_txt[index_end].split()[1])
    song_dur = song_end - song_begin
    while song_dur < 5 and index_end < len(read_txt) - 2:
        index_end += 1
        song_end = float(song_txt[index_end].split()[1])
        song_dur = song_end - song_begin
    while song_dur > 8:
        index_end -= 1
        song_end = float(song_txt[index_end].split()[1])
        song_dur = song_end - song_begin
    read_begin = float(read_txt[index_begin].split()[0])
    read_end = float(read_txt[index_end].split()[1])
    read_dur = read_end - read_begin

    read_audio = core.load(read_audio, sr=self.sr, mono=True,
                           offset=read_begin, duration=read_dur)[0]
    song_audio = core.load(song_audio, sr=self.sr, mono=True,
                           offset=song_begin, duration=song_dur)[0]
    read_stft = core.stft(read_audio, n_fft=self.nfft, hop_length=self.hop, win_length=self.wlen)
    song_stft = core.stft(song_audio, n_fft=self.nfft, hop_length=self.hop, win_length=self.wlen)
    song_stft = song_stft[..., :-1]

    # Resample the melody contour to this STFT frame rate, crop it to the
    # sung segment, and convert Hz to MIDI note numbers
    melody = torch.from_numpy(melody).unsqueeze(0).unsqueeze(0)
    melody = F.interpolate(melody, scale_factor=(22050 / 16000 / 2), mode='nearest')
    melody = torch.squeeze(melody)
    pitch_begin = int(song_begin * self.sr / self.hop)
    pitch = melody[pitch_begin:pitch_begin + song_stft.shape[1]]
    pitch = librosa.core.hz_to_midi(pitch.cpu().numpy())

    # Phase-vocode the read STFT so it aligns with the sung STFT
    rate = read_stft.shape[1] / song_stft.shape[1]
    read_stft = core.phase_vocoder(read_stft, rate, self.hop)
    read_stft = np.abs(read_stft[:, :song_stft.shape[1]])
    song_stft = np.abs(song_stft)
    if args.feat_type == "mel":
        read_stft = np.matmul(mel_basis, read_stft)
        song_stft = np.matmul(mel_basis, song_stft)
    read_mag = np.log10(np.clip(read_stft, a_min=1e-5, a_max=100000))
    song_mag = np.log10(np.clip(song_stft, a_min=1e-5, a_max=100000))
    return song_mag, read_mag, pitch, read_audio
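
# The melody contour in the loader above was extracted at a different frame
# rate, so it is nearest-neighbour resampled before being cropped and
# converted to MIDI note numbers. The 22050 / 16000 / 2 factor comes from
# the code above; the constant contour is a stand-in:
import numpy as np
import torch
import torch.nn.functional as F
import librosa

contour_hz = np.full(100, 220.0, dtype=np.float32)  # fake f0 track (Hz)
t = torch.from_numpy(contour_hz)[None, None, :]     # (batch, channel, time)
t = F.interpolate(t, scale_factor=22050 / 16000 / 2, mode='nearest')
midi = librosa.hz_to_midi(t.squeeze().numpy())      # Hz -> MIDI numbers
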
def __getitem__(self, index):
    tf = self.id_list[index]
    read_file = self.get_read(tf)
    read_audio = f'{read_file[:-4]}.wav'
    song_audio = f'{tf[:-4]}.wav'
    with open(tf, 'rb') as f:
        song_txt = f.read().splitlines()
    with open(read_file, 'rb') as f:
        read_txt = f.read().splitlines()
    melody_name = "_".join([tf.split('/')[-3], tf.split('/')[-1]])[:-4]
    melody = np.load(f'../sp2si-code/melody_contour/{melody_name}.npy')

    # Pick a window of annotation lines, then grow/shrink it until the sung
    # duration falls in [7, 10] seconds
    index_begin = randint(0, len(read_txt) - 40) + 1
    index_end = index_begin + 30
    song_begin = float(song_txt[index_begin].split()[0])
    song_end = float(song_txt[index_end].split()[1])
    song_dur = song_end - song_begin
    while song_dur < 7 and index_end < len(read_txt) - 2:
        index_end += 1
        song_end = float(song_txt[index_end].split()[1])
        song_dur = song_end - song_begin
    while song_dur > 10:
        index_end -= 1
        song_end = float(song_txt[index_end].split()[1])
        song_dur = song_end - song_begin
    read_begin = float(read_txt[index_begin].split()[0])
    read_end = float(read_txt[index_end].split()[1])
    read_dur = read_end - read_begin

    read_audio = core.load(read_audio, sr=self.sr, mono=True,
                           offset=read_begin, duration=read_dur)[0]
    song_audio = core.load(song_audio, sr=self.sr, mono=True,
                           offset=song_begin, duration=song_dur)[0]

    # Collect matching (start, end) sample indices for each annotation line,
    # merging lines shorter than 0.2 s into the following one
    index_sing_dur = []
    index_read_dur = []
    t = self.sr
    rsi = float(read_txt[index_begin].split()[0])
    ssi = float(song_txt[index_begin].split()[0])
    i = index_begin
    while i < index_end + 1:
        s_begin, s_end, _ = song_txt[i].split()
        r_begin, r_end, _ = read_txt[i].split()
        s_begin, s_end, r_begin, r_end = float(s_begin), float(s_end), float(r_begin), float(r_end)
        while (s_end - s_begin < 0.2 or r_end - r_begin < 0.2) and i < index_end + 1:
            i = i + 1
            _, s_end, _ = song_txt[i].split()
            _, r_end, _ = read_txt[i].split()
            s_end, r_end = float(s_end), float(r_end)
        index_read_dur += [((r_begin - rsi) * t, (r_end - rsi) * t)]
        index_sing_dur += [((s_begin - ssi) * t, (s_end - ssi) * t)]
        i = i + 1

    # Stretch each read segment to the duration of its sung counterpart
    read_audio_list = []
    for i in range(len(index_read_dur)):
        r_begin, r_end = index_read_dur[i]
        s_begin, s_end = index_sing_dur[i]
        if r_end - r_begin == 0 or s_end - s_begin == 0:
            read_audio_list += [read_audio[int(r_begin):int(r_end)]]
            continue
        rate = (s_end - s_begin) / (r_end - r_begin)
        read_audio_slice = librosa.effects.time_stretch(read_audio[int(r_begin):int(r_end)], 1 / rate)
        read_audio_list += [read_audio_slice]
    read_audio = np.concatenate(read_audio_list, axis=0)

    read_stft = core.stft(read_audio, n_fft=self.nfft, hop_length=self.hop, win_length=self.wlen)
    song_stft = core.stft(song_audio, n_fft=self.nfft, hop_length=self.hop, win_length=self.wlen)
    song_stft = song_stft[..., :-1]

    # Phase-vocode the (roughly pre-aligned) read STFT to the exact length
    rate = read_stft.shape[1] / song_stft.shape[1]
    read_stft = core.phase_vocoder(read_stft, rate, self.hop)
    read_stft = np.abs(read_stft[:, :song_stft.shape[1]])
    song_stft = np.abs(song_stft)
    if args.feat_type == "mel":
        read_stft = np.matmul(mel_basis, read_stft)
        song_stft = np.matmul(mel_basis, song_stft)
    read_mag = np.log10(np.clip(read_stft, a_min=1e-5, a_max=100000))
    song_mag = np.log10(np.clip(song_stft, a_min=1e-5, a_max=100000))
    return song_mag, read_mag, read_audio
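
# These __getitem__ implementations slot into a standard PyTorch pipeline.
# A hypothetical usage sketch: the dataset class name and constructor
# arguments are assumptions, and batch_size=1 sidesteps the variable-length
# outputs (larger batches would need a custom collate_fn).
from torch.utils.data import DataLoader

dataset = Sp2SiDataset(id_list, sr=22050, nfft=1024, hop=256, wlen=1024)
loader = DataLoader(dataset, batch_size=1, shuffle=True, num_workers=2)
song_mag, read_mag, read_audio = next(iter(loader))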