def pysptk_mfcc(self):
    self.frame_length = 1024
    self.hop_length = 80
    self.pitch = pysptk.swipe(self.audio.astype(np.float64), fs=self.sr,
                              hopsize=self.hop_length, min=60, max=240, otype="pitch")
    self.source_excitation = pysptk.excite(self.pitch, self.hop_length)
    # Note that almost all pysptk functions assume the input array is C-contiguous
    # and of np.float64 element type
    frames = librosa.util.frame(self.audio, frame_length=self.frame_length,
                                hop_length=self.hop_length).astype(np.float64).T
    # Windowing
    frames *= pysptk.blackman(self.frame_length)
    assert frames.shape[1] == self.frame_length
    # Order of mel-cepstrum
    self.order = 25
    self.alpha = 0.41
    self.mc = pysptk.mcep(frames, self.order, self.alpha)
    logH = pysptk.mgc2sp(self.mc, self.alpha, 0.0, self.frame_length).real
    librosa.display.specshow(logH.T, sr=self.sr, hop_length=self.hop_length,
                             x_axis="time", y_axis="linear")
def __call__(self, pkg, cached_file=None):
    pkg = format_package(pkg)
    wav = pkg['chunk']
    wav = wav.data.numpy()
    max_frames = wav.shape[0] // self.hop
    if cached_file is not None:
        # load pre-computed data
        proso = torch.load(cached_file)
        beg_i = pkg['chunk_beg_i'] // self.hop
        end_i = pkg['chunk_end_i'] // self.hop
        proso = proso[:, beg_i:end_i]
        pkg[self.name] = proso
    else:
        # first compute logF0 and voiced/unvoiced flag
        # f0 = pysptk.rapt(wav.astype(np.float32),
        #                  fs=self.sr, hopsize=self.hop,
        #                  min=self.f0_min, max=self.f0_max,
        #                  otype='f0')
        f0 = pysptk.swipe(wav.astype(np.float64),
                          fs=self.sr, hopsize=self.hop,
                          min=self.f0_min, max=self.f0_max,
                          otype='f0')
        # sound = pm.Sound(wav.astype(np.float32), self.sr)
        # f0 = sound.to_pitch(self.hop / 16000).selected_array['frequency']
        if len(f0) < max_frames:
            pad = max_frames - len(f0)
            f0 = np.concatenate((f0, f0[-pad:]), axis=0)
        lf0 = np.log(f0 + 1e-10)
        lf0, uv = interpolation(lf0, -1)
        lf0 = torch.tensor(lf0.astype(np.float32)).unsqueeze(0)[:, :max_frames]
        uv = torch.tensor(uv.astype(np.float32)).unsqueeze(0)[:, :max_frames]
        if torch.sum(uv) == 0:
            # if the chunk is completely unvoiced, make lf0 the min val
            lf0 = torch.ones(uv.size()) * np.log(self.f0_min)
        # assert lf0.min() > 0, lf0.data.numpy()
        # secondly obtain zcr
        zcr = librosa.feature.zero_crossing_rate(y=wav,
                                                 frame_length=self.win,
                                                 hop_length=self.hop)
        zcr = torch.tensor(zcr.astype(np.float32))
        zcr = zcr[:, :max_frames]
        # finally obtain energy
        # (librosa.feature.rmse was renamed to librosa.feature.rms in librosa >= 0.7)
        egy = librosa.feature.rmse(y=wav, frame_length=self.win,
                                   hop_length=self.hop,
                                   pad_mode='constant')
        egy = torch.tensor(egy.astype(np.float32))
        egy = egy[:, :max_frames]
        proso = torch.cat((lf0, uv, egy, zcr), dim=0)
        if self.der_order > 0:
            deltas = [proso]
            for n in range(1, self.der_order + 1):
                deltas.append(librosa.feature.delta(proso.numpy(), order=n))
            proso = torch.from_numpy(np.concatenate(deltas))
        pkg[self.name] = proso
    # Overwrite resolution to hop length
    pkg['dec_resolution'] = self.hop
    return pkg
def get_coefs(wav_file_path):
    sample_rate, x = wavfile.read(wav_file_path)
    # al.play(x.astype(float) / x.max(), fs=sample_rate)
    frames = librosa.util.frame(
        x, frame_length=frame_length, hop_length=hop_length).astype(np.float64).T
    frames *= pysptk.blackman(frame_length)
    f0 = pysptk.swipe(
        x.astype(np.float64), fs=sample_rate, hopsize=hop_length, min=50, max=500)
    # order = 40
    # alpha = 0.41
    mc = np.apply_along_axis(
        pysptk.mcep, 1, frames, order, alpha)
    # returns the sample rate, the f0 contour, and the mel-cepstrum coefficients
    return sample_rate, f0, mc
def cal_prosody(self, wav):
    # Input:  wav: audio signal in numpy.array format
    # Output: proso: Tensor, [max_frames, 4]
    max_frames = wav.shape[0] // self.hop
    f0 = pysptk.swipe(wav.astype(np.float64),
                      fs=self.sr, hopsize=self.hop,
                      min=self.f0_min, max=self.f0_max,
                      otype='f0')
    lf0 = np.log(f0 + 1e-10)
    lf0, uv = self.interpolation(lf0, -1)
    lf0 = torch.tensor(lf0.astype(np.float32)).unsqueeze(0)[:, :max_frames]  # (1, num_frame)
    uv = torch.tensor(uv.astype(np.float32)).unsqueeze(0)[:, :max_frames]
    if torch.sum(uv) == 0:
        # if the chunk is completely unvoiced, make lf0 the min val
        lf0 = torch.ones(uv.size()) * np.log(self.f0_min)
    assert lf0.min() > 0, lf0.data.numpy()
    # secondly obtain zcr
    zcr = librosa.feature.zero_crossing_rate(y=wav,
                                             frame_length=self.win,
                                             hop_length=self.hop)
    zcr = torch.tensor(zcr.astype(np.float32))
    zcr = zcr[:, :max_frames]
    # finally obtain energy
    egy = librosa.feature.rms(y=wav, frame_length=self.win,
                              hop_length=self.hop,
                              pad_mode='constant')
    egy = torch.tensor(egy.astype(np.float32))
    egy = egy[:, :max_frames]
    proso = torch.cat((lf0, uv, egy, zcr), dim=0).unsqueeze(0)  # (1, 4, num_frame)
    return proso
def pysptk_featurize(audiofile):
    labels = list()
    features = list()
    fs, x = wavfile.read(audiofile)
    f0_swipe = pysptk.swipe(x.astype(np.float64), fs=fs, hopsize=80,
                            min=60, max=200, otype="f0")
    features = features + stats(f0_swipe)
    labels = stats_labels('f0_swipe', labels)
    f0_rapt = pysptk.rapt(x.astype(np.float32), fs=fs, hopsize=80,
                          min=60, max=200, otype="f0")
    features = features + stats(f0_rapt)
    labels = stats_labels('f0_rapt', labels)
    # xw: Blackman-windowed copy of the signal (assumed; `xw` was not defined in this snippet)
    xw = x.astype(np.float64) * pysptk.blackman(len(x))
    mgc = pysptk.mgcep(xw, 20, 0.0, 0.0)
    features = features + stats(mgc)
    labels = stats_labels('mel-spectrum envelope', labels)
    return features, labels
def test_swipe_regression():
    # Ground truth data is generated by:
    #
    #   $ wav2raw pysptk/example_audio_data/arctic_a0007.wav
    #
    #   $ x2x +sf ./pysptk/example_audio_data/arctic_a0007.raw | \
    #       pitch -a 1 -s 16 -p 80 -L 60 -H 240 -o 0 > \
    #       arctic_a007_p16_L60_H240_o0_swipe.pitch
    #
    #   $ dmp +f arctic_a007_p16_L60_H240_o0_swipe.pitch | awk '{print $2}' > \
    #       arctic_a007_p16_L60_H240_o0_swipe.txt
    #
    #   $ pitch -h
    #   ...
    #
    #   SPTK: version 3.10
    #   CVS Info: $Id: pitch.c,v 1.53 2016/12/25 05:00:19 uratec Exp $
    ground_truth_path = join(
        dirname(__file__), "data", "arctic_a007_p16_L60_H240_o0_swipe.txt"
    )
    with open(ground_truth_path) as f:
        ground_truth = np.asarray([float(s) for s in [line for line in f.readlines()]])
    ground_truth = ground_truth.astype(np.float32)
    fs, x = wavfile.read(pysptk.util.example_audio_file())
    assert fs == 16000
    # Since SPTK might have a memory corruption bug and the result might be
    # non-deterministic, test it multiple times...
    for _ in range(5):
        f0 = pysptk.swipe(
            x.astype(np.float64), fs=fs, hopsize=80, min=60, max=240, otype=0
        )
        assert np.allclose(ground_truth, f0)
def source_excitation_generation(np_data, rate):
    pitch = ps.swipe(np_data.astype(np.float64), fs=rate, hopsize=HOP_LENGTH,
                     min=60, max=240, otype="pitch")
    source_excitation = ps.excite(pitch, HOP_LENGTH)
    return source_excitation
def F0_swipe(
    waveform,
    hop_length=None,
    sr=None,
    hop_time=None,
    f_min=60,        # default in swipe
    f_max=240,       # default in swipe
    threshold=0.5,   # custom default (0.3 in swipe)
):
    if hop_length is not None:
        hopsize = hop_length
    else:
        hopsize = int(sr * hop_time)
    if waveform.ndim == 1:
        return torch.from_numpy(
            swipe(
                waveform.contiguous().double().numpy(),
                fs=sr,
                hopsize=hopsize,
                min=f_min,
                max=f_max,
                threshold=threshold,
                otype="f0",
            )).float()
    elif waveform.ndim == 2:  # (B, N)
        f0 = []
        for audio in waveform:
            f0.append(
                torch.from_numpy(
                    swipe(
                        audio.contiguous().double().numpy(),
                        fs=sr,
                        hopsize=hopsize,
                        min=f_min,
                        max=f_max,
                        threshold=threshold,
                        otype="f0",
                    )).float())
        return torch.stack(f0)
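# A minimal usage sketch for F0_swipe above (not from the original source): assumes a
# 16 kHz batch of waveforms given as a torch tensor; hop_time=0.01 maps to a 160-sample hop.
import torch

wav = torch.randn(2, 16000)                   # (B, N) dummy batch, one second each
f0 = F0_swipe(wav, sr=16000, hop_time=0.01)   # stacked (B, num_frames) tensor
print(f0.shape)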
def get_synt_wav(wav_file_path):
    # Synthesis from mel-cepstrum
    sample_rate, x = wavfile.read(wav_file_path)
    # assert sample_rate == 16000
    # al.play(x.astype(float) / x.max(), fs=sample_rate)
    # Audio(x, rate=sample_rate)
    # all pysptk functions assume the input array is C-contiguous
    # and of np.float64 element type
    frames = librosa.util.frame(
        x, frame_length=frame_length, hop_length=hop_length).astype(np.float64).T
    # Windowing
    frames *= pysptk.blackman(frame_length)
    # assert frames.shape[1] == frame_length
    # F0 estimation
    f0 = pysptk.swipe(
        x.astype(np.float64), fs=sample_rate, hopsize=hop_length, min=50, max=500)
    generator = excite.ExcitePulse(sample_rate, hop_length, False)
    source_excitation = generator.gen(f0)
    # apply the function along the `time` axis (=1)
    mc = np.apply_along_axis(
        pysptk.mcep, 1, frames, order, alpha)
    # Convert mel-cepstrum to MLSADF coefficients
    b = np.apply_along_axis(pysptk.mc2b, 1, mc, alpha)
    synthesizer = pysptk.synthesis.Synthesizer(
        pysptk.synthesis.MLSADF(order=order, alpha=alpha), hop_length)
    x_synthesized = synthesizer.synthesis(source_excitation, b)
    # Audio(x_synthesized, rate=sample_rate)
    # al.play(x_synthesized.astype(float) / x_synthesized.max(), fs=sample_rate)
    return x_synthesized
def pitch_detect(sndarray, fs, chunk_size):
    """pitch_detect(sndarray, fs, chunk_size)

    Computes the fundamental frequency (pitch) of blocks/chunks of audio.

    Parameters: sndarray   - discrete audio data
                fs         - sampling frequency
                chunk_size - hop size in samples
    Returns:    f0
    """
    new_sndarray = numpy.asarray(numpy.float64(sndarray))
    f0 = pysptk.swipe(numpy.asarray(new_sndarray), fs, chunk_size, 65, 500, 0.001, 1)
    return f0
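# A minimal usage sketch for pitch_detect above (hypothetical; "speech.wav" is a placeholder path):
import numpy
from scipy.io import wavfile

fs, samples = wavfile.read("speech.wav")
f0 = pitch_detect(samples, fs, chunk_size=80)  # 80-sample hop (5 ms at 16 kHz)
print(f0[:10])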
def __call__(self, pkg, cached_file=None):
    pkg = format_package(pkg)
    wav = pkg['chunk']
    wav = wav.data.numpy()
    max_frames = wav.shape[0] // self.hop
    if cached_file is not None:
        # load pre-computed data
        proso = torch.load(cached_file)
        beg_i = pkg['chunk_beg_i'] // self.hop
        end_i = pkg['chunk_end_i'] // self.hop
        proso = proso[:, beg_i:end_i]
        pkg['prosody'] = proso
    else:
        # first compute logF0 and voiced/unvoiced flag
        f0 = pysptk.swipe(wav.astype(np.float64),
                          fs=self.sr, hopsize=self.hop,
                          min=self.f0_min, max=self.f0_max,
                          otype='f0')
        lf0 = np.log(f0 + 1e-10)
        lf0, uv = interpolation(lf0, -1)
        lf0 = torch.tensor(lf0.astype(np.float32)).unsqueeze(0)[:, :max_frames]
        uv = torch.tensor(uv.astype(np.float32)).unsqueeze(0)[:, :max_frames]
        if torch.sum(uv) == 0:
            # if the chunk is completely unvoiced, make lf0 the min val
            lf0 = torch.ones(uv.size()) * np.log(self.f0_min)
        assert lf0.min() > 0, lf0.data.numpy()
        # secondly obtain zcr
        zcr = librosa.feature.zero_crossing_rate(y=wav,
                                                 frame_length=self.win,
                                                 hop_length=self.hop)
        zcr = torch.tensor(zcr.astype(np.float32))
        zcr = zcr[:, :max_frames]
        # finally obtain energy
        # (librosa.feature.rmse was renamed to librosa.feature.rms in librosa >= 0.7)
        egy = librosa.feature.rmse(y=wav, frame_length=self.win,
                                   hop_length=self.hop,
                                   pad_mode='constant')
        egy = torch.tensor(egy.astype(np.float32))
        egy = egy[:, :max_frames]
        proso = torch.cat((lf0, uv, egy, zcr), dim=0)
        pkg['prosody'] = proso
    return pkg
def pysptk_features(x):
    import pysptk
    wav_max = 2**15 - 1
    x = (x * wav_max).astype(np.float64)
    frame_length = 512
    hop_length = 160
    frames = librosa.util.frame(x, frame_length=frame_length, hop_length=hop_length).T
    frames *= pysptk.blackman(frame_length)
    order = 25    # seems to be pretty standard, results in 26 values
    alpha = 0.42  # this value is best for 16kHz sampling according to docs http://ftp.jaist.ac.jp/pub/pkgsrc/distfiles/SPTKref-3.9.pdf
    mcep = pysptk.mcep(frames, order, alpha)
    f0 = pysptk.swipe(x, fs=16000, hopsize=hop_length, min=60, max=240, otype="f0")
    f0 = f0[1:1 + mcep.shape[0]]  # cut off ends to match mcep lengths
    return np.concatenate([f0[:, np.newaxis], mcep], 1).astype(np.float32)
def process_audio(self, x):
    pitch = pysptk.swipe(x, fs=self.sr, hopsize=self.hop_length,
                         min=self.f0_floor, max=self.f0_ceil, otype="pitch")
    f0, timeaxis = pyworld.dio(x, fs=self.sr, f0_floor=self.f0_floor,
                               f0_ceil=self.f0_ceil, frame_period=self.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, self.sr)
    # x_frame = self.samples_to_frames(x)
    if self.use_mel:
        mel = librosa.feature.melspectrogram(x, sr=self.sr, n_fft=self.fft_length,
                                             hop_length=self.hop_length)
        mfcc = librosa.feature.mfcc(S=mel, sr=self.sr, n_mfcc=self.n_mfcc)
        if self.norm_mfcc:
            mfcc = self.standardize_mfcc(mfcc)
        return {'mel': mel.T, 'mfcc': mfcc.T, 'f0': f0, 'pitch': pitch}
    else:
        ap = pyworld.d4c(x, f0, timeaxis, self.sr, fft_size=self.fft_length)  # Aperiodicity
        sp = pyworld.cheaptrick(x, f0, timeaxis, self.sr, fft_size=self.fft_length)
        return {'sp': sp, 'ap': ap, 'f0': f0, 'pitch': pitch}
def get_MCEP(self, utterance):
    utterance = librosa.util.normalize(utterance)
    utterance = utterance + np.random.normal(
        loc=0, scale=0.0000001, size=utterance.shape[0])
    utterance = librosa.util.normalize(utterance)
    utterance = utterance.astype(np.float64)  # necessary for synthesizer
    frames = librosa.util.frame(utterance,
                                frame_length=self.frame_length,
                                hop_length=self.hop_length).astype(np.float64).T
    # Windowing
    frames *= pysptk.blackman(self.frame_length)
    assert frames.shape[1] == self.frame_length
    # Pitch
    pitch = pysptk.swipe(utterance.astype(np.float64),
                         fs=self.sr, hopsize=self.hop_length,
                         min=60, max=240, otype="pitch")
    mcep = pysptk.mcep(frames, self.order, self.alpha)
    return mcep, pitch
f.write("order,time\n") for order in (0, 4, 9, 14, 24, 49): start = time.time() # Note that almost all of pysptk functions assume input array is C-contiguous and np.float64 element type frames = librosa.util.frame(x, frame_length=frame_length, hop_length=hop_length).astype(np.float64).T # Windowing frames *= pysptk.blackman(frame_length) assert frames.shape[1] == frame_length pitch = pysptk.swipe(x.astype(np.float64), fs=sr, hopsize=hop_length, min=60, max=240, otype="pitch") source_excitation = pysptk.excite(pitch, hop_length) # Order of mel-cepstrum mc = pysptk.mcep(frames, order, alpha) logH = pysptk.mgc2sp(mc, alpha, 0.0, frame_length).real print(mc.shape) #plt.plot(mc) #plotname="x_syn_coefs_" + str(order) + ".png" #plt.savefig(plotname) # Convert mel-cesptrum to MLSADF coefficients b = pysptk.mc2b(mc, alpha)
def __test(x, fs, hopsize, otype):
    pysptk.swipe(x, fs, hopsize, otype=otype)
def __test(x, fs, hopsize, otype):
    f0 = pysptk.swipe(x, fs, hopsize, otype=otype)
    assert np.all(np.isfinite(f0))
    if otype == 1:
        assert np.all(f0 >= 0)
OUT_WAVE_FILE = "out.wav" # 分析再合成した音声 # 音声の読み込み fs, x = wavfile.read(IN_WAVE_FILE) x = x.astype(np.float64) # 音声の切り出しと窓掛け frames = librosa.util.frame(x, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH).astype(np.float64).T frames *= pysptk.blackman(FRAME_LENGTH) # 窓掛け(ブラックマン窓) # ピッチ抽出 pitch = pysptk.swipe(x, fs=fs, hopsize=HOP_LENGTH, min=MIN_F0, max=MAX_F0, otype="pitch") # 励振源信号(声帯音源)の生成 source_excitation = pysptk.excite(pitch, HOP_LENGTH) # 線形予測分析による線形予測符号化(LPC)係数の抽出 lpc = pysptk.lpc(frames, ORDER) lpc[:, 0] = np.log(lpc[:, 0]) # LPC係数をPARCOR係数に変換 parcor = pysptk.lpc2par(lpc) # 全極フィルタの作成 synthesizer = Synthesizer(AllPoleLatticeDF(order=ORDER), HOP_LENGTH)
def get_target_feats(utterance, utterance_wav, alignments):
    '''Generates target features for an utterance (in this case duration,
    initial phone fundamental frequency, final phone fundamental frequency,
    and energy).'''
    # phone_start = int(alignments[0] * fs)
    first_phone_start = alignments[0][0]
    first_phone_end = alignments[0][1]
    # print("START: " + str(phone_start))
    # phone_end = int(alignments[1] * fs)
    # print("END: " + str(phone_end))
    last_phone_start = alignments[-1][0]
    last_phone_end = alignments[-1][1]
    # print(phone_start)
    # print(phone_end)
    # print(utterance_wav)
    # print(len(utterance_wav))
    duration = last_phone_end - first_phone_start
    first_phone_samples = utterance_wav[first_phone_start:first_phone_end]
    last_phone_samples = utterance_wav[last_phone_start:last_phone_end]
    all_phone_samples = utterance_wav[first_phone_start:last_phone_end]
    # phone_test = utterance_wav[phone_start]
    # try:
    '''
    print(first_phone_start)
    print(first_phone_end)
    print(last_phone_start)
    print(last_phone_end)
    '''
    try:
        f_0_init = np.mean(pysptk.swipe(first_phone_samples.astype(np.float64),
                                        fs=fs, hopsize=100, otype='f0'))
        # print(f_0_init)
        f_0_end = np.mean(pysptk.swipe(last_phone_samples.astype(np.float64),
                                       fs=fs, hopsize=100, otype='f0'))
    except:
        print("Out of bounds!!")
        print(utterance)
        print(first_phone_start)
        print(first_phone_end)
        print(last_phone_start)
        print(last_phone_end)
        print(alignments)
        raise Exception("Out of bounds")
        return (0, 0, 0, 0)
    # print(f_0_end)
    # except IndexError:  # For "Index Error: Out of bounds on buffer access (axis 0)"
    # mfcc = pysptk.mfcc(samples)
    # pitch = pysptk.swipe(phone_samples.astype(np.float64), fs=fs, hopsize=100, otype='pitch')
    # excitation = pysptk.excite(pitch)
    # excitation_mu = np.mean(excitation)
    # excitation_std = np.std(excitation)
    # print()
    energy = np.sum(np.square(all_phone_samples)) / duration
    return duration, f_0_init, f_0_end, energy
def eval_pitch(ted_audio_path, user_audio_path, png_save_path):
    sr, x = wavfile.read(ted_audio_path)  # Ted's voice
    assert sr == 16000
    x = x.astype(np.float64)
    frame_length = 1024
    hop_length = 80
    f_you = pysptk.swipe(x.astype(np.float64), fs=sr, hopsize=hop_length,
                         min=60, max=240, otype="f0")
    sr1, x1 = wavfile.read(user_audio_path)
    assert sr1 == 16000
    x1 = x1.astype(np.float64)
    frame_length = 1024
    hop_length = 80
    # F0 estimation: extract the fundamental frequency per frame (same procedure as in def strength)
    f_ted = pysptk.swipe(x1.astype(np.float64), fs=sr1, hopsize=hop_length,
                         min=60, max=240, otype="f0")
    plt.figure(figsize=(20, 5))
    ##############
    width = int(len(f_ted) / 22)  # adjust the window width
    width1 = int(len(f_you) / 22)
    if np.where(f_you >= 60)[0][0] > np.where(f_ted >= 60)[0][0]:  # if the Ted contour starts earlier (on the left)
        diff = np.where(f_you >= 60)[0][0] - np.where(f_ted >= 60)[0][0]
        zero_ = np.zeros(diff)
        new_f0 = np.r_[zero_, f_ted]
        cal0 = copy.copy(new_f0)
        cal0[np.where(cal0 < 75)] = 0
        new_f0[np.where(new_f0 < 75)] = 0
        value = []
        loc = []
        c3 = 0
        i = 0
        new_f0 = list(new_f0)
        while i < len(range(len(new_f0))):
            ten = new_f0[i:i + width]
            if ten == []:
                break
            a = max(ten)
            b = ten.index(a)
            value.append(a)
            loc.append(i + b)
            if i + width > len(new_f0):
                break
            else:
                c3 = c3 + 1
                i = width * c3
        base = np.empty(len(new_f0))
        base.fill(np.nan)
        for i in range(len(loc)):
            location = loc[i]
            base[location] = value[i]
        df_blue = pd.DataFrame(base)
        plt.figure(figsize=(20, 5))
        df_blue.interpolate(method='polynomial', order=2, linewidth=2, inplace=True)
        bbb = list(df_blue[0])
        ccc = list(map(lambda x: 0 if x < 0 else x, bbb))
        df_blue[0] = ccc
        df_blue.fillna(0, inplace=True)
        for g in range(len(df_blue)):
            if df_blue[0][g] > max(f_ted) * 1.2:
                df_blue[0][g] = max(f_ted) * 1.2
        # the padded user contour is you0
        you0 = f_you
        you0[np.where(you0 < 75)] = 0
        value1 = []
        loc1 = []
        c1 = 0
        i1 = 0
        you0 = list(you0)
        while i1 < len(range(len(you0))):
            ten1 = you0[i1:i1 + width1]
            if ten1 == []:
                break
            a1 = max(ten1)
            b1 = ten1.index(a1)
            value1.append(a1)
            loc1.append(i1 + b1)
            if i1 + width1 > len(you0):
                break
            else:
                c1 = c1 + 1
                i1 = width1 * c1
        base1 = np.empty(len(you0))
        base1.fill(np.nan)
        for i in range(len(loc1)):
            location1 = loc1[i]
            base1[location1] = value1[i]
        df_red = pd.DataFrame(base1)
        df_red.interpolate(method='polynomial', order=2, linewidth=2, inplace=True)
        bbb1 = list(df_red[0])
        ccc1 = list(map(lambda x: 0 if x < 0 else x, bbb1))
        df_red[0] = ccc1
        df_red.fillna(0, inplace=True)
        for h in range(len(df_red)):
            if df_red[0][h] > max(f_you) * 1.2:
                df_red[0][h] = max(f_you) * 1.2
        df_red[0] = df_red[0] * max(f_ted) / max(f_you)
        area = []
        diff_areas = []
        if len(df_red[0]) > len(df_blue[0]):
            for i in range(diff, len(df_blue[0])):
                area.append(df_blue[0][i])
                diff_areas.append(abs(df_red[0][i] - df_blue[0][i]))
            result = 1 - (sum(diff_areas) / sum(area))
        else:
            for i in range(diff, len(df_red[0])):
                area.append(df_blue[0][i])
                diff_areas.append(abs(df_red[0][i] - df_blue[0][i]))
            result = 1 - (sum(diff_areas) / sum(area))
        ranks = []
        for i in range(1, len(df_blue[0]) - 1, 1):
            if df_blue[0][i] > 60:
                if df_blue[0][i] > df_blue[0][i - 1] and df_blue[0][i] > df_blue[0][i + 1]:
                    ranks.append(df_blue[0][i] / max(df_blue[0]))
        ranks1 = []
        for i in range(1, len(df_red[0]) - 1, 1):
            if df_red[0][i] > 60:
                if df_red[0][i] > df_red[0][i - 1] and df_red[0][i] > df_red[0][i + 1]:
                    ranks1.append(df_red[0][i] / max(df_red[0]))
        diffrent = []
        if len(ranks) > len(ranks1):
            for i in range(len(ranks1)):
                diffrent.append(abs(ranks1[i] - ranks[i]))
        else:
            for i in range(len(ranks)):
                diffrent.append(abs(ranks1[i] - ranks[i]))
        points4 = 1 - (sum(diffrent) / sum(ranks))
    else:  # the Ted contour starts later
        diff = np.where(f_ted >= 60)[0][0] - np.where(f_you >= 60)[0][0]
        zero_ted = np.zeros(diff)
        new_ted = np.r_[zero_ted, f_you]
        new_ted[np.where(new_ted < 75)] = 0
        value1 = []
        loc1 = []
        c1 = 0
        i1 = 0
        new_ted = list(new_ted)
        while i1 < len(range(len(new_ted))):
            ten1 = new_ted[i1:i1 + width1]
            if ten1 == []:
                break
            a1 = max(ten1)
            b1 = ten1.index(a1)
            value1.append(a1)
            loc1.append(i1 + b1)
            if i1 + width1 > len(new_ted):
                break
            else:
                c1 = c1 + 1
                i1 = width1 * c1
        base = np.empty(len(new_ted))
        base.fill(np.nan)
        for h in range(len(loc1)):
            location1 = loc1[h]
            base[location1] = value1[h]
        df_blue = pd.DataFrame(base)
        df_blue.interpolate(method='polynomial', order=2, linewidth=2, inplace=True)
        bbb = list(df_blue[0])
        ccc = list(map(lambda x: 0 if x < 0 else x, bbb))
        df_blue[0] = ccc
        df_blue.fillna(0, inplace=True)
        for g in range(len(df_blue)):
            if df_blue[0][g] > max(f_you) * 1.2:
                df_blue[0][g] = max(f_you) * 1.2
        f_ted1 = f_ted
        cal2 = copy.copy(f_ted1)
        cal2[np.where(f_ted1 < 75)] = 0
        f_ted1[np.where(f_ted1 < 75)] = 0
        value2 = []
        loc2 = []
        c2 = 0
        i2 = 0
        f_ted1 = list(f_ted1)
        while i2 < len(range(len(f_ted1))):
            ten2 = f_ted1[i2:i2 + width]
            if ten2 == []:
                break
            a2 = max(ten2)
            b2 = ten2.index(a2)
            value2.append(a2)
            loc2.append(i2 + b2)
            if i2 + width > len(f_ted1):
                break
            else:
                c2 = c2 + 1
                i2 = width * c2
        base2 = np.empty(len(f_ted1))
        base2.fill(np.nan)
        for i in range(len(loc2)):
            location2 = loc2[i]
            base2[location2] = value2[i]
        df_red = pd.DataFrame(base2)
        df_red.interpolate(method='polynomial', order=2, linewidth=2, inplace=True)
        bbb2 = list(df_red[0])
        ccc2 = list(map(lambda x: 0 if x < 0 else x, bbb2))
        df_red[0] = ccc2
        df_red.fillna(0, inplace=True)
        for g in range(len(df_red)):
            if df_red[0][g] > max(f_ted) * 1.2:
                df_red[0][g] = max(f_ted) * 1.2
        df_blue[0] = df_blue[0] * max(f_ted) / max(f_you)
        area = []
        diff_areas = []
        if len(df_red[0]) > len(df_blue[0]):
            for i in range(diff, len(df_blue[0])):
                area.append(df_red[0][i])
                diff_areas.append(abs(df_red[0][i] - df_blue[0][i]))
            result = 1 - (sum(diff_areas) / sum(area))
        else:
            for i in range(diff, len(df_red[0])):
                area.append(df_red[0][i])
                diff_areas.append(abs(df_red[0][i] - df_blue[0][i]))
            result = 1 - (sum(diff_areas) / sum(area))
        ranks = []
        for i in range(1, len(df_blue[0]) - 1, 1):
            if df_blue[0][i] > 60:
                if df_blue[0][i] > df_blue[0][i - 1] and df_blue[0][i] > df_blue[0][i + 1]:
                    ranks.append(df_blue[0][i] / max(df_blue[0]))
        ranks1 = []
        for i in range(1, len(df_red[0]) - 1, 1):
            if df_red[0][i] > 60:
                if df_red[0][i] > df_red[0][i - 1] and df_red[0][i] > df_red[0][i + 1]:
                    ranks1.append(df_red[0][i] / max(df_red[0]))
        diffrent = []
        if len(ranks) > len(ranks1):
            for i in range(len(ranks1)):
                diffrent.append(abs(ranks1[i] - ranks[i]))
        else:
            for i in range(len(ranks)):
                diffrent.append(abs(ranks1[i] - ranks[i]))
        points4 = 1 - (sum(diffrent) / sum(ranks1))
    result = int(result * 100)
    result1 = int(points4 * 100)
    pitch_result_rate = max(result, result1)
    global pitch_result
    if pitch_result_rate >= 85:
        pitch_result = 'Excellent'
    elif pitch_result_rate >= 65:
        pitch_result = 'Good'
    else:
        pitch_result = 'Bad'
    line1, = plt.plot(df_blue, color='navy', linewidth=5)
    line2, = plt.plot(df_red, color='crimson', linewidth=5)
    plt.title('Pitch Result', fontsize=50)
    plt.legend(handles=(line1, line2), labels=('Ted', 'You'), fontsize=20)
    plt.ylabel('Pitch', fontsize=20)
    plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    plt.savefig(png_save_path + 'pitch_result.png')
    return pitch_result_rate, pitch_result
def __test(x, fs, hopsize, otype):
    f0 = pysptk.swipe(x, fs, hopsize, otype=otype)
    assert np.all(np.isfinite(f0))
def __call__(self, x):
    pitch = pysptk.swipe(x, fs=self.sr, hopsize=self.hop_length,
                         min=self.f0_floor, max=self.f0_ceil, otype="pitch")
    f0 = pysptk.swipe(x, fs=self.sr, hopsize=self.hop_length,
                      min=self.f0_floor, max=self.f0_ceil, otype="f0")
    return pitch, f0
def pitch_detect(sndarray, fs, chunk_size):
    new_sndarray = numpy.asarray(numpy.float64(sndarray))
    f0 = pysptk.swipe(numpy.asarray(new_sndarray), fs, chunk_size, 65, 500, 0.001, 1)  # Fundamental Frequency
    return f0
duration = 0
file = ""
if (i <= 9):
    file = dir_path + "chunks/chunk-0" + str(i) + ".wav"
else:
    file = dir_path + "chunks/chunk-" + str(i) + ".wav"
duration = get_duration(file)
fs, x = wavfile.read(file)
assert fs == 16000
f0_swipe = pysptk.swipe(x.astype(np.float64), fs=fs, hopsize=80, min=60, otype="f0")
a = []
f = []
X_Frequecies_Vector = []
for w in f0_swipe:
    if w != 0:
        f.append(w)
if len(f) >= 30:
    f = random.sample(f, 30)
else:
    f += [0] * (30 - len(f))
freq_matrix.append(f)
# a = np.var(f)
def processingVideo():
    for i in range(66, 67):
        dir_path = "/home/eduardo/data_base_www/2PhaT6AbH3Q/"
        # move_files()
        num_emphasys = []
        n_chunks = len(glob.glob(dir_path + "chunks/chunk*"))
        freq_matrix = [[0 for i in range(2)] for j in range(n_chunks)]
        for i in range(0, n_chunks):
            duration = 0
            file = ""
            if (i <= 9):
                file = dir_path + "chunks/chunk-0" + str(i) + ".wav"
            else:
                file = dir_path + "chunks/chunk-" + str(i) + ".wav"
            duration = get_duration(file)
            fs, x = wavfile.read(file)
            assert fs == 16000
            f0_swipe = pysptk.swipe(x.astype(np.float64), fs=fs, hopsize=20, min=60)
            a = []
            f = []
            X_Frequecies_Vector = []
            for w in f0_swipe:
                if w != 0:
                    f.append(w)
            # if not f:
            #     a = 0
            # else:
            #     a = stats.mode(f)[0][0]
            a = np.mean(f)
            # print(len(f))
            c, Pxx_den = signal.welch(x, fs, nperseg=1024)
            # if (Pxx_den.any()):
            #     v = 0
            #     v = stats.mode(Pxx_den)[0][0]
            v = np.mean(Pxx_den)
            if (~np.isnan(v) and ~np.isnan(a)):
                # num_emphasys.append(v)
                l = []
                l.append(a)
                l.append(v)
                freq_matrix[i] = l
        weightA = [0 for i in range(n_chunks)]
        weightT = [0 for i in range(n_chunks)]
        transcript_matrix, annotation_matrix, avg_depth = cs.computeMatrix(dir_path, n_chunks)
        best_model_silhouettte = -1000
        iterations_without_improvment = 0
        max = 32
        model2 = cluster.SpectralClustering(max, affinity='precomputed', n_init=10000, n_jobs=-1)
        cluster_labels = model2.fit_predict(transcript_matrix)
        for i in range(len(cluster_labels) - 1):
            if (cluster_labels[i] != cluster_labels[i + 1]):
                weightT[i + 1] = np.sqrt(pow(freq_matrix[i + 1][0], 2) + pow(freq_matrix[i + 1][1], 2))
        for j in range(len(annotation_matrix) - 1):
            if (not set(annotation_matrix[j]).intersection(annotation_matrix[j + 1])):
                weightA[j + 1] = float(np.sqrt(pow(freq_matrix[j + 1][0], 2) +
                                               pow(freq_matrix[j + 1][1], 2)) /
                                       abs(avg_depth[j] - avg_depth[j + 1]))
        # rankingT = sorted(range(len(weightT)), key=lambda x: weightT[x])[-70:]
        rankingA = sorted(range(len(weightA)), key=lambda x: weightA[x])[-70:]
        # ranking = list(set(rankingT).intersection(rankingA))
        # merged = sorted(list(set(ranking).union(rankingA)))
        # matrixT = np.array(transcript_matrix)
        # matrixA = np.array(annotation_matrix)
        # ranking = getranking(n_chunks, freq_matrix, matrixT, matrixA, 25)
        evaluate_method.evaluate(dir_path, sorted(rankingA), "aas")
def get_pitch_pysptk(wav):
    sample_rate, samples = wavfile.read(wav)
    f0_swipe = pysptk.swipe(samples.astype(np.float64), fs=sample_rate,
                            hopsize=80, min=60, max=200, otype="f0")
    return f0_swipe
def read_audio_n_process(file, label, base_path, sampling_rate, sample_size_in_seconds,
                         overlap, normalise, method):
    """
    This method is called by the preprocess data method
    :param file: audio file name
    :param label: class label for the file
    :param base_path: directory that contains the audio file
    :param sampling_rate: target sampling rate used when loading the audio
    :param sample_size_in_seconds: length of each chunk in seconds
    :param overlap: overlap between consecutive chunks
    :param normalise: whether to normalise the extracted features
    :return:
    """
    data, out_labels = [], []
    filepath = base_path + file
    if os.path.exists(filepath):
        audio, sr = librosa.load(filepath, sr=sampling_rate)
        # mask = envelope(audio, sr, 0.0005)
        # audio = audio[mask]
        sr = sampling_rate
        # audio = remove_silent_parts(filepath, sr=sampling_rate)
        chunks = cut_audio(audio, sampling_rate=sr,
                           sample_size_in_seconds=sample_size_in_seconds,
                           overlap=overlap)
        for chunk in chunks:
            if method == 'fbank':
                zero_crossing = librosa.feature.zero_crossing_rate(chunk)
                f0 = pysptk.swipe(chunk.astype(np.float64), fs=sr, hopsize=510,
                                  min=60, max=240, otype="f0").reshape(1, -1)
                pitch = pysptk.swipe(chunk.astype(np.float64), fs=sr, hopsize=510,
                                     min=60, max=240, otype="pitch").reshape(1, -1)
                f0_pitch_multiplier = 1
                features = mel_filters(chunk, sr, normalise)
                f0 = np.reshape(f0[:, :features.shape[1] * f0_pitch_multiplier],
                                newshape=(f0_pitch_multiplier, -1))
                pitch = np.reshape(pitch[:, :features.shape[1] * f0_pitch_multiplier],
                                   newshape=(f0_pitch_multiplier, -1))
                # shimmer_jitter = get_shimmer_jitter_from_opensmile(chunk, time.time(), sr)
                # shimmer_jitter = np.tile(shimmer_jitter, math.ceil(features.shape[-1] / len(shimmer_jitter)))[
                #     :features.shape[-1]]  # Repeating the values to match the features length of filterbanks
                # shimmer_jitter = np.reshape(shimmer_jitter, newshape=(1, -1))
                features = np.concatenate((features, zero_crossing, f0, pitch), axis=0)  # shimmer_jitter
            elif method == 'mfcc':
                features = mfcc_features(chunk, sr, normalise)
            elif method == 'gaf':
                features = gaf(chunk)
            elif method == 'raw':
                features = chunk
            else:
                raise Exception(
                    'Specify a method to use for pre processing raw audio signal. '
                    'Available options - {fbank, mfcc, gaf, raw}'
                )
            data.append(features)
            out_labels.append(float(label))
        return data, out_labels, chunks
    else:
        print('File not found ', filepath)
        return [], [], []
def get_f0(waveform, sample_rate, hop_length_seconds=0.01, method='swipe', f0_min=60, f0_max=300):
    """Compute the F0 contour using PYSPTK: https://github.com/r9y9/pysptk/.

    Args:
        waveform (np.array, [T, ]): waveform over which to compute f0
        sample_rate (int > 0): number of samples per second in waveform
        hop_length_seconds (float): hop size in seconds, converted to the hopsize
            argument of pysptk.swipe, i.e. the window sliding step of the f0 computation.
        method (str): one of 'swipe' or 'rapt'. Defines which method to use for the
            f0 calculation. See https://github.com/r9y9/pysptk
    Returns:
        dict: Dictionary containing keys:
            "contour" (np.array, [1, t1]): f0 contour of the waveform. Contains unvoiced frames.
            "values" (np.array, [1, t2]): nonzero f0 values of the waveform. Note that this
                discards all unvoiced frames. Use to compute mean, std, and other statistics.
            "mean" (float): mean of the voiced f0 values.
            "std" (float): standard deviation of the voiced f0 values.
    """
    assert method in ('swipe', 'rapt'), "The method argument should be one of 'swipe' or 'rapt'."
    hop_length = numseconds_to_numsamples(hop_length_seconds, sample_rate)
    if method == 'swipe':
        f0_contour = swipe(
            waveform.astype(np.float64),
            fs=sample_rate,
            hopsize=hop_length,
            min=f0_min,
            max=f0_max,
            otype="f0",
        )[np.newaxis, :]
    elif method == 'rapt':
        # For this estimation, the waveform needs to be in int PCM format.
        f0_contour = rapt(
            np.round(waveform * 32767).astype(np.float32),
            fs=sample_rate,
            hopsize=hop_length,
            min=f0_min,
            max=f0_max,
            otype="f0",
        )[np.newaxis, :]
    # Remove unvoiced frames.
    f0_values = f0_contour[:, np.where(f0_contour[0, :] != 0)][0]
    f0_mean = np.mean(f0_values[0])
    f0_std = np.std(f0_values[0])
    return {
        "contour": f0_contour,
        "values": f0_values,
        "mean": f0_mean,
        "std": f0_std,
    }
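# A minimal usage sketch for get_f0 above (hypothetical; the file name is a placeholder and
# numseconds_to_numsamples is assumed to be defined in the same module):
import numpy as np
from scipy.io import wavfile

sr, wav = wavfile.read("speech.wav")
wav = wav.astype(np.float64) / 32768.0        # normalize 16-bit PCM to [-1, 1]
f0_stats = get_f0(wav, sr, hop_length_seconds=0.01, method="swipe")
print(f0_stats["mean"], f0_stats["std"], f0_stats["contour"].shape)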
def __call__(self, tensor):
    """
    Args:
        tensor (Tensor): Tensor of audio of size (samples x 1)
    """
    # pysptk and interpolate are a MUST in this transform
    import pysptk
    from ahoproc_tools.interpolate import interpolation
    t_npy = tensor.cpu().squeeze(1).numpy()
    # print('t_npy shape: ', t_npy.shape)
    seqlen = t_npy.shape[0]
    T = seqlen // self.hop_length
    # compute LF0 and UV
    f0 = pysptk.swipe(t_npy.astype(np.float64), fs=self.sr,
                      hopsize=self.hop_length, min=60, max=240,
                      otype="f0")[:T]
    lf0 = np.log(f0 + 1e-10)
    lf0, uv = interpolation(lf0, -1)
    if np.any(lf0 == np.log(1e-10)):
        # all lf0 goes to minf0 as a PAD symbol
        lf0 = np.ones(lf0.shape) * np.log(60)
        # all frames are unvoiced
        uv = np.zeros(uv.shape)
    ret = {
        'lf0': torch.FloatTensor(lf0).view(-1, 1),
        'uv': torch.FloatTensor(uv.astype(np.float32)).view(-1, 1)
    }
    tot_frames = T
    # MelSpectrum and MFCCs
    mel = self.mel(tensor).transpose(0, 1).squeeze(2)
    # do compression?
    if self.dynamic_norm_spec:
        mel = torch.log1p(mel * 10000) / torch.log(torch.FloatTensor([10]))
    ret['mel_spec'] = mel[:tot_frames]
    mfcc = librosa.feature.mfcc(y=t_npy, sr=self.sr,
                                n_fft=self.n_fft,
                                hop_length=self.hop_length,
                                n_mfcc=self.mfcc_order).T
    mfcc = mfcc[:tot_frames]
    ret['mfcc'] = torch.FloatTensor(mfcc)
    # Spectrogram abs magnitude [dB]
    spec = librosa.stft(t_npy, n_fft=self.n_fft,
                        hop_length=self.hop_length,
                        win_length=self.win_length,
                        window=self.window)
    spec_db = librosa.amplitude_to_db(spec).T
    spec_ang = np.angle(spec).T
    spec_db = spec_db[:tot_frames]
    spec_ang = spec_ang[:tot_frames]
    ret['mag'] = torch.FloatTensor(spec_db)
    ret['pha'] = torch.FloatTensor(spec_ang)
    # ZCR, E and lF0
    # (librosa.feature.rmse was renamed to librosa.feature.rms in librosa >= 0.7)
    egy = librosa.feature.rmse(y=t_npy, frame_length=self.win_length,
                               hop_length=self.hop_length,
                               pad_mode='constant').T
    egy = egy[:tot_frames]
    zcr = librosa.feature.zero_crossing_rate(y=t_npy,
                                             frame_length=self.win_length,
                                             hop_length=self.hop_length).T
    zcr = zcr[:tot_frames]
    ret['egy'] = torch.FloatTensor(egy)
    ret['zcr'] = torch.FloatTensor(zcr)
    ntensor = tensor.clone()
    if hasattr(self, 'chopper'):
        do_chop = random.random() > 0.5
        if do_chop:
            ntensor = self.chopper(ntensor, self.sr)
    if hasattr(self, 'additive'):
        do_add = random.random() > 0.5
        if do_add:
            ntensor = self.additive(ntensor.numpy(), self.sr)
    if hasattr(self, 'clipping'):
        do_clip = random.random() > 0.5
        if do_clip:
            ntensor = self.clipping(ntensor.numpy())
    ret['wav'] = ntensor.view((-1, 1))
    ret['cwav'] = tensor.view((-1, 1))
    return ret