# Sample-by-sample synthesis loop (variant with separate fexc / iexc decoder
# inputs).  For every output sample it computes a linear prediction from the
# previously generated samples, asks the decoder network for a distribution
# over the excitation, samples from it, and writes the filtered result to
# out_file as raw int16.

# Start at sample order + 1 in the very first frame: the reversed history
# slice below needs `order` earlier samples and a non-negative stop index.
skip = order + 1
# The context manager guarantees the output is flushed and closed even if
# prediction raises part-way through.
with open(out_file, 'wb') as fout:
    for c in range(nb_frames):
        # Conditioning features are computed once per chunk of frames.
        cfeat = enc.predict([features[c:c+1, :, :nb_used_features], periods[c:c+1, :, :]])
        for fr in range(feature_chunk_size):
            f = c*feature_chunk_size + fr
            # Index of this frame's first sample in pcm (invariant in the
            # per-sample loop, so computed once here).
            frame_start = f*frame_size
            # The last `order` features of the frame act as predictor coefficients.
            a = features[c, fr, nb_features-order:]
            for i in range(skip, frame_size):
                sample_idx = frame_start + i
                # Prediction from the `order` most recent samples, newest first.
                pred = -sum(a*pcm[sample_idx - 1:sample_idx - order - 1:-1])
                fexc[0, 0, 1] = lin2ulaw(pred)

                p, state1, state2 = dec.predict([fexc, iexc, cfeat[:, fr:fr+1, :], state1, state2])
                # Lower the temperature for voiced frames to reduce noisiness
                # NOTE(review): feature 37 is presumably the voicing / pitch
                # gain feature -- confirm against the feature layout.
                p *= np.power(p, np.maximum(0, 1.5*features[c, fr, 37] - .5))
                p = p/(1e-18 + np.sum(p))
                # Cut off the tail of the remaining distribution
                p = np.maximum(p-0.002, 0).astype('float64')
                p = p/(1e-8 + np.sum(p))

                # One multinomial draw yields a one-hot vector; argmax is the
                # sampled excitation index.
                iexc[0, 0, 0] = np.argmax(np.random.multinomial(1, p[0, 0, :], 1))
                pcm[sample_idx] = pred + ulaw2lin(iexc[0, 0, 0])
                fexc[0, 0, 0] = lin2ulaw(pcm[sample_idx])
                # First-order recursive filter on the output
                # (presumably de-emphasis -- confirm).
                mem = coef*mem + pcm[sample_idx]
                # Saturate before the int16 conversion so loud samples clip
                # instead of producing out-of-range garbage.
                np.array([np.clip(np.round(mem), -32768, 32767)], dtype='int16').tofile(fout)
            # Only the very first frame skips its leading samples.
            skip = 0
# Sample-by-sample synthesis loop (variant where the sampled excitation is
# fed back through the third channel of fexc).  For every output sample it
# computes a linear prediction from the previously generated samples, asks
# the decoder network for a distribution over the excitation, samples from
# it, and writes the filtered result to out_file as raw int16.

# Start at sample order + 1 in the very first frame: the reversed history
# slice below needs `order` earlier samples and a non-negative stop index.
skip = order + 1
# The context manager guarantees the output is flushed and closed even if
# prediction raises part-way through.
with open(out_file, 'wb') as fout:
    for c in range(nb_frames):
        # Conditioning features are computed once per chunk of frames.
        cfeat = enc.predict(
            [features[c:c + 1, :, :nb_used_features], periods[c:c + 1, :, :]])
        for fr in range(feature_chunk_size):
            f = c * feature_chunk_size + fr
            # Index of this frame's first sample in pcm (invariant in the
            # per-sample loop, so computed once here).
            frame_start = f * frame_size
            # The last `order` features of the frame act as predictor coefficients.
            a = features[c, fr, nb_features - order:]
            for i in range(skip, frame_size):
                sample_idx = frame_start + i
                # Prediction from the `order` most recent samples, newest first.
                pred = -sum(a * pcm[sample_idx - 1:sample_idx - order - 1:-1])
                fexc[0, 0, 1] = lin2ulaw(pred)

                p, state1, state2 = dec.predict(
                    [fexc, cfeat[:, fr:fr + 1, :], state1, state2])
                # Lower the temperature for voiced frames to reduce noisiness
                # NOTE(review): feature 37 is presumably the voicing / pitch
                # gain feature -- confirm against the feature layout.
                p *= np.power(p, np.maximum(0, 1.5 * features[c, fr, 37] - .5))
                p = p / (1e-18 + np.sum(p))
                # Cut off the tail of the remaining distribution
                p = np.maximum(p - 0.002, 0).astype('float64')
                p = p / (1e-8 + np.sum(p))

                # One multinomial draw yields a one-hot vector; argmax is the
                # sampled excitation index.
                fexc[0, 0, 2] = np.argmax(np.random.multinomial(1, p[0, 0, :], 1))
                pcm[sample_idx] = pred + ulaw2lin(fexc[0, 0, 2])
                fexc[0, 0, 0] = lin2ulaw(pcm[sample_idx])
                # First-order recursive filter on the output
                # (presumably de-emphasis -- confirm).
                mem = coef * mem + pcm[sample_idx]
                # Saturate before the int16 conversion so loud samples clip
                # instead of producing out-of-range garbage.
                np.array([np.clip(np.round(mem), -32768, 32767)], dtype='int16').tofile(fout)
            # Only the very first frame skips its leading samples.
            skip = 0
])
# Disabled alternative noise recipe, kept for reference.
#noise = np.round(np.concatenate([np.zeros((len(data)*1//5)), np.random.laplace(0, 1.2, len(data)*1//5), np.random.laplace(0, .77, len(data)*1//5), np.random.laplace(0, .33, len(data)*1//5), np.random.randint(-1, 1, len(data)*1//5)]))
# Perturb the input with noise, then clamp it back into the 0..255 range.
in_data = in_data + noise
in_data = np.clip(in_data, 0, 255)

# Flatten the features to one row per frame so they can be indexed by the
# global frame number in the loop below.
features = np.reshape(features, (nb_frames * feature_chunk_size, nb_features))

# Note: the LPC predictor output is now calculated by the loop below, this code was
# for an earlier version that implemented the prediction filter in C
upred = np.zeros((nb_frames * pcm_chunk_size, ), dtype='int16')

# Use 16th order LPC to generate LPC prediction output upred[] and (in
# mu-law form) pred[]
pred_in = ulaw2lin(in_data)
# NOTE(review): the loop starts at frame 2, presumably so that the shifted
# slice start i*frame_size-k never goes negative -- confirm.
for i in range(2, nb_frames * feature_chunk_size):
    upred[i * frame_size:(i + 1) * frame_size] = 0
    # Accumulate the 16 taps; tap k uses the input delayed by k samples and
    # the k-th of the last 16 features of frame i as its coefficient.
    # NOTE(review): upred is int16, so every partial sum is converted back
    # to int16 on assignment -- confirm this is intended.
    for k in range(16):
        upred[i*frame_size:(i+1)*frame_size] = upred[i*frame_size:(i+1)*frame_size] - \
            pred_in[i*frame_size-k:(i+1)*frame_size-k]*features[i, nb_features-16+k]

pred = lin2ulaw(upred)

# Regroup the input into (nb_frames, pcm_chunk_size, 1) and store it as bytes.
in_data = np.reshape(in_data, (nb_frames, pcm_chunk_size, 1))
in_data = in_data.astype('uint8')

# LPC residual, which is the difference between the input speech and
# the predictor output, with a slight time shift this is also the
# ideal excitation in_exc
parser.add_argument('--nb_samples', type=int, default=-1,
                    help='Optional number of samples to plot')
args = parser.parse_args()

# The dump is read as raw bytes holding four interleaved streams:
# sig, pred, in_exc, out_exc (one byte each per time step).
data = np.fromfile(args.file1, dtype='uint8')
nb_samples = args.nb_samples
# A negative value (including the default -1) means "no truncation".
# Slicing with data[:-1] would silently drop the final byte and leave
# out_exc one sample shorter than the other streams.
# Note that the limit is applied to the raw byte array, i.e. before the
# streams are de-interleaved.
if nb_samples >= 0:
    data = data[:nb_samples]

# De-interleave the four streams.
sig = np.array(data[0::4], dtype='float')
pred = np.array(data[1::4], dtype='float')
in_exc = np.array(data[2::4], dtype='float')
out_exc = np.array(data[3::4], dtype='float')

print("exc var: %4.3e" % (np.var(ulaw.ulaw2lin(in_exc))))

# Figure 1: signal (top) and prediction (bottom), both converted with
# ulaw2lin and drawn on the same fixed vertical range.
plt.figure(1)
plt.subplot(211)
plt.plot(ulaw.ulaw2lin(sig), label='sig')
plt.ylim((-30000, 30000))
plt.legend()
plt.subplot(212)
plt.plot(ulaw.ulaw2lin(pred), label='pred')
plt.ylim((-30000, 30000))
plt.legend()
# Non-blocking so the following figures can be created right away.
plt.show(block=False)

# Figure 2: excitation signals, starting with the input excitation.
plt.figure(2)
plt.subplot(211)
plt.plot(ulaw.ulaw2lin(in_exc), label='in_exc')
def synthesis(args, hparams):
    """Synthesize audio from a feature file and write it as raw int16 PCM.

    The whole feature file is treated as a single chunk.  For every output
    sample a linear prediction is computed from the previously generated
    samples, the decoder produces a distribution over the excitation, one
    value is sampled from it, and the filtered result is appended to the
    output file.

    Args:
        args: Object providing ``feature_file`` (path of the float32 feature
            file to read) and ``out_file`` (path of the PCM file to write).
        hparams: Object providing ``frame_size``, ``nb_features``,
            ``pitch_idx``, ``checkpoint_file``, ``rnn_units1`` and
            ``rnn_units2``.

    Returns:
        None.  If ``hparams.checkpoint_file`` is ``None`` or does not name an
        existing file, the function returns early without writing anything.

    Note:
        The model and the recurrent states are moved to CUDA, so a CUDA
        device is required.
    """
    model = LPCNet(hparams).cuda()

    feature_file = args.feature_file
    out_file = args.out_file
    frame_size = hparams.frame_size
    nb_features = hparams.nb_features

    features = np.fromfile(feature_file, dtype='float32')
    # reshape is used on purpose: np.resize was found to lose the last row
    # of data.
    features = features.reshape(-1, nb_features)
    nb_frames = 1
    feature_chunk_size = features.shape[0]
    pcm_chunk_size = frame_size * feature_chunk_size

    features = np.reshape(features, (nb_frames, feature_chunk_size, nb_features))
    # Integer period input derived from the feature at hparams.pitch_idx.
    periods = (.1 + 50*features[:, :, hparams.pitch_idx:hparams.pitch_idx+1]+100).astype('int16')

    # Without a usable checkpoint there is nothing to synthesize with.
    if hparams.checkpoint_file is None or not os.path.isfile(hparams.checkpoint_file):
        return
    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # from a trusted source.
    checkpoint_dict = torch.load(hparams.checkpoint_file)
    model.load_state_dict(checkpoint_dict['state_dict'])
    model.eval()

    order = 16
    pcm = np.zeros((nb_frames * pcm_chunk_size,))
    fexc = np.zeros((1, 1, 2), dtype='float32')
    iexc = np.zeros((1, 1, 1), dtype='int16')
    state1 = torch.Tensor(np.zeros((1, 1, hparams.rnn_units1), dtype='float32')).cuda()
    state2 = torch.Tensor(np.zeros((1, 1, hparams.rnn_units2), dtype='float32')).cuda()

    mem = 0
    coef = 0.85

    # Start at sample order + 1 in the very first frame: the reversed
    # history slice below needs `order` earlier samples and a non-negative
    # stop index.
    skip = order + 1

    # `with open` closes the output even if synthesis fails part-way.
    # `torch.no_grad()` keeps autograd from recording the per-sample decoder
    # calls; the recurrent states are fed back every sample, so otherwise
    # the graph (and memory use) would grow over the whole utterance.
    with open(out_file, "wb") as fout, torch.no_grad():
        for c in range(nb_frames):
            cfeat = model.encoder(features[c:c+1, :, :nb_features], periods[c:c+1, :, :])
            fexc[0, 0, 0] = 128  # mu-law code for 0
            iexc[0, 0, 0] = 128
            for fr in range(feature_chunk_size):
                f = c * feature_chunk_size + fr
                # Index of this frame's first sample in pcm.
                frame_start = f * frame_size
                # The last `order` features of the frame act as predictor
                # coefficients.
                a = features[c, fr, nb_features - order:]
                for i in range(skip, frame_size):
                    idx = frame_start + i
                    # Prediction from the `order` most recent samples,
                    # newest first.
                    pred = -sum(a*pcm[idx - 1:idx - order - 1:-1])
                    fexc[0, 0, 1] = lin2ulaw(pred)

                    p_tensor, state1, state2 = model.decoder(
                        fexc, iexc, cfeat[:, fr:fr+1, :], state1, state2)
                    p = p_tensor.clone().cpu().detach().numpy()
                    # Lower the temperature for voiced frames to reduce noisiness
                    p *= np.power(p, np.maximum(0, 1.5 * features[c, fr, hparams.pitch_idx+1] - .5))
                    p = p / (1e-18 + np.sum(p))
                    # Cut off the tail of the remaining distribution
                    p = np.maximum(p - 0.002, 0).astype('float64')
                    p = p / (1e-8 + np.sum(p))

                    # One multinomial draw yields a one-hot vector; argmax
                    # is the sampled excitation index.
                    iexc[0, 0, 0] = np.argmax(np.random.multinomial(1, p[0, 0, :], 1))
                    pcm[idx] = pred + ulaw2lin(iexc[0, 0, 0])
                    fexc[0, 0, 0] = lin2ulaw(pcm[idx])
                    # First-order recursive filter on the output
                    # (presumably de-emphasis -- confirm).
                    mem = coef * mem + pcm[idx]
                    # Saturate before the int16 conversion so loud samples
                    # clip instead of producing out-of-range garbage.
                    np.array([np.clip(np.round(mem), -32768, 32767)], dtype='int16').tofile(fout)
                # Only the very first frame skips its leading samples.
                skip = 0