def generate_and_save_samples(tag): def write_audio_file(name, data): data = data.astype('float32') data -= data.min() data /= data.max() data -= 0.5 data *= 0.95 scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name + '.wav'), BITRATE, data) total_time = time() # Generate N_SEQS' sample files, each 5 seconds long N_SECS = 5 LENGTH = N_SECS * BITRATE if not args.debug else 100 samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') samples[:, :BIG_FRAME_SIZE] = Q_ZERO # First half zero, others fixed random at each checkpoint big_h0 = numpy.zeros( (N_SEQS - fixed_rand_big_h0.shape[0], N_BIG_RNN, H0_MULT * BIG_DIM), dtype='float32') big_h0 = numpy.concatenate((big_h0, fixed_rand_big_h0), axis=0) h0 = numpy.zeros((N_SEQS - fixed_rand_h0.shape[0], N_RNN, H0_MULT * DIM), dtype='float32') h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0) big_frame_level_outputs = None frame_level_outputs = None for t in xrange(BIG_FRAME_SIZE, LENGTH): if t % BIG_FRAME_SIZE == 0: big_frame_level_outputs, big_h0 = big_frame_level_generate_fn( samples[:, t - BIG_FRAME_SIZE:t], big_h0, numpy.int32(t == BIG_FRAME_SIZE)) if t % FRAME_SIZE == 0: frame_level_outputs, h0 = frame_level_generate_fn( samples[:, t - FRAME_SIZE:t], big_frame_level_outputs[:, (t / FRAME_SIZE) % (BIG_FRAME_SIZE / FRAME_SIZE)], h0, numpy.int32(t == BIG_FRAME_SIZE)) samples[:, t] = sample_level_generate_fn( frame_level_outputs[:, t % FRAME_SIZE], samples[:, t - FRAME_SIZE:t]) total_time = time() - total_time log = "{} samples of {} seconds length generated in {} seconds." log = log.format(N_SEQS, N_SECS, total_time) print log, for i in xrange(N_SEQS): samp = samples[i] if Q_TYPE == 'mu-law': from datasets.dataset import mu2linear samp = mu2linear(samp) elif Q_TYPE == 'a-law': raise NotImplementedError('a-law is not implemented') write_audio_file("sample_{}_{}".format(tag, i), samp)
def generate_and_save_samples(tag): def write_audio_file(name, data): data = data.astype('float32') data -= data.min() data /= data.max() data -= 0.5 data *= 0.95 scipy.io.wavfile.write( os.path.join(SAMPLES_PATH, name+'.wav'), BITRATE, data) total_time = time() # Generate N_SEQS' sample files, each 5 seconds long N_SECS = 5 LENGTH = N_SECS*BITRATE if not args.debug else 100 samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') samples[:, :FRAME_SIZE] = Q_ZERO # First half zero, others fixed random at each checkpoint h0 = numpy.zeros( (N_SEQS-fixed_rand_h0.shape[0], N_RNN, H0_MULT*DIM), dtype='float32' ) h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0) frame_level_outputs = None for t in xrange(FRAME_SIZE, LENGTH): if t % FRAME_SIZE == 0: frame_level_outputs, h0 = frame_level_generate_fn( samples[:, t-FRAME_SIZE:t], h0, #numpy.full((N_SEQS, ), (t == FRAME_SIZE), dtype='int32'), numpy.int32(t == FRAME_SIZE) ) samples[:, t] = sample_level_generate_fn( frame_level_outputs[:, t % FRAME_SIZE], samples[:, t-FRAME_SIZE:t], ) total_time = time() - total_time log = "{} samples of {} seconds length generated in {} seconds." log = log.format(N_SEQS, N_SECS, total_time) print log, for i in xrange(N_SEQS): samp = samples[i] if Q_TYPE == 'mu-law': from datasets.dataset import mu2linear samp = mu2linear(samp) elif Q_TYPE == 'a-law': raise NotImplementedError('a-law is not implemented') write_audio_file("sample_{}_{}".format(tag, i), samp)
def generate_and_save_samples(tag, N_SECS=5): def write_audio_file(name, data): data = data.astype('float32') data -= data.min() data /= data.max() data -= 0.5 data *= 0.95 scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name + '.wav'), BITRATE, data) total_time = time() # Generate N_SEQS' sample files, each 5 seconds long LENGTH = N_SECS * BITRATE if not args.debug else 100 samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') samples[:, :FRAME_SIZE] = Q_ZERO # First half zero, others fixed random at each checkpoint h0 = numpy.zeros((N_SEQS - fixed_rand_h0.shape[0], N_RNN, H0_MULT * DIM), dtype='float32') h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0) frame_level_outputs = None for t in xrange(FRAME_SIZE, LENGTH): if t % FRAME_SIZE == 0: frame_level_outputs, h0 = frame_level_generate_fn( samples[:, t - FRAME_SIZE:t], h0, #numpy.full((N_SEQS, ), (t == FRAME_SIZE), dtype='int32'), numpy.int32(t == FRAME_SIZE)) samples[:, t] = sample_level_generate_fn( frame_level_outputs[:, t % FRAME_SIZE], samples[:, t - FRAME_SIZE:t], ) total_time = time() - total_time log = "{} samples of {} seconds length generated in {} seconds." log = log.format(N_SEQS, N_SECS, total_time) print log for i in xrange(N_SEQS): samp = samples[i] if Q_TYPE == 'mu-law': from datasets.dataset import mu2linear samp = mu2linear(samp) elif Q_TYPE == 'a-law': raise NotImplementedError('a-law is not implemented') now = datetime.datetime.now() now_time = "{}:{}:{}".format(now.hour, now.minute, now.second) file_name = "sample_{}_{}_{}_{}".format(tag, N_SECS, now_time, i) print "writing...", file_name write_audio_file(file_name, samp)
def generate_and_save_samples(tag): def write_audio_file(name, data): data = data.astype('float32') data -= data.min() data /= data.max() data -= 0.5 data *= 0.95 scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name + '.wav'), BITRATE, data) total_time = time.time() # Generate N_SEQS' sample files, each 5 seconds long N_SECS = 5 LENGTH = N_SECS * BITRATE if args.debug: LENGTH = 1024 num_prev_samples_to_use = (2**args.dilation_layers_per_block - 1) * args.wavenet_blocks + 1 samples = numpy.zeros((N_SEQS, LENGTH + num_prev_samples_to_use), dtype='int32') samples[:, :num_prev_samples_to_use] = Q_ZERO for t in range(LENGTH): samples[:, num_prev_samples_to_use + t:num_prev_samples_to_use + t + 1] = generate_fn(samples[:, t:t + num_prev_samples_to_use + 1]) if (t > 2 * BITRATE) and (t < 3 * BITRATE): samples[:, num_prev_samples_to_use + t:num_prev_samples_to_use + t + 1] = Q_ZERO total_time = time.time() - total_time log = "{} samples of {} seconds length generated in {} seconds." log = log.format(N_SEQS, N_SECS, total_time) print log, for i in xrange(N_SEQS): samp = samples[i, num_prev_samples_to_use:] if Q_TYPE == 'mu-law': from datasets.dataset import mu2linear samp = mu2linear(samp) elif Q_TYPE == 'a-law': raise NotImplementedError('a-law is not implemented') write_audio_file("sample_{}_{}".format(tag, i), samp)
def generate_and_save_samples(tag, samples):
    """Write the first N_SEQS rows of ``samples`` as WAV files.

    Files are created in SAMPLES_PATH as ``sample_<tag>_<index>.wav``.

    Args:
        tag: Value formatted into every output file name.
        samples: Indexable collection of sequences; ``samples[i]`` is
            decoded with ``mu2linear`` when Q_TYPE is 'mu-law'.

    Raises:
        NotImplementedError: If Q_TYPE is 'a-law'.
    """

    def write_audio_file(stem, signal):
        # Rescale to [-0.475, 0.475] and store the result as float32 audio.
        signal = signal.astype('float32')
        signal -= signal.min()
        signal /= signal.max()
        signal -= 0.5
        signal *= 0.95
        wav_path = os.path.join(SAMPLES_PATH, stem + '.wav')
        scipy.io.wavfile.write(wav_path, BITRATE, signal)

    for seq_idx in xrange(N_SEQS):
        waveform = samples[seq_idx]
        if Q_TYPE == 'a-law':
            raise NotImplementedError('a-law is not implemented')
        if Q_TYPE == 'mu-law':
            from datasets.dataset import mu2linear
            waveform = mu2linear(waveform)
        write_audio_file("sample_{}_{}".format(tag, seq_idx), waveform)
def generate_and_save_samples(tag): def write_audio_file(name, data): data = data.astype('float32') data -= data.min() data /= data.max() data -= 0.5 data *= 0.95 scipy.io.wavfile.write( os.path.join(SAMPLES_PATH, name+'.wav'), BITRATE, data) total_time = time.time() # Generate N_SEQS' sample files, each 5 seconds long N_SECS = 5 LENGTH = N_SECS*BITRATE if args.debug: LENGTH = 1024 num_prev_samples_to_use = (2**args.dilation_layers_per_block - 1)*args.wavenet_blocks + 1 samples = numpy.zeros((N_SEQS, LENGTH + num_prev_samples_to_use), dtype='int32') samples[:, :num_prev_samples_to_use] = Q_ZERO for t in range(LENGTH): samples[:,num_prev_samples_to_use+t:num_prev_samples_to_use+t+1] = generate_fn(samples[:, t:t + num_prev_samples_to_use+1]) if (t > 2*BITRATE) and( t < 3*BITRATE): samples[:,num_prev_samples_to_use+t:num_prev_samples_to_use+t+1] = Q_ZERO total_time = time.time() - total_time log = "{} samples of {} seconds length generated in {} seconds." log = log.format(N_SEQS, N_SECS, total_time) print log, for i in xrange(N_SEQS): samp = samples[i, num_prev_samples_to_use: ] if Q_TYPE == 'mu-law': from datasets.dataset import mu2linear samp = mu2linear(samp) elif Q_TYPE == 'a-law': raise NotImplementedError('a-law is not implemented') write_audio_file("sample_{}_{}".format(tag, i), samp)
def generate_and_save_samples(tag): def write_audio_file(name, data): data = data.astype('float32') data -= data.min() data /= data.max() data -= 0.5 data *= 0.95 scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name + '.wav'), BITRATE, data) total_time = time() # Generate N_SEQS' sample files, each 5 seconds long N_SECS = 5 LENGTH = N_SECS * BITRATE if not args.debug else 100 #op1: init with zero samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') samples[:, :FRAME_SIZE] = Q_ZERO if FLAG_USETRAIN_WHENTEST: print('') print('REMINDER: using training data for test') print('') testData_feeder = load_data_gen(train_feeder, LENGTH) else: testData_feeder = load_data_gen(test_feeder, LENGTH) mini_batch = testData_feeder.next() _, _, _, seqs_lab = mini_batch samples_lab = seqs_lab[:N_SEQS] #op2: init with true data #testData_feeder = load_data_gen(train_feeder,LENGTH+LAB_SIZE) #testData_feeder = load_data_gen(test_feeder,LENGTH+LAB_SIZE) #mini_batch = testData_feeder.next() #seqs, _, _, seqs_lab = mini_batch #samples = seqs[:N_SEQS,FRAME_SIZE:FRAME_SIZE+LENGTH] #samples_lab = seqs_lab[:N_SEQS,1:] # First half zero, others fixed random at each checkpoint h0 = numpy.zeros((N_SEQS, N_RNN, H0_MULT * DIM), dtype='float32') frame_level_outputs = None for t in xrange(FRAME_SIZE, LENGTH): if t % FRAME_SIZE == 0: tmp = samples_lab[:, (t - FRAME_SIZE) // FRAME_SIZE, :] tmp = tmp.reshape(tmp.shape[0], 1, tmp.shape[1]) frame_level_outputs, h0 = frame_level_generate_fn( samples[:, t - FRAME_SIZE:t], tmp, h0, #numpy.full((N_SEQS, ), (t == FRAME_SIZE), dtype='int32'), numpy.int32(t == FRAME_SIZE)) samples[:, t] = sample_level_generate_fn( frame_level_outputs[:, t % FRAME_SIZE], samples[:, t - FRAME_SIZE:t], ) total_time = time() - total_time log = "{} samples of {} seconds length generated in {} seconds." 
log = log.format(N_SEQS, N_SECS, total_time) print log, for i in xrange(N_SEQS): samp = samples[i] if Q_TYPE == 'mu-law': from datasets.dataset import mu2linear samp = mu2linear(samp) elif Q_TYPE == 'a-law': raise NotImplementedError('a-law is not implemented') write_audio_file("sample_{}_{}".format(tag, i), samp)
def generate_and_save_samples(): # Sampling at frame level frame_level_generate_fn = theano.function( [sequences, h0, reset], frame_level_rnn(sequences, h0, reset), on_unused_input='warn' ) def write_audio_file(name, data): data = data.astype('float32') data -= data.min() data /= data.max() data -= 0.5 data *= 0.95 scipy.io.wavfile.write( os.path.join(SAMPLES_PATH, name+'.wav'), BITRATE, data) total_time = time() # Generate N_SEQS' sample files, each 5 seconds long N_SECS = 5 LENGTH = N_SECS*BITRATE samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') samples[:, :FRAME_SIZE] = Q_ZERO # First half zero, others fixed random at each checkpoint h0 = numpy.zeros( (N_SEQS-fixed_rand_h0.shape[0], N_RNN, H0_MULT*DIM), dtype='float32' ) h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0) frame_level_outputs = None for t in xrange(FRAME_SIZE, LENGTH): if t % FRAME_SIZE == 0: frame_level_outputs, h0 = frame_level_generate_fn( samples[:, t-FRAME_SIZE:t], h0, #numpy.full((N_SEQS, ), (t == FRAME_SIZE), dtype='int32'), numpy.int32(t == FRAME_SIZE) ) samples[:, t] = sample_level_generate_fn( frame_level_outputs[:, t % FRAME_SIZE], samples[:, t-FRAME_SIZE:t], ) total_time = time() - total_time log = "{} samples of {} seconds length generated in {} seconds." log = log.format(N_SEQS, N_SECS, total_time) print log, for i in xrange(N_SEQS): samp = samples[i] if Q_TYPE == 'mu-law': from datasets.dataset import mu2linear samp = mu2linear(samp) elif Q_TYPE == 'a-law': raise NotImplementedError('a-law is not implemented') write_audio_file("sample_{}_{}".format(tag, i), samp)
def generate_and_save_samples(tag): def write_audio_file(name, data): data = data.astype('float32') data -= numpy.mean(data) data /= numpy.absolute(data).max() # [-1,1] data *= 32768 data = data.astype('int16') scipy.io.wavfile.write( os.path.join(SAMPLES_PATH, name+'.wav'), BITRATE, data) total_time = time() # Generate N_SEQS' sample files, each 5 seconds long N_SECS = 5 LENGTH = N_SECS*BITRATE if not args.debug else 100 samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') if flag_dict['RMZERO']: testData_feeder = load_data(test_feeder) mini_batch = testData_feeder.next() tmp, _, _ = mini_batch samples[:, :BIG_FRAME_SIZE] = tmp[:N_SEQS, :BIG_FRAME_SIZE] else: samples[:, :BIG_FRAME_SIZE] = Q_ZERO # First half zero, others fixed random at each checkpoint big_h0 = numpy.zeros( (N_SEQS-fixed_rand_big_h0.shape[0], N_BIG_RNN, H0_MULT*BIG_DIM), dtype='float32' ) big_h0 = numpy.concatenate((big_h0, fixed_rand_big_h0), axis=0) h0_1 = numpy.zeros( (N_SEQS-fixed_rand_h0_1.shape[0], N_RNN_LIST[1], H0_MULT*DIM), dtype='float32' ) h0_1 = numpy.concatenate((h0_1, fixed_rand_h0_1), axis=0) h0_2 = numpy.zeros( (N_SEQS-fixed_rand_h0_2.shape[0], N_RNN_LIST[2], H0_MULT*DIM), dtype='float32' ) h0_2 = numpy.concatenate((h0_2, fixed_rand_h0_2), axis=0) big_frame_level_outputs = None frame_level_outputs_1 = None frame_level_outputs_2 = None for t in xrange(BIG_FRAME_SIZE, LENGTH): if t % BIG_FRAME_SIZE == 0: big_frame_level_outputs, big_h0 = big_frame_level_generate_fn( samples[:, t-BIG_FRAME_SIZE:t], big_h0, numpy.int32(t == BIG_FRAME_SIZE) ) if t % FRAME_SIZE_1 == 0: frame_level_outputs_1, h0_1 = frame_level_generate_fn_1( samples[:, t-FRAME_SIZE_1:t], big_frame_level_outputs[:, (t / FRAME_SIZE_1) % (BIG_FRAME_SIZE / FRAME_SIZE_1)], h0_1, numpy.int32(t == BIG_FRAME_SIZE) ) if t % FRAME_SIZE_2 == 0: frame_level_outputs_2, h0_2 = frame_level_generate_fn_2( samples[:, t-FRAME_SIZE_2:t], frame_level_outputs_1[:, (t / FRAME_SIZE_2) % (FRAME_SIZE_1 / FRAME_SIZE_2)], h0_2, numpy.int32(t == 
BIG_FRAME_SIZE) ) samples[:, t] = sample_level_generate_fn( frame_level_outputs_2[:, t % FRAME_SIZE_2], samples[:, t-FRAME_SIZE_DNN:t] ) total_time = time() - total_time log = "{} samples of {} seconds length generated in {} seconds." log = log.format(N_SEQS, N_SECS, total_time) print log, for i in xrange(N_SEQS): samp = samples[i] #pdb.set_trace() if Q_TYPE == 'mu-law': from datasets.dataset import mu2linear samp = mu2linear(samp) #pdb.set_trace() elif Q_TYPE == 'a-law': raise NotImplementedError('a-law is not implemented') write_audio_file("sample_{}_{}".format(tag, i), samp)
def generate_and_save_samples(tag): def write_audio_file(name, data): data = data.astype('float32') data -= numpy.mean(data) data /= numpy.absolute(data).max() # [-1,1] data *= 32768 data = data.astype('int16') scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name + '.wav'), BITRATE, data) total_time = time() # Generate N_SEQS' sample files, each 5 seconds long N_SECS = 5 LENGTH = N_SECS * BITRATE if not args.debug else 160 #before it was 100, but 160 was better as it should be divisible by 80 if FLAG_GEN: LENGTH = 785 * 80 samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') if FLAG_USETRAIN_WHENTEST: print('') print('REMINDER: using training data for test') print('') testData_feeder = load_data_gen(train_feeder, LENGTH) else: testData_feeder = load_data_gen(test_feeder, LENGTH) mini_batch = testData_feeder.next() tmp, _, _, seqs_lab = mini_batch samples_lab = seqs_lab[:N_SEQS] if flag_dict['RMZERO']: samples[:, :BIG_FRAME_SIZE] = tmp[:N_SEQS, :BIG_FRAME_SIZE] else: samples[:, :BIG_FRAME_SIZE] = Q_ZERO samples_lab_big = get_lab_big(samples_lab) # First half zero, others fixed random at each checkpoint big_h0 = numpy.zeros((N_SEQS, N_BIG_RNN, H0_MULT * BIG_DIM), dtype='float32') h0 = numpy.zeros((N_SEQS, N_RNN, H0_MULT * DIM), dtype='float32') big_frame_level_outputs = None frame_level_outputs = None for t in xrange(BIG_FRAME_SIZE, LENGTH): if t % BIG_FRAME_SIZE == 0: tmp = samples_lab_big[:, (t - BIG_FRAME_SIZE) // BIG_FRAME_SIZE, :] tmp = tmp.reshape(tmp.shape[0], 1, tmp.shape[1]) big_frame_level_outputs, big_h0 = big_frame_level_generate_fn( samples[:, t - BIG_FRAME_SIZE:t], tmp, big_h0, numpy.int32(t == BIG_FRAME_SIZE)) if t % FRAME_SIZE == 0: tmp = samples_lab[:, (t - BIG_FRAME_SIZE) // FRAME_SIZE, :] # tmp = samples_lab[:,(t-FRAME_SIZE)//FRAME_SIZE,:] #classic, but might introduce a slight mis-alignment tmp = tmp.reshape(tmp.shape[0], 1, tmp.shape[1]) frame_level_outputs, h0 = frame_level_generate_fn( samples[:, t - FRAME_SIZE:t], tmp, 
big_frame_level_outputs[:, (t / FRAME_SIZE) % (BIG_FRAME_SIZE / FRAME_SIZE)], h0, numpy.int32(t == BIG_FRAME_SIZE)) samples[:, t] = sample_level_generate_fn( frame_level_outputs[:, t % FRAME_SIZE], samples[:, t - FRAME_SIZE_DNN:t]) total_time = time() - total_time log = "{} samples of {} seconds length generated in {} seconds." log = log.format(N_SEQS, N_SECS, total_time) print log, for i in xrange(N_SEQS): samp = samples[i] if Q_TYPE == 'mu-law': from datasets.dataset import mu2linear samp = mu2linear(samp) elif Q_TYPE == 'a-law': raise NotImplementedError('a-law is not implemented') write_audio_file("sample_{}_{}".format(tag, i), samp)
def generate_and_save_samples(tag, conditioning=None): # Conditioning (N_SEQS, LENGTH) # N_SEQ = several different condition sequences, but all must have the same size... (yes, it's shitt) def write_audio_file(name, data): data = data.astype('float32') data -= data.min() data /= data.max() data -= 0.5 data *= 0.95 scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name + '.wav'), BITRATE, data) total_time = time() # Generate N_SEQS' sample files, each LENGHT seconds long N_SEQS = 20 if GEN_FLAG: if conditioning is not None: N_SEQS = conditioning.shape[0] LENGTH = conditioning.shape[1] if not args.debug else 100 else: if args.debug: LENGTH = 5 * BITRATE conditioning = np.ones((N_SEQS, LENGTH), dtype='int32') else: raise ("No conditionning !!") else: LENGTH = 5 * BITRATE conditioning = np.ones((N_SEQS, LENGTH), dtype='int32') if GEN_FLAG: print("Generating %d samples" % LENGTH) # Uniform [-0.5, 0.5) for half of initial state for generated samples # to study the behaviour of the model and also to introduce some diversity # to samples in a simple way. [it's disabled] fixed_rand_h0 = numpy.random.rand(N_SEQS // 2, N_RNN, H0_MULT * DIM) fixed_rand_h0 -= 0.5 fixed_rand_h0 = fixed_rand_h0.astype('float32') fixed_rand_big_h0 = numpy.random.rand(N_SEQS // 2, N_RNN, H0_MULT * DIM) fixed_rand_big_h0 -= 0.5 fixed_rand_big_h0 = fixed_rand_big_h0.astype('float32') ############################################################ ############################################################ # Initialize the sequence with zeros # Lame !?? Why not with a short "test sequence" # Would give much power to the mode with a small user cost no ? 
samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') samples[:, :BIG_FRAME_SIZE] = Q_ZERO ############################################################ ############################################################ # First half zero, others fixed random at each checkpoint big_h0 = numpy.zeros( (N_SEQS - fixed_rand_big_h0.shape[0], N_BIG_RNN, H0_MULT * BIG_DIM), dtype='float32') big_h0 = numpy.concatenate((big_h0, fixed_rand_big_h0), axis=0) h0 = numpy.zeros((N_SEQS - fixed_rand_h0.shape[0], N_RNN, H0_MULT * DIM), dtype='float32') h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0) big_frame_level_outputs = None frame_level_outputs = None # During generation # The BIG_FRAME_SIZE first times of samples are zeros used to initialize the sampleRNN # Hence, condi[0:BIG_FRAME_SIZE] are used to generate samples[BIG_FRAME_SIZE:2*BIG_FRAME_SIZE] for t in xrange(BIG_FRAME_SIZE, LENGTH): if GEN_FLAG: if t % 1000 == 0: print("%.2f secs generated..." % (t * 1. / BITRATE)) if t % BIG_FRAME_SIZE == 0: big_frame_level_outputs, big_h0 = big_frame_level_generate_fn( samples[:, t - BIG_FRAME_SIZE:t], conditioning[:, t - BIG_FRAME_SIZE:t], big_h0, numpy.int32(t == BIG_FRAME_SIZE)) if t % FRAME_SIZE == 0: frame_level_outputs, h0 = frame_level_generate_fn( samples[:, t - FRAME_SIZE:t], conditioning[:, t - FRAME_SIZE:t], big_frame_level_outputs[:, (t / FRAME_SIZE) % (BIG_FRAME_SIZE / FRAME_SIZE)], h0, numpy.int32(t == BIG_FRAME_SIZE)) samples[:, t] = sample_level_generate_fn( frame_level_outputs[:, t % FRAME_SIZE], conditioning[:, t - FRAME_SIZE:t], samples[:, t - FRAME_SIZE:t]) total_time = time() - total_time log = "{} samples of {} seconds length generated in {} seconds." log = log.format(N_SEQS, N_SECS, total_time) print log, for i in xrange(N_SEQS): samp = samples[i] if Q_TYPE == 'mu-law': from datasets.dataset import mu2linear samp = mu2linear(samp) elif Q_TYPE == 'a-law': raise NotImplementedError('a-law is not implemented') write_audio_file("sample_{}_{}".format(tag, i), samp)
def generate_and_save_samples(tag): def write_audio_file(name, data): data = data.astype('float32') data -= data.min() data /= data.max() data -= 0.5 data *= 0.95 scipy.io.wavfile.write( os.path.join(SAMPLES_PATH, name+'.wav'), SAMPLERATE, data) total_time = time() # Generate N_SEQS' sample files, each N_SECS seconds long N_SECS = args.length_sec LENGTH = N_SECS*SAMPLERATE if not args.debug else 100 print("Generating %d samples"%LENGTH) samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') samples[:, :FRAME_SIZE] = Q_ZERO # First half zero, others fixed random at each checkpoint h0 = numpy.zeros( (N_SEQS-fixed_rand_h0.shape[0], N_RNN, H0_MULT*DIM), dtype='float32' ) h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0) frame_level_outputs = None for t in xrange(FRAME_SIZE, LENGTH): if t % 1000 == 0: print("%.2f secs generated..."%(t * 1./SAMPLERATE)) if t % FRAME_SIZE == 0: frame_level_outputs, h0 = frame_level_generate_fn( samples[:, t-FRAME_SIZE:t], h0, #numpy.full((N_SEQS, ), (t == FRAME_SIZE), dtype='int32'), numpy.int32(t == FRAME_SIZE) ) samples[:, t] = sample_level_generate_fn( frame_level_outputs[:, t % FRAME_SIZE], samples[:, t-FRAME_SIZE:t], ) total_time = time() - total_time log = "{} seconds length generated in {} seconds." log = log.format(N_SECS, total_time) print log, now = datetime.now() for i in xrange(N_SEQS): samp = samples[i] if Q_TYPE == 'mu-law': from datasets.dataset import mu2linear samp = mu2linear(samp) elif Q_TYPE == 'a-law': raise NotImplementedError('a-law is not implemented') write_audio_file("sample_{}_{}_{}".format(tag, i, now.strftime('%Y%m%d_%H%M%S')), samp)
def generate_and_save_samples(tag): def write_audio_file(name, data): data = data.astype('float32') #data -= data.min() #data /= data.max() #data -= 0.5 #data *= 0.95 scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name), BITRATE, data) total_time = time() costs_g = [] accuracys_g = [] samples_low_list = [] samples_list = [] masks_g_index = [] samples_number = 0 count = 0 data_feeder = load_data(test_feeder) for seqs_g_8k, seqs_g_up, reset_g, end_flag_g, mask_g, con_g, batch_g, seqs_g_8k_real in data_feeder: if reset_g == 1: con_h0_g = numpy.zeros( (batch_g, N_CON_RNN, H0_MULT * CON_TIER_DIM), dtype='float32') big_h0_g = numpy.zeros((batch_g, N_BIG_RNN, H0_MULT * DIM), dtype='float32') h0_g = numpy.zeros((batch_g, N_RNN, H0_MULT * DIM), dtype='float32') cost_batch = np.zeros((batch_g, ), dtype='float32') accuracy_batch = np.zeros((batch_g, ), dtype='float32') mask_batch = np.zeros((batch_g, ), dtype='float32') cost_g, accuracy_g, mask_sum_g, sample, con_h0_g, big_h0_g, h0_g = test_fn( seqs_g_8k, seqs_g_up, con_g, con_h0_g, big_h0_g, h0_g, reset_g, mask_g, batch_g) cost_batch = cost_batch + cost_g accuracy_batch = accuracy_batch + accuracy_g mask_batch = mask_batch + mask_sum_g if end_flag_g == 1: costs_g.extend(list(cost_batch / mask_batch)) accuracys_g.extend(list(accuracy_batch / mask_batch)) if reset_g == 1: samples_low = seqs_g_8k_real[:, 0:-OVERLAP] samples = sample masks_g = mask_g[:, 0:-OVERLAP] else: samples_low = np.concatenate( [samples_low, seqs_g_8k_real[:, 0:-OVERLAP]], axis=1) samples = np.concatenate([samples, sample], axis=1) masks_g = np.concatenate([masks_g, mask_g[:, 0:-OVERLAP]], axis=1) if end_flag_g == 1: samples_low_list.append(samples_low) samples_list.append(samples) masks_g_index.append(masks_g) fid = open('datasets/TIMIT/test_list.scp', 'r') test_id_list = fid.readlines() for i in xrange(len(samples_list)): samples_number += samples_list[i].shape[0] * samples_list[i].shape[1] for j in xrange(samples_list[i].shape[0]): samples_lowi = 
samples_low_list[i][j] samplei = samples_list[i][j] maski = masks_g_index[i][j] samples_lowi = samples_lowi[0:len(np.where(maski == 1)[0])] samplei = samplei[0:len(np.where(maski == 1)[0])] if Q_TYPE == 'mu-law': from datasets.dataset import mu2linear samplei = mu2linear(samplei) write_audio_file(test_id_list[count].split()[0], samplei / 3 + samples_lowi) count += 1 total_time = time() - total_time log = "192 samples generated in {} minutes.\nThe time of generating 1 second speech is {} seconds." log = log.format(total_time / 60, total_time / samples_number * 16000) print log, return numpy.mean( costs_g), numpy.mean(accuracys_g) * 100, total_time, list( np.array(accuracys_g) * 100)
def generate_and_save_samples(tag): def write_audio_file(name, data): data = data.astype('float32') # data -= data.min() # data /= data.max() # data -= 0.5 # data *= 0.95 data -= numpy.mean(data) data /= numpy.absolute(data).max() data /= 2.0 scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name + '.wav'), BITRATE, data) total_time = time() # Generate N_SEQS' sample files, each 5 seconds long N_SECS = 5 LENGTH = N_SECS * BITRATE if not args.debug else 100 samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') if flag_dict['RMZERO']: testData_feeder = load_data(test_feeder) mini_batch = testData_feeder.next() tmp, _, _ = mini_batch samples[:, :BIG_FRAME_SIZE] = tmp[:N_SEQS, :BIG_FRAME_SIZE] else: samples[:, :BIG_FRAME_SIZE] = Q_ZERO # First half zero, others fixed random at each checkpoint ###QDOU: soft code big_h0 = numpy.zeros( (N_SEQS - fixed_rand_big_h0.shape[0], N_BIG_RNN, H0_MULT * BIG_DIM), dtype='float32') big_h0 = numpy.concatenate((big_h0, fixed_rand_big_h0), axis=0) big_frame_level_outputs = None h0_list, frame_level_outputs_list = [], [] h0_list.append(big_h0) frame_level_outputs_list.append(big_frame_level_outputs) for idx in INTER_TIER_IDX_LIST: tmp_h0 = numpy.zeros((N_SEQS - fixed_rand_h0_list[idx].shape[0], RNN_DEPTH_LIST[idx], H0_MULT * DIM), dtype='float32') tmp_h0 = numpy.concatenate((tmp_h0, fixed_rand_h0_list[dix]), axis=0) h0_list.append(tmp_h0) frame_level_outputs_list.append(None) ###QDOU: soft code pdb.set_trace() for t in xrange(BIG_FRAME_SIZE, LENGTH): if t % BIG_FRAME_SIZE == 0: big_frame_level_outputs, big_h0 = big_frame_level_generate_fn( samples[:, t - BIG_FRAME_SIZE:t], big_h0, numpy.int32(t == BIG_FRAME_SIZE)) ###QDOU: soft code for idx in INTER_TIER_IDX_LIST: if t % FRAME_SIZE_LIST[idx] == 0: frame_level_outputs_list[idx], h0_list[idx] = gen_fn_list[idx]( samples[:, t - FRAME_SIZE_LIST[idx]:t], frame_level_outputs_list[idx - 1][:, (t / FRAME_SIZE_LIST[idx]) % (FRAME_SIZE_LIST[idx - 1] / FRAME_SIZE_LIST[idx])], h0_list[idx], 
numpy.int32(t == BIG_FRAME_SIZE)) ###QDOU: soft code samples[:, t] = sample_level_generate_fn( frame_level_outputs_2[:, t % FRAME_SIZE_2], samples[:, t - FRAME_SIZE_2:t]) total_time = time() - total_time log = "{} samples of {} seconds length generated in {} seconds." log = log.format(N_SEQS, N_SECS, total_time) print log, for i in xrange(N_SEQS): samp = samples[i] #pdb.set_trace() if Q_TYPE == 'mu-law': from datasets.dataset import mu2linear samp = mu2linear(samp) #pdb.set_trace() elif Q_TYPE == 'a-law': raise NotImplementedError('a-law is not implemented') write_audio_file("sample_{}_{}".format(tag, i), samp)
def generate_and_save_samples(tag): def write_audio_file(name, data): data = data.astype('float32') data -= numpy.mean(data) data /= numpy.absolute(data).max() # [-1,1] data *= 32768 data = data.astype('int16') scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name + '.wav'), BITRATE, data) total_time = time() # Generate N_SEQS' sample files, each 5 seconds long N_SECS = 5 LENGTH = N_SECS * BITRATE if not args.debug else 160 #before it was 100, but 160 was better as it should be divisible by 80 if FLAG_GEN: LENGTH = 785 * 80 samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') if FLAG_USETRAIN_WHENTEST: print('') print('REMINDER: using training data for test') print('') testData_feeder = load_data_gen(train_feeder, LENGTH) else: testData_feeder = load_data_gen(test_feeder, LENGTH) mini_batch = testData_feeder.next() tmp, _, _, seqs_lab, seqs_noise = mini_batch samples_lab = seqs_lab[:N_SEQS] seqs_noise = seqs_noise[:N_SEQS] # Quantisation Steps (do this on the dataset not per minibatch) #seqs_noise = (seqs_noise / np.amax(np.abs(seqs_noise), 1)[:,None]) + 1 #seqs_noise = np.divide(np.multiply(seqs_noise, Q_LEVELS-1), 2) #seqs_noise = np.round(seqs_noise) seqs_noise = seqs_noise.astype(np.int32) if flag_dict['RMZERO']: samples[:, :BIG_FRAME_SIZE] = tmp[:N_SEQS, :BIG_FRAME_SIZE] else: samples[:, :BIG_FRAME_SIZE] = Q_ZERO samples_noise[:, :BIG_FRAME_SIZE] = Q_ZERO samples_lab_big = get_lab_big(samples_lab) # First half zero, others fixed random at each checkpoint big_h0 = numpy.zeros((N_SEQS, N_BIG_RNN, H0_MULT * BIG_DIM), dtype='float32') h0 = numpy.zeros((N_SEQS, N_RNN, H0_MULT * DIM), dtype='float32') big_frame_level_outputs = None frame_level_outputs = None # LENGTH is length of utterance to generate. # Take one frame of silence, then start at index BIG_FRAME_SIZE. # Do this for training and debugging. # As the RNN needs initial state. # Once model is good enough, actually use 20 frames. 
for t in xrange(BIG_FRAME_SIZE, LENGTH): # for loop going sample by sample if t % BIG_FRAME_SIZE == 0: tmp = samples_lab_big[:, (t - BIG_FRAME_SIZE) // BIG_FRAME_SIZE, :] tmp = tmp.reshape(tmp.shape[0], 1, tmp.shape[1]) big_frame_level_outputs, big_h0 = big_frame_level_generate_fn( seqs_noise[:, t - BIG_FRAME_SIZE:t], tmp, big_h0, numpy.int32(t == BIG_FRAME_SIZE)) if t % FRAME_SIZE == 0: tmp = samples_lab[:, (t - BIG_FRAME_SIZE) // FRAME_SIZE, :] # tmp = samples_lab[:,(t-FRAME_SIZE)//FRAME_SIZE,:] #classic, but might introduce a slight mis-alignment tmp = tmp.reshape(tmp.shape[0], 1, tmp.shape[1]) frame_level_outputs, h0 = frame_level_generate_fn( seqs_noise[:, t - FRAME_SIZE:t], tmp, big_frame_level_outputs[:, (t / FRAME_SIZE) % (BIG_FRAME_SIZE / FRAME_SIZE)], h0, numpy.int32(t == BIG_FRAME_SIZE)) samples[:, t] = sample_level_generate_fn( frame_level_outputs[:, t % FRAME_SIZE], seqs_noise[:, t - FRAME_SIZE_DNN:t]) total_time = time() - total_time log = "{} samples of {} seconds length generated in {} seconds." log = log.format(N_SEQS, N_SECS, total_time) print log, for i in xrange(N_SEQS): samp = samples[i] if Q_TYPE == 'mu-law': from datasets.dataset import mu2linear samp = mu2linear(samp) elif Q_TYPE == 'a-law': raise NotImplementedError('a-law is not implemented') write_audio_file("sample_{}_{}".format(tag, i), samp)
def generate_and_save_samples(tag): def write_audio_file(name, data): data = data.astype('float32') data -= data.min() data /= data.max() data -= 0.5 data *= 0.95 scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name + '.wav'), BITRATE, data) total_time = time() # Generate N_SEQS' sample files, each 5 seconds long N_SECS = 5 LENGTH = N_SECS * BITRATE if not args.debug else 100 #sid=numpy.int16(10) # specify the speaker ID #g_spkids=numpy.empty(0) g_spkids = [] for i in range(15): for j in range(20): g_spkids = numpy.append(g_spkids, i) g_spkids = numpy.int16(g_spkids) g_spkids = numpy.asarray(g_spkids, dtype='int16') print g_spkids samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') samples[:, :FRAME_SIZE] = Q_ZERO # First half zero, others fixed random at each checkpoint h0 = numpy.zeros((N_SEQS - fixed_rand_h0.shape[0], N_RNN, H0_MULT * DIM), dtype='float32') h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0) frame_level_outputs = None for t in xrange(FRAME_SIZE, LENGTH): if t % FRAME_SIZE == 0: frame_level_outputs, h0 = frame_level_generate_fn( samples[:, t - FRAME_SIZE:t], g_spkids, h0, #numpy.full((N_SEQS, ), (t == FRAME_SIZE), dtype='int32'), numpy.int32(t == FRAME_SIZE)) samples[:, t] = sample_level_generate_fn( frame_level_outputs[:, t % FRAME_SIZE], samples[:, t - FRAME_SIZE:t], g_spkids, ) total_time = time() - total_time log = "{} samples of {} seconds length generated in {} seconds." log = log.format(N_SEQS, N_SECS, total_time) print log, for i in xrange(N_SEQS): samp = samples[i] if Q_TYPE == 'mu-law': from datasets.dataset import mu2linear samp = mu2linear(samp) elif Q_TYPE == 'a-law': raise NotImplementedError('a-law is not implemented') write_audio_file("sample_{}_{}".format(tag, i), samp)
def generate_and_save_samples(tag): def write_audio_file(name, data): data = data.astype('float32') #data -= data.min() #data /= data.max() #data -= 0.5 #data *= 0.95 scipy.io.wavfile.write(os.path.join(SAMPLES_PATH, name + '.wav'), BITRATE, data) total_time = time() costs_g = [] accuracys_g = [] count = 0 data_feeder = load_data(test_feeder) for seqs_g_8k, seqs_g_up, reset_g, end_flag_g, mask_g, con_g, batch_g, seqs_g_8k_real in data_feeder: if reset_g == 1: con_h0_g = numpy.zeros( (batch_g, N_CON_RNN, H0_MULT * CON_TIER_DIM), dtype='float32') big_h0_g = numpy.zeros((batch_g, N_BIG_RNN, H0_MULT * DIM), dtype='float32') h0_g = numpy.zeros((batch_g, N_RNN, H0_MULT * DIM), dtype='float32') cost_batch = np.zeros((batch_g, ), dtype='float32') accuracy_batch = np.zeros((batch_g, ), dtype='float32') mask_batch = np.zeros((batch_g, ), dtype='float32') count += 1 cost_g, accuracy_g, mask_sum_g, sample, con_h0_g, big_h0_g, h0_g = test_fn( seqs_g_8k, seqs_g_up, con_g, con_h0_g, big_h0_g, h0_g, reset_g, mask_g, batch_g) cost_batch = cost_batch + cost_g accuracy_batch = accuracy_batch + accuracy_g mask_batch = mask_batch + mask_sum_g if end_flag_g == 1: costs_g.extend(list(cost_batch / mask_batch)) accuracys_g.extend(list(accuracy_batch / mask_batch)) if count == 1: if reset_g == 1: samples_low = seqs_g_8k_real[:, 0:-OVERLAP] samples = sample masks_g = mask_g[:, 0:-OVERLAP] else: samples_low = np.concatenate( [samples_low, seqs_g_8k_real[:, 0:-OVERLAP]], axis=1) samples = np.concatenate([samples, sample], axis=1) masks_g = np.concatenate([masks_g, mask_g[:, 0:-OVERLAP]], axis=1) for i in xrange(N_SEQS): samples_lowi = samples_low[i] samplei = samples[i] maski = masks_g[i] samples_lowi = samples_lowi[0:len(np.where(maski == 1)[0])] samplei = samplei[0:len(np.where(maski == 1)[0])] if Q_TYPE == 'mu-law': from datasets.dataset import mu2linear samplei = mu2linear(samplei) write_audio_file("sample_{}_{}".format(tag, i), samplei / 3 + samples_lowi) total_time = time() - total_time 
log = "{} samples generated in {} seconds." log = log.format(N_SEQS, total_time) print log, return numpy.mean(costs_g), numpy.mean(accuracys_g) * 100, total_time