import time

import numpy as np
import tensorflow as tf

# Helpers such as quantize, dequantize, sample, unsqueeze, write_wav,
# load_seed_audio, get_temperature, format_dur and create_inference_model,
# and constants such as NUM_SEQS, OUTPUT_DUR, SAMPLE_RATE,
# SAMPLING_TEMPERATURE and NUM_FRAMES_TO_PRINT, are assumed to be defined
# elsewhere in this codebase.


def get_subseq(dataset, batch_size, seq_len, overlap, q_type, q_levels):
    # Yield (input, target) training pairs: each input window carries
    # `overlap` samples of left context, and the target is the final
    # `seq_len` samples of that window. Note that `batch_size` is unused
    # here; the batch dimension comes from `dataset` itself.
    for batch in dataset:
        batch = quantize(batch, q_type, q_levels)
        num_samps = len(batch[0])
        for i in range(overlap, num_samps, seq_len):
            x = batch[:, i - overlap:i + seq_len]
            y = x[:, overlap:overlap + seq_len]
            yield (x, y)
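# To make the windowing above concrete, here is a minimal standalone sketch
# (plain NumPy, no quantization; the sizes and the `_demo_` name are
# illustrative, not from the original code):

def _demo_windowing():
    demo = np.arange(20).reshape(1, 20)  # one toy "audio" sequence
    seq_len, overlap = 8, 4
    for i in range(overlap, demo.shape[1], seq_len):
        x = demo[:, i - overlap:i + seq_len]  # seq_len samples + left context
        y = x[:, overlap:overlap + seq_len]   # the seq_len samples to predict
        print(x.shape, y.shape)               # (1, 12) (1, 8)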
def generate(path, ckpt_path, config, num_seqs=NUM_SEQS, dur=OUTPUT_DUR,
             sample_rate=SAMPLE_RATE, temperature=SAMPLING_TEMPERATURE,
             seed=None, seed_offset=None):
    model = create_inference_model(ckpt_path, num_seqs, config)
    q_type = model.q_type
    q_levels = model.q_levels
    q_zero = q_levels // 2
    num_samps = dur * sample_rate
    temperature = get_temperature(temperature, num_seqs)
    # Precompute sample sequences, initialised to q_zero.
    samples = []
    init_samples = np.full((model.batch_size, model.big_frame_size, 1), q_zero)
    # Set seed if provided.
    if seed is not None:
        seed_audio = load_seed_audio(seed, seed_offset, model.big_frame_size)
        seed_audio = tf.convert_to_tensor(seed_audio)
        init_samples[:, :model.big_frame_size, :] = quantize(
            seed_audio, q_type, q_levels)
    init_samples = tf.constant(init_samples, dtype=tf.int32)
    samples.append(init_samples)
    print_progress_every = NUM_FRAMES_TO_PRINT * model.big_frame_size
    start_time = time.time()
    for i in range(num_samps // model.big_frame_size):
        t = i * model.big_frame_size
        # Generate one big frame of samples, conditioned on the previous frame.
        frame_samples = model(samples[i], training=False,
                              temperature=temperature)
        samples.append(frame_samples)
        # Monitor progress.
        if t % print_progress_every == 0:
            end = min(t + print_progress_every, num_samps)
            step_dur = time.time() - start_time
            print(f'Generated samples {t+1} - {end} of {num_samps} '
                  f'(time elapsed: {step_dur:.3f} seconds)')
    samples = tf.concat(samples, axis=1)
    # Drop the initial seed frame.
    samples = samples[:, model.big_frame_size:, :]
    # Save sequences to disk. The seed frame has already been removed above,
    # so the sequences are not trimmed a second time here (trimming twice
    # would leave the output one big frame short of the requested duration).
    path = path.split('.wav')[0]
    for i in range(model.batch_size):
        seq = np.reshape(samples[i], (-1, 1)).tolist()
        audio = dequantize(seq, q_type, q_levels)
        file_name = '{}_{}'.format(path, i) if model.batch_size > 1 else path
        file_name = '{}.wav'.format(file_name)
        write_wav(file_name, audio, sample_rate)
        print('Generated sample output to {}'.format(file_name))
    print('Done')
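# The loop above accumulates one big frame per iteration and then strips the
# seed frame. This self-contained sketch reproduces that structure with a
# stand-in "model" that emits random frames (all names and sizes here are
# illustrative, not from the original code):

def _demo_frame_accumulation():
    rng = np.random.default_rng(0)
    batch_size, big_frame_size, num_samps = 2, 64, 256
    fake_model = lambda prev: rng.integers(
        0, 256, (batch_size, big_frame_size, 1))
    frames = [np.zeros((batch_size, big_frame_size, 1), dtype=np.int64)]  # seed
    for i in range(num_samps // big_frame_size):
        frames.append(fake_model(frames[i]))
    out = np.concatenate(frames, axis=1)[:, big_frame_size:, :]  # drop seed
    assert out.shape == (batch_size, num_samps, 1)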
def train_step(inputs):
    with tf.GradientTape() as tape:
        # Quantize the raw audio and predict a distribution over the
        # q_levels quantization bins for each sample after the first big
        # frame (which serves as conditioning context only).
        inputs = quantize(inputs, q_type, q_levels)
        raw_output = model(inputs, training=True)
        prediction = tf.reshape(raw_output, [-1, q_levels])
        target = tf.reshape(inputs[:, model.big_frame_size:, :], [-1])
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=prediction, labels=target))
    # Clip gradients by global norm before applying them.
    grads = tape.gradient(loss, model.trainable_variables)
    grads, _ = tf.clip_by_global_norm(grads, 5.0)
    opt.apply_gradients(list(zip(grads, model.trainable_variables)))
    train_accuracy.update_state(target, prediction)
    return loss
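# A minimal sketch of a training loop that could drive train_step. The
# original module defines `opt` and `train_accuracy` somewhere in scope;
# the specific optimiser and metric below, the `run_training` name, and
# `dataset`/`num_epochs` are all assumptions. Note that train_step
# quantizes internally, so `dataset` should yield raw (unquantized) batches.

opt = tf.keras.optimizers.Adam(learning_rate=1e-3)            # assumed choice
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()  # assumed choice

def run_training(dataset, num_epochs):
    for epoch in range(num_epochs):
        for batch in dataset:
            loss = train_step(batch)
        print(f'epoch {epoch}: loss={float(loss):.4f}, '
              f'accuracy={float(train_accuracy.result()):.4f}')
        train_accuracy.reset_states()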
def generate_and_save_samples(model, path, seed, seed_offset=0,
                              dur=OUTPUT_DUR, sample_rate=SAMPLE_RATE,
                              temperature=SAMPLING_TEMPERATURE):
    q_type = model.q_type
    q_levels = model.q_levels
    q_zero = q_levels // 2
    num_samps = dur * sample_rate
    # Precompute sample sequences, initialised to q_zero.
    samples = np.full((model.batch_size, model.big_frame_size + num_samps, 1),
                      q_zero, dtype='int32')
    # Set seed if provided.
    if seed is not None:
        seed_audio = load_seed_audio(seed, seed_offset, model.big_frame_size)
        samples[:, :model.big_frame_size, :] = quantize(
            seed_audio, q_type, q_levels)
    print_progress_every = 250
    start_time = time.time()
    # Run the model tiers. Generates a single sample per step. Each
    # frame-level tier consumes one frame of samples per step.
    for t in range(model.big_frame_size, model.big_frame_size + num_samps):
        # Top tier (runs every big_frame_size steps).
        if t % model.big_frame_size == 0:
            inputs = samples[:, t - model.big_frame_size : t, :].astype('float32')
            big_frame_outputs = model.big_frame_rnn(inputs)
        # Middle tier (runs every frame_size steps).
        if t % model.frame_size == 0:
            inputs = samples[:, t - model.frame_size : t, :].astype('float32')
            big_frame_output_idx = (t // model.frame_size) % (
                model.big_frame_size // model.frame_size)
            frame_outputs = model.frame_rnn(
                inputs,
                conditioning_frames=unsqueeze(
                    big_frame_outputs[:, big_frame_output_idx, :], 1))
        # Sample-level tier (runs once per step).
        inputs = samples[:, t - model.frame_size : t, :]
        frame_output_idx = t % model.frame_size
        sample_outputs = model.sample_mlp(
            inputs,
            conditioning_frames=unsqueeze(
                frame_outputs[:, frame_output_idx, :], 1))
        # Generate.
        sample_outputs = tf.reshape(sample_outputs, [-1, q_levels])
        generated = sample(sample_outputs, temperature)
        # Monitor progress.
        start = t - model.big_frame_size
        if start % print_progress_every == 0:
            end = min(start + print_progress_every, num_samps)
            duration = time.time() - start_time
            template = ('Generating samples {} - {} of {} '
                        '(time elapsed: {:.3f} seconds)')
            print(template.format(start + 1, end, num_samps, duration))
        # Update sequences.
        samples[:, t] = np.array(generated).reshape([-1, 1])
    # Save sequences to disk, dropping the initial seed frame.
    path = path.split('.wav')[0]
    for i in range(model.batch_size):
        seq = samples[i].reshape([-1, 1])[model.big_frame_size:].tolist()
        audio = dequantize(seq, q_type, q_levels)
        file_name = '{}_{}'.format(path, i) if model.batch_size > 1 else path
        file_name = '{}.wav'.format(file_name)
        write_wav(file_name, audio, sample_rate)
        print('Generated sample output to {}'.format(file_name))
    print('Done')
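# Tier scheduling in the loop above: the top tier fires every big_frame_size
# steps, the middle tier every frame_size steps, and the sample MLP on every
# step. A tiny standalone illustration (the sizes here are hypothetical, much
# smaller than real model settings):

def _demo_tier_schedule(big_frame_size=8, frame_size=2):
    for t in range(big_frame_size, big_frame_size + 8):
        tiers = ['sample_mlp']
        if t % frame_size == 0:
            tiers.insert(0, 'frame_rnn')
        if t % big_frame_size == 0:
            tiers.insert(0, 'big_frame_rnn')
        print(t, tiers)  # t=8 runs all three tiers, t=10 the lower two, etc.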
def generate(model, num_seqs=NUM_SEQS, dur=OUTPUT_DUR,
             sample_rate=SAMPLE_RATE, temperature=SAMPLING_TEMPERATURE,
             seed=None, seed_offset=None):
    q_type = model.q_type
    q_levels = model.q_levels
    q_zero = q_levels // 2
    num_samps = dur * sample_rate
    # Temperature may be scalar or a per-sample schedule; get_temperature
    # expands it to the expected shape.
    temperature = get_temperature(temperature, num_seqs, num_samps, dur)
    # Precompute sample sequences, initialised to q_zero.
    samples = []
    init_samples = np.full((model.batch_size, model.big_frame_size, 1), q_zero)
    # Set seed if provided. In this variant `seed` is audio data rather than
    # a file path, so test against None instead of relying on truthiness
    # (which raises for multi-element arrays). Note that `seed_offset` is
    # unused here.
    if seed is not None:
        seed_audio = tf.convert_to_tensor(seed)
        init_samples[:, :model.big_frame_size, :] = quantize(
            seed_audio, q_type, q_levels)
    init_samples = tf.constant(init_samples, dtype=tf.int32)
    samples.append(init_samples)
    print_progress_every = NUM_FRAMES_TO_PRINT * model.big_frame_size
    start_time = time.time()
    # Rolling window of per-frame generation times, used to estimate the
    # time remaining.
    stats = [0.0] * 10
    for i in range(num_samps // model.big_frame_size):
        t = i * model.big_frame_size
        # Select this frame's slice of the temperature schedule, if any.
        temp = temperature
        if temp.shape[-1] > 1:
            start = i * model.big_frame_size
            stop = (i + 1) * model.big_frame_size
            temp = temperature[:, start:stop]
        # Generate one big frame of samples.
        gen_start_time = time.time()
        frame_samples = model(samples[i], training=False, temperature=temp)
        gen_end_time = time.time()
        samples.append(frame_samples)
        del stats[0]
        stats.append(gen_end_time - gen_start_time)
        # Monitor progress.
        if t % print_progress_every == 0:
            end = min(t + print_progress_every, num_samps)
            step_dur = time.time() - start_time
            stats_num = min(i + 1, len(stats)) * model.big_frame_size
            stats_dur = sum(stats)
            time_rem = 0
            if stats_dur > 0:
                rate = stats_num / stats_dur
                num_rem = num_samps - t
                time_rem = int(round(num_rem / rate))
            remaining = format_dur(time_rem)
            print(f'Generated samples {t+1} - {end} of {num_samps} '
                  f'(time elapsed: {step_dur:.3f} seconds, '
                  f'remaining: {remaining})')
    samples = tf.concat(samples, axis=1)
    # Drop the initial seed frame.
    samples = samples[:, model.big_frame_size:, :]
    # Yield each finished sequence. The seed frame has already been removed
    # above, so the sequences are not trimmed a second time here.
    for i in range(model.batch_size):
        seq = np.reshape(samples[i], (-1, 1)).tolist()
        audio = dequantize(seq, q_type, q_levels)
        yield audio.numpy()
    print('Done')
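# Hypothetical consumer of the streaming generate() above: it yields one
# finished sequence per batch entry, so each yield can be written straight to
# disk. write_wav and SAMPLE_RATE come from this codebase; the function name
# and file-name pattern below are placeholders.

def save_generated(model, path='generated', num_seqs=2, dur=10):
    for i, audio in enumerate(generate(model, num_seqs=num_seqs, dur=dur)):
        write_wav(f'{path}_{i}.wav', audio, SAMPLE_RATE)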