def initialize_networks(args, device):
    """Instantiate the six networks of the two-domain translation model.

    Builds an encoder/decoder pair per domain (A and B) plus one
    discriminator per domain, moves each to `device`, and prints their
    parameter summaries.

    Args:
        args: parsed options carrying the architecture hyper-parameters
            (in_ngc, ngf, img_size, out_ngc, in_ndc, out_ndc, ndf).
        device: torch device the networks are moved to.

    Returns:
        list: [En_A, En_B, De_A, De_B, Disc_A, Disc_B].
    """
    # Both domains share identical architectures, so reuse one kwargs dict
    # per network type instead of repeating the argument lists.
    enc_cfg = dict(in_nc=args.in_ngc, nf=args.ngf, img_size=args.img_size)
    dec_cfg = dict(out_nc=args.out_ngc, nf=args.ngf)
    disc_cfg = dict(in_nc=args.in_ndc, out_nc=args.out_ndc, nf=args.ndf,
                    img_size=args.img_size)

    En_A = networks.encoder(**enc_cfg).to(device)
    En_B = networks.encoder(**enc_cfg).to(device)
    De_A = networks.decoder(**dec_cfg).to(device)
    De_B = networks.decoder(**dec_cfg).to(device)
    Disc_A = networks.discriminator(**disc_cfg).to(device)
    Disc_B = networks.discriminator(**disc_cfg).to(device)

    all_networks = [En_A, En_B, De_A, De_B, Disc_A, Disc_B]

    print('---------- Networks initialized -------------')
    for net in all_networks:
        utils.print_network(net)
    print('-----------------------------------------------')

    return all_networks
def generator(speaker_embedding, inputs, is_training=True, scope_name='generator', reuse=None):
    '''Generate features conditioned on a speaker embedding.

    Args:
      speaker_embedding: A `Tensor` with type `float32` contains speaker information. [N, E]
      inputs: A `Tensor` with type `float32` contains speech features.
      is_training: Boolean, whether to train or inference.
      scope_name: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A decoded `Tensor` with aim speaker.
      vae mu vector.
      vae log_var vector.
    '''
    with tf.variable_scope(scope_name, reuse=reuse):
        sample, mu, log_var = encoder(inputs, is_training=is_training, scope='vae_encoder')  # [N, T, E]
        # BUG FIX: speaker_embedding is rank-2 [N, E]; tf.tile below uses
        # rank-3 multiples, so the time axis must be inserted first.
        # Previously this expand_dims was commented out, which made the
        # tile call fail at graph-construction time.
        speaker_embedding = tf.expand_dims(speaker_embedding, axis=1)  # [N, 1, E]
        # Repeat the speaker embedding across every time step of the latent.
        speaker_embedding = tf.tile(speaker_embedding, [1, tf.shape(sample)[1], 1])  # [N, T, E]
        encoded = tf.concat((speaker_embedding, sample), axis=-1)  # [N, T, E+G]
        outputs = decoder(encoded, is_training=is_training, scope='vae_decoder')
        return outputs, mu, log_var  # [N, T, C]
def convert_encoder_to_pb():
    """Freeze the VGG-19-based encoder into a standalone GraphDef file.

    Builds an inference graph over uint8 image batches, initializes the
    encoder weights from the pretrained VGG-19 checkpoint, converts all
    variables to constants, and writes the result to inference/encoder.pb.
    """
    graph = tf.Graph()
    config = tf.ConfigProto()
    # Restrict which GPU(s) TensorFlow may see for this conversion.
    config.gpu_options.visible_device_list = GPU_TO_USE
    with graph.as_default():
        # NHWC uint8 input; converted to float before the encoder.
        raw_images = tf.placeholder(dtype=tf.uint8, shape=[None, None, None, 3], name='images')
        # The five feature maps exported from the encoder (Relu_1_1 .. Relu_5_1).
        names = ['Relu_{}_1'.format(X) for X in range(1, 6)]
        features = encoder(tf.to_float(raw_images))
        # tf.identity gives each exported tensor a stable, predictable node name.
        features = [tf.identity(features[n], n) for n in names]
        # Map pretrained 'vgg_19/' variables onto this graph's 'encoder/' scope.
        tf.train.init_from_checkpoint('pretrained/vgg_19.ckpt', {'vgg_19/': 'encoder/'})
    with tf.Session(graph=graph, config=config) as sess:
        sess.run(tf.global_variables_initializer())
        # output ops
        keep_nodes = names
        # Bake variable values into constants so the .pb is self-contained.
        input_graph_def = tf.graph_util.convert_variables_to_constants(
            sess, graph.as_graph_def(), output_node_names=keep_nodes)
        # Strip training-only nodes, but never the exported outputs.
        output_graph_def = tf.graph_util.remove_training_nodes(
            input_graph_def, protected_nodes=keep_nodes)
        with tf.gfile.GFile('inference/encoder.pb', 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))
def train_autoencoder(X_dir, Y_dir, batch_size, dim, X_channels, Y_channels, log_dir, shuffle, **kwargs):
    """Train an encoder/decoder model mapping images to label maps.

    Loads paired X/Y file lists, builds train/validation generators,
    assembles an encoder->decoder Keras model, and fits it for 50 epochs
    with TensorBoard logging.

    Args:
        X_dir, Y_dir: directories of input images and target labels.
        batch_size: generator batch size.
        dim: spatial dimensions tuple of the inputs.
        X_channels, Y_channels: channel counts for inputs and targets.
        log_dir: TensorBoard log directory.
        shuffle: whether the generators shuffle each epoch.
        **kwargs: ignored extras for interface compatibility.
    """
    # Dataset: pair up filenames and split them into train/validation.
    partition = partition_dataset(load_dataset(X_dir, Y_dir))

    gen_args = (batch_size, dim, X_channels, Y_channels, shuffle)
    train_gen = DataGenerator(partition['train'], *gen_args)
    val_gen = DataGenerator(partition['validation'], *gen_args)

    # Model: encode the image, then decode straight into the label space.
    image_in = Input(shape=(*dim, X_channels))
    encode = encoder(n_features=8)
    decode = decoder(n_output_features=Y_channels, n_features=8)
    latent = encode(image_in)
    # TODO Put res_net here for image to label translation
    label_out = decode(latent)

    img2lbl = Model(image_in, label_out)
    img2lbl.compile(optimizer='adadelta', loss='mean_squared_error')

    # Report architecture.
    img2lbl.summary()
    print('Model contains a total of %d trainable layers.\n' % len(img2lbl.trainable_weights))

    # Train with image snapshots and scalar logging in TensorBoard.
    callbacks = [
        TensorBoard(log_dir=log_dir),
        TensorBoardImage(log_dir=log_dir, validation_data=val_gen),
    ]
    img2lbl.fit_generator(generator=train_gen,
                          validation_data=val_gen,
                          epochs=50,
                          callbacks=callbacks,
                          use_multiprocessing=True,
                          workers=2)
def generator(inputs, is_training=True, scope_name='generator', reuse=None):
    """VAE generator: encode `inputs` to a latent sample, then decode it.

    Args:
        inputs: float32 tensor of speech features.
        is_training: Boolean, train vs. inference mode for the sub-networks.
        scope_name: variable scope wrapping encoder and decoder.
        reuse: whether to reuse variables of a previous call with the same scope.

    Returns:
        (outputs, mu, log_var): the decoded features plus the VAE posterior
        mean and log-variance produced by the encoder.
    """
    with tf.variable_scope(scope_name, reuse=reuse):
        sample, mu, log_var = encoder(inputs, is_training=is_training, scope='vae_encoder')  # [N, T, E]
        # Speaker conditioning is disabled in this variant; the decoder sees
        # only the sampled latent. (Kept for reference:)
        # speaker_embedding = tf.tile(speaker_embedding, [1, tf.shape(sample)[1], 1])  # [N, T, E]
        # tf.tile() expands a tensor by repeating it along each dimension the
        # given number of times.
        # encoded = tf.concat((speaker_embedding, sample), axis=-1)  # [N, T, E+G]
        outputs = decoder(sample, is_training=is_training, scope='vae_decoder')
        return outputs, mu, log_var  # [N, T, C]
def get_encoding_mu_logvar(encoding_image):
    """Encode an RGBD image into style-noise distribution parameters.

    Args:
      encoding_image: [B, H, W, 4] input RGBD image to run Infinite Nature on

    Returns:
      tuple of tensors ([B, z_dim], [B, z_dim]) corresponding to mu and
      logvar normal parameters.
    """
    # AUTO_REUSE lets repeated calls share the same encoder weights under
    # the "generator" scope.
    generator_scope = tf.compat.v1.variable_scope(
        "generator", reuse=tf.compat.v1.AUTO_REUSE)
    with generator_scope:
        return networks.encoder(encoding_image)
def forward_pass(self, inputs, is_training, reuse=False):
    """Run one encode/decode pass over `inputs`.

    The encoder emits twice `zdim` outputs (mean and Sigma of the latent);
    the decoder reconstructs from the sampled latent.

    Returns:
        (enc_z, enc_mean, enc_Sigma, dec_x, dec_mean)
    """
    # Encoder produces both halves of the Gaussian posterior in one head.
    posterior_dim = 2 * self.opts['zdim']
    enc_z, enc_mean, enc_Sigma = encoder(
        self.opts,
        input=inputs,
        output_dim=posterior_dim,
        scope='encoder',
        reuse=reuse,
        is_training=is_training)

    # Decode the sampled latent back into data space.
    dec_x, dec_mean = decoder(
        self.opts,
        input=enc_z,
        output_dim=self.output_dim,
        scope='decoder',
        reuse=reuse,
        is_training=is_training)

    return enc_z, enc_mean, enc_Sigma, dec_x, dec_mean
def forward_pass(self, inputs, is_training, reuse=False): """Performs a full pass over the model. inputs: [batch,imgdim] return: enc_cat_logits: [batch,K] enc_z/enc_gauss_mean/enc_gauss_Sigma: [batch,K,zdim] dec_mean, dec_Sigma: [batch,K,imgdim] """ # Encode enc_cat_logits, enc_gauss_mean, enc_gauss_Sigma = encoder( self.opts, input=inputs, cat_output_dim=self.opts['nmixtures'], gaus_output_dim=2 * self.opts['zdim'], scope='encoder', reuse=reuse, is_training=is_training) enc_gauss_mean = tf.reshape( enc_gauss_mean, [-1, self.opts['nmixtures'], self.opts['zdim']]) enc_gauss_Sigma = tf.reshape( enc_gauss_Sigma, [-1, self.opts['nmixtures'], self.opts['zdim']]) enc_z = sample_all_gmm(self.opts, enc_gauss_mean, enc_gauss_Sigma) #[batch,nmixtures,zdim] enc_z_flat = tf.reshape(enc_z, [-1, self.opts['zdim']]) # Decode dec_mean, dec_Sigma = decoder(self.opts, input=enc_z_flat, output_dim=self.output_dim, scope='decoder', reuse=reuse, is_training=is_training) outshape = [ -1, self.opts['nmixtures'], np.prod(datashapes[self.opts['dataset']]) ] dec_mean = tf.reshape(dec_mean, outshape) dec_Sigma = tf.reshape(dec_Sigma, outshape) return enc_cat_logits, enc_z, enc_gauss_mean, enc_gauss_Sigma, dec_mean, dec_Sigma
def __init__(self, options): super(ArtGAN, self).__init__() # build model self.encoder = encoder(options) self.decoder = decoder(options) self.discriminator = discriminator(options) self.discriminator_weight = { "pred_1": 1., "pred_2": 1., "pred_4": 1., "pred_6": 1., "pred_7": 1. } self.loss = nn.BCEWithLogitsLoss(reduction='mean') self.mse = nn.MSELoss(reduction='mean') self.abs = nn.L1Loss(reduction='mean') # Setup the optimizers dis_params = list(self.discriminator.parameters()) gen_params = list(self.encoder.parameters()) + list( self.decoder.parameters()) self.dis_opt = torch.optim.Adam( [p for p in dis_params if p.requires_grad], lr=options.lr, betas=(0.5, 0.999), weight_decay=0.0001, amsgrad=True) self.gen_opt = torch.optim.Adam( [p for p in gen_params if p.requires_grad], lr=options.lr, betas=(0.5, 0.999), weight_decay=0.0001, amsgrad=True) self.dis_scheduler = get_scheduler(self.dis_opt, options) self.gen_scheduler = get_scheduler(self.gen_opt, options) # Network weight initialization self.apply(weights_init(options.init)) self.discriminator.apply(weights_init('gaussian')) self.gener_loss = torch.tensor(0.) self.discr_loss = torch.tensor(0.)
def __init__(self, mode="train"):
    """Build the TTS model graph for the given mode.

    Args:
        mode: one of "train", "eval", or "synthesize". Train mode reads
            batches from the input pipeline; the other modes build
            placeholder-fed graphs.
    """
    print("Loading your model...")

    # Initialize values used in class
    self.mode = mode
    self.global_step = None
    # FIX: `self.mel_loss = None` was duplicated; keep a single assignment.
    self.mel_loss = None
    self.mag_loss = None
    self.learning_rate = None
    self.optimizer = None
    self.merged = None
    self.gradients = None
    self.clipped = None
    self.gvs = None
    self.opt_train = None

    # Training flag (simplified from an if/else over the same comparison).
    self.is_training = mode == "train"

    print("Loading inputs...")
    # Load inputs: queue-fed batches for training, placeholders otherwise.
    if self.is_training:
        self.txt, self.mels, self.mags, self.file_names, self.num_batch = get_batch(
        )
    elif mode == "synthesize":
        self.txt = tf.placeholder(tf.int32, shape=(None, None))
        self.mels = tf.placeholder(tf.float32,
                                   shape=(None, None,
                                          N_MELS * REDUCTION_FACTOR))
    else:  # eval
        self.txt = tf.placeholder(tf.int32, shape=(None, None))
        self.mels = tf.placeholder(tf.float32,
                                   shape=(None, None,
                                          N_MELS * REDUCTION_FACTOR))
        self.mags = tf.placeholder(tf.float32,
                                   shape=(None, None, 1 + N_FFT // 2))
        self.file_names = tf.placeholder(tf.string, shape=(None, ))

    # decoder inputs: shift mels right by one frame (teacher forcing) and
    # feed only the last N_MELS channels of each reduced frame group.
    self.decoder_inputs = tf.concat(
        (tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]), 1)
    self.decoder_inputs = self.decoder_inputs[:, :, -N_MELS:]

    # Networks
    with tf.variable_scope("Networks"):
        print("Loading the encoder...")
        # encoder: text -> attention memory
        self.memory = encoder(self.txt, is_training=self.is_training)

        print("Loading the decoder...")
        # decoder: attends over memory to predict mel frames
        self.mel_hat, self.alignments = decoder(
            self.decoder_inputs, self.memory, is_training=self.is_training)

        print("Loading the post CBHG module...")
        # CBHG module refines mels into linear-scale magnitudes
        self.mags_hat = cbhg_helper(self.mel_hat,
                                    N_MELS,
                                    is_training=self.is_training,
                                    post=True)

        print("Audio out")
        # Griffin-Lim style reconstruction of the first sample, for monitoring.
        self.audio_out = tf.py_func(spectrogram2wav, [self.mags_hat[0]],
                                    tf.float32)

    # Training and evaluation
    if mode in ("train", "eval"):
        print("Generating Loss...")
        self.loss = self.get_loss()
        print("Getting the optimizer ready...")
        self.optimize()
        print("Setting up your summary...")
        self.summarize()
def __init__(self, training=True):
    """Build the full synthesis graph (encoder, decoder, converter).

    Args:
        training: True builds the queue-fed training graph with losses and
            the train op; False builds a placeholder-fed evaluation graph.
    """
    # Load vocabulary
    self.char2idx, self.idx2char = load_vocab()

    self.graph = tf.Graph()
    with self.graph.as_default():
        # Data Feeding
        ## x: Text. (N, T_x), int32
        ## y1: Reduced melspectrogram. (N, T_y//r, n_mels*r) float32
        ## y2: Reduced dones. (N, T_y//r,) int32
        ## z: Magnitude. (N, T_y, n_fft//2+1) float32
        if training:
            self.x, self.y1, self.y2, self.z, self.num_batch = get_batch()
            self.prev_max_attentions = tf.constant([0] * hp.batch_size)
        else:  # Evaluation
            self.x = tf.placeholder(tf.int32, shape=(hp.batch_size, hp.T_x))
            self.y1 = tf.placeholder(tf.float32,
                                     shape=(hp.batch_size, hp.T_y // hp.r,
                                            hp.n_mels * hp.r))
            self.prev_max_attentions = tf.placeholder(
                tf.int32, shape=(hp.batch_size, ))

        # Get decoder inputs: feed last frames only (N, T_y//r, n_mels)
        # (teacher forcing: mels shifted right by one reduced frame)
        self.decoder_input = tf.concat((tf.zeros_like(
            self.y1[:, :1, -hp.n_mels:]), self.y1[:, :-1, -hp.n_mels:]), 1)

        # Networks
        with tf.variable_scope("net"):
            # Encoder. keys: (N, T_x, e), vals: (N, T_x, e)
            self.keys, self.vals, self.masks = encoder(self.x,
                                                       training=training,
                                                       scope="encoder")

            # Decoder. mel_output: (N, T_y/r, n_mels*r), done_output: (N, T_y/r, 2),
            # decoder_output: (N, T_y/r, e), alignments: (N, T_y, T_x)
            self.mel_output, self.done_output, self.decoder_output, self.alignments, self.max_attentions = decoder(
                self.decoder_input,
                self.keys,
                self.vals,
                self.masks,
                self.prev_max_attentions,
                training=training,
                scope="decoder",
                reuse=None)

            # Restore shape. converter_input: (N, T_y, e/r)
            self.converter_input = tf.reshape(self.decoder_output,
                                              (hp.batch_size, hp.T_y, -1))
            self.converter_input = normalize(self.converter_input,
                                             type=hp.norm_type,
                                             training=training,
                                             activation_fn=tf.nn.relu)

            # Converter. mag_output: (N, T_y, 1+n_fft//2)
            self.mag_output = converter(self.converter_input,
                                        training=training,
                                        scope="converter")

        if training:
            # Loss: L1 on mels + cross-entropy on done flags + L1 on mags.
            self.loss1_mae = tf.reduce_mean(
                tf.abs(self.mel_output - self.y1))
            self.loss1_ce = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.done_output, labels=self.y2))
            self.loss2 = tf.reduce_mean(tf.abs(self.mag_output - self.z))
            self.loss = self.loss1_mae + self.loss1_ce + self.loss2

            # Training Scheme
            self.global_step = tf.Variable(0,
                                           name='global_step',
                                           trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)

            ## gradient clipping: first by value, then by norm, per gradient.
            self.gvs = self.optimizer.compute_gradients(self.loss)
            self.clipped = []
            for grad, var in self.gvs:
                grad = tf.clip_by_value(grad, -1. * hp.max_grad_val,
                                        hp.max_grad_val)
                grad = tf.clip_by_norm(grad, hp.max_grad_norm)
                self.clipped.append((grad, var))
            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

            # Summary
            tf.summary.scalar('loss', self.loss)
            tf.summary.scalar('loss1_mae', self.loss1_mae)
            tf.summary.scalar('loss1_ce', self.loss1_ce)
            tf.summary.scalar('loss2', self.loss2)
            self.merged = tf.summary.merge_all()
transforms.ToTensor(), transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) ]) train_loader_A = utils.data_load(os.path.join('data', args.dataset), 'trainA', transform, args.batch_size, shuffle=True, drop_last=True) train_loader_B = utils.data_load(os.path.join('data', args.dataset), 'trainB', transform, args.batch_size, shuffle=True, drop_last=True) test_loader_A = utils.data_load(os.path.join('data', args.dataset), 'testA', transform, 1, shuffle=True, drop_last=True) test_loader_B = utils.data_load(os.path.join('data', args.dataset), 'testB', transform, 1, shuffle=True, drop_last=True) print('------------ Datasets -------------') print('TrainA:', len(train_loader_A)) print('TrainB:', len(train_loader_B)) print('TestA:', len(test_loader_A)) print('TestB:', len(test_loader_B)) print('-------------- End ----------------') # network En_A = networks.encoder(in_nc=args.in_ngc, nf=args.ngf, img_size=args.img_size).to(device) En_B = networks.encoder(in_nc=args.in_ngc, nf=args.ngf, img_size=args.img_size).to(device) De_A = networks.decoder(out_nc=args.out_ngc, nf=args.ngf).to(device) De_B = networks.decoder(out_nc=args.out_ngc, nf=args.ngf).to(device) Disc_A = networks.discriminator(in_nc=args.in_ndc, out_nc=args.out_ndc, nf=args.ndf, img_size=args.img_size).to(device) Disc_B = networks.discriminator(in_nc=args.in_ndc, out_nc=args.out_ndc, nf=args.ndf, img_size=args.img_size).to(device) En_A.train() En_B.train() De_A.train() De_B.train() Disc_A.train() Disc_B.train() print('---------- Networks initialized -------------') utils.print_network(En_A) utils.print_network(En_B) utils.print_network(De_A)
def __init__(self, mode="train"):
    """Build the Tacotron graph.

    Args:
        mode: "train" (queue-fed, with losses/train op), "eval"
            (placeholder-fed, with losses), or anything else for synthesis
            (placeholder-fed, no losses).
    """
    # Load vocabulary
    self.char2idx, self.idx2char = load_vocab()

    # Set phase
    is_training=True if mode=="train" else False

    # Graph
    # Data Feeding
    # x: Text. (N, Tx)
    # y: Reduced melspectrogram. (N, Ty//r, n_mels*r)
    # z: Magnitude. (N, Ty, n_fft//2+1)
    if mode=="train":
        self.x, self.y, self.z, self.fnames, self.num_batch = get_batch()
    elif mode=="eval":
        self.x = tf.placeholder(tf.int32, shape=(None, None))
        self.y = tf.placeholder(tf.float32, shape=(None, None, hp.n_mels*hp.r))
        self.z = tf.placeholder(tf.float32, shape=(None, None, 1+hp.n_fft//2))
        self.fnames = tf.placeholder(tf.string, shape=(None,))
    else: # Synthesize
        self.x = tf.placeholder(tf.int32, shape=(None, None))
        self.y = tf.placeholder(tf.float32, shape=(None, None, hp.n_mels * hp.r))

    # Get encoder/decoder inputs
    self.encoder_inputs = embed(self.x, len(hp.vocab), hp.embed_size) # (N, T_x, E)
    # Teacher forcing: shift mels right by one reduced frame.
    self.decoder_inputs = tf.concat((tf.zeros_like(self.y[:, :1, :]), self.y[:, :-1, :]), 1) # (N, Ty/r, n_mels*r)
    self.decoder_inputs = self.decoder_inputs[:, :, -hp.n_mels:] # feed last frames only (N, Ty/r, n_mels)

    # Networks
    with tf.variable_scope("net"):
        # Encoder
        self.memory = encoder(self.encoder_inputs, is_training=is_training) # (N, T_x, E)

        # Decoder1
        self.y_hat, self.alignments = decoder1(self.decoder_inputs,
                                               self.memory,
                                               is_training=is_training) # (N, T_y//r, n_mels*r)

        # Decoder2 or postprocessing
        self.z_hat = decoder2(self.y_hat, is_training=is_training) # (N, T_y//r, (1+n_fft//2)*r)

        # monitor: waveform reconstruction of the first predicted sample.
        self.audio = tf.py_func(spectrogram2wav, [self.z_hat[0]], tf.float32)

    if mode in ("train", "eval"):
        # Loss: L1 on mel and magnitude predictions.
        self.loss1 = tf.reduce_mean(tf.abs(self.y_hat - self.y))
        self.loss2 = tf.reduce_mean(tf.abs(self.z_hat - self.z))
        self.loss = self.loss1 + self.loss2

        # Training Scheme
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.lr = learning_rate_decay(hp.lr, global_step=self.global_step)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)

        ## gradient clipping (by norm only)
        self.gvs = self.optimizer.compute_gradients(self.loss)
        self.clipped = []
        for grad, var in self.gvs:
            grad = tf.clip_by_norm(grad, 5.)
            self.clipped.append((grad, var))
        self.train_op = self.optimizer.apply_gradients(self.clipped, global_step=self.global_step)

        # Summary
        # NOTE(review): loss2 has no scalar summary here, unlike loss1 —
        # possibly an oversight; confirm before adding.
        tf.summary.scalar('{}/loss1'.format(mode), self.loss1)
        tf.summary.scalar('{}/loss'.format(mode), self.loss)
        tf.summary.scalar('{}/lr'.format(mode), self.lr)

        tf.summary.image("{}/mel_gt".format(mode), tf.expand_dims(self.y, -1), max_outputs=1)
        tf.summary.image("{}/mel_hat".format(mode), tf.expand_dims(self.y_hat, -1), max_outputs=1)
        tf.summary.image("{}/mag_gt".format(mode), tf.expand_dims(self.z, -1), max_outputs=1)
        tf.summary.image("{}/mag_hat".format(mode), tf.expand_dims(self.z_hat, -1), max_outputs=1)

        tf.summary.audio("{}/sample".format(mode), tf.expand_dims(self.audio, 0), hp.sr)
        self.merged = tf.summary.merge_all()
def model_fn(features, labels, mode, params, config):
    """
    This is a function for creating a computational tensorflow graph.
    The function is in format required by tf.estimator.

    Builds an autoencoder over one VGG feature level: encode the images,
    decode back to pixels, then re-encode the reconstruction and penalize
    both the pixel error and the feature error.

    Args:
        features: input image batch (used directly as `images`).
        labels: unused (autoencoding task).
        mode: a tf.estimator.ModeKeys value; PREDICT is not supported.
        params: dict with 'feature_to_use', 'pretrained_checkpoint',
            'weight_decay', 'lambda', learning-rate schedule keys.
        config: estimator RunConfig (unused here).
    """
    images = features
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # build the main graph
    feature_to_use = params['feature_to_use']  # Relu_X_1
    encoding = encoder(images)[feature_to_use]
    restored_images = decoder(encoding, feature_to_use)
    # Re-encode the reconstruction for the feature-space loss below.
    encoding_of_restored_images = encoder(restored_images)[feature_to_use]

    # use a pretrained backbone network
    if is_training:
        with tf.name_scope('init_from_checkpoint'):
            tf.train.init_from_checkpoint(params['pretrained_checkpoint'],
                                          {'vgg_19/': 'encoder/'})

    assert mode != tf.estimator.ModeKeys.PREDICT

    # add L2 regularization
    with tf.name_scope('weight_decay'):
        add_weight_decay(params['weight_decay'])
        regularization_loss = tf.losses.get_regularization_loss()

    # Normalize losses by 255 (pixel scale) times the batch size.
    batch_size = tf.to_float(tf.shape(images)[0])
    normalizer = 255.0 * batch_size
    reconstruction_loss = tf.nn.l2_loss(images - restored_images) / normalizer
    features_loss = tf.nn.l2_loss(encoding - encoding_of_restored_images) / normalizer
    tf.losses.add_loss(reconstruction_loss)
    # 'lambda' weights the feature loss relative to the pixel loss.
    tf.losses.add_loss(params['lambda'] * features_loss)
    tf.summary.scalar('regularization_loss', regularization_loss)
    tf.summary.scalar('reconstruction_loss', reconstruction_loss)
    tf.summary.scalar('features_loss', features_loss)
    total_loss = tf.losses.get_total_loss(add_regularization_losses=True)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = {
            'val_reconstruction_loss': tf.metrics.mean(reconstruction_loss),
            'val_features_loss': tf.metrics.mean(features_loss)
        }
        return tf.estimator.EstimatorSpec(mode, loss=total_loss,
                                          eval_metric_ops=eval_metric_ops)

    assert mode == tf.estimator.ModeKeys.TRAIN
    with tf.variable_scope('learning_rate'):
        global_step = tf.train.get_global_step()
        learning_rate = tf.train.polynomial_decay(
            params['initial_learning_rate'], global_step,
            params['num_steps'], params['end_learning_rate'],
            power=1.0  # linear decay
        )
        tf.summary.scalar('learning_rate', learning_rate)

    with tf.variable_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate, beta1=0.9, beta2=0.999)
        train_op = optimizer.minimize(total_loss, global_step=global_step)

    return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op)
def __init__(self, mode="train"):
    """Build the Tacotron graph with optional guided attention.

    Args:
        mode: "train" (queue-fed, losses + train op), "eval"
            (placeholder-fed, losses only), or anything else for synthesis.
    """
    # Load vocabulary
    self.char2idx, self.idx2char = load_vocab()

    # Set phase
    is_training = True if mode == "train" else False

    # Graph
    # Data Feeding
    # x: Text. (N, Tx)
    # y: Reduced melspectrogram. (N, Ty//r, n_mels*r)
    # z: Magnitude. (N, Ty, n_fft//2+1)
    if mode == "train":
        self.x, self.y, self.z, self.fnames, self.num_batch = get_batch()
    elif mode == "eval":
        self.x = tf.placeholder(tf.int32, shape=(None, None))
        self.y = tf.placeholder(tf.float32,
                                shape=(None, None, hp.n_mels * hp.r))
        self.z = tf.placeholder(tf.float32,
                                shape=(None, None, 1 + hp.n_fft // 2))
        self.fnames = tf.placeholder(tf.string, shape=(None, ))
    else:  # Synthesize
        self.x = tf.placeholder(tf.int32, shape=(None, None))
        self.y = tf.placeholder(tf.float32,
                                shape=(None, None, hp.n_mels * hp.r))

    # Get encoder/decoder inputs
    self.encoder_inputs = embed(self.x, len(hp.vocab),
                                hp.embed_size)  # (N, T_x, E)
    # Teacher forcing: mels shifted right by one reduced frame.
    self.decoder_inputs = tf.concat(
        (tf.zeros_like(self.y[:, :1, :]), self.y[:, :-1, :]),
        1)  # (N, Ty/r, n_mels*r)
    self.decoder_inputs = self.decoder_inputs[:, :, -hp.
                                              n_mels:]  # feed last frames only (N, Ty/r, n_mels)

    # Networks
    with tf.variable_scope("net"):
        # Encoder
        self.memory = encoder(self.encoder_inputs,
                              is_training=is_training)  # (N, T_x, E)

        # Decoder1: scheduled sampling only when schedule_prob < 1.
        if int(hp.schedule_prob) == 1:
            self.y_hat, self.alignments = decoder1(self.decoder_inputs,
                                                   self.memory,
                                                   is_training=is_training)
        else:
            self.y_hat, self.alignments = decoder1_scheduled(
                self.decoder_inputs,
                self.memory,
                is_training=is_training,
                schedule=hp.schedule_prob)  # (N, T_y//r, n_mels*r)

        # Guided attention loss: penalize attention mass far from the
        # near-diagonal band (Gaussian width g).
        batch_size, N, T = tf.shape(self.alignments)[0], tf.shape(
            self.alignments)[1], tf.shape(self.alignments)[2]
        g = 0.2
        Ns = tf.tile(tf.expand_dims(tf.range(N) / N, 1),
                     [1, T])  # shape: [N, T]
        Ts = tf.tile(tf.expand_dims(tf.range(T) / T, 0),
                     [N, 1])  # shape: [N, T]
        W = tf.ones([N, T]) - tf.exp(
            -1 * (tf.cast(tf.square(Ns - Ts), tf.float32) /
                  (2 * tf.square(g))))
        nearly_diagonal_constraint = tf.multiply(
            self.alignments,
            tf.tile(tf.expand_dims(W, 0), [batch_size, 1, 1]))
        self.guided_attn_loss = tf.reduce_mean(nearly_diagonal_constraint)

        # Decoder2 or postprocessing
        self.z_hat = decoder2(
            self.y_hat,
            is_training=is_training)  # (N, T_y//r, (1+n_fft//2)*r)

        # monitor: predicted (and, in train mode, ground-truth) waveforms.
        self.audio_h = tf.py_func(spectrogram2wav, [self.z_hat[0]],
                                  tf.float32)
        if mode == "train":
            self.audio_gt = tf.py_func(spectrogram2wav, [self.z[0]],
                                       tf.float32)

    if mode in ("train", "eval"):
        # Loss
        self.loss1 = tf.reduce_mean(tf.abs(self.y_hat - self.y))
        self.loss2 = tf.reduce_mean(tf.abs(self.z_hat - self.z))
        if hp.guided_attention:
            self.loss = self.loss1 + self.loss2 + self.guided_attn_loss
        else:
            self.loss = self.loss1 + self.loss2

        # Training Scheme
        self.global_step = tf.Variable(0,
                                       name='global_step',
                                       trainable=False)
        self.lr = learning_rate_decay(hp.lr, global_step=self.global_step)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)

        ## gradient clipping
        self.gvs = self.optimizer.compute_gradients(self.loss)
        self.clipped = []
        for grad, var in self.gvs:
            grad = tf.clip_by_norm(grad, 5.)
            self.clipped.append((grad, var))
        self.train_op = self.optimizer.apply_gradients(
            self.clipped, global_step=self.global_step)

        # Summary
        if hp.guided_attention:
            tf.summary.scalar('{}/guided_attention_loss'.format(mode),
                              self.guided_attn_loss)
        tf.summary.scalar('{}/loss1'.format(mode), self.loss1)
        tf.summary.scalar('{}/loss2'.format(mode), self.loss2)
        tf.summary.scalar('{}/loss'.format(mode), self.loss)
        tf.summary.scalar('{}/lr'.format(mode), self.lr)

        tf.summary.image("{}/mel_gt".format(mode),
                         tf.expand_dims(self.y, -1),
                         max_outputs=1)
        tf.summary.image("{}/mel_hat".format(mode),
                         tf.expand_dims(self.y_hat, -1),
                         max_outputs=1)
        tf.summary.image("{}/mag_gt".format(mode),
                         tf.expand_dims(self.z, -1),
                         max_outputs=1)
        tf.summary.image("{}/mag_hat".format(mode),
                         tf.expand_dims(self.z_hat, -1),
                         max_outputs=1)
        tf.summary.image("{}/attention".format(mode),
                         tf.expand_dims(self.alignments, -1),
                         max_outputs=1)

        tf.summary.audio("{}/sample_hat".format(mode),
                         tf.expand_dims(self.audio_h, 0), hp.sr)
        # BUG FIX: self.audio_gt is only defined in "train" mode, but this
        # summary previously ran for eval as well, raising AttributeError
        # while building the eval graph. Guard it on train mode.
        if mode == "train":
            tf.summary.audio("{}/sample_gt".format(mode),
                             tf.expand_dims(self.audio_gt, 0), hp.sr)
        self.merged = tf.summary.merge_all()
def __init__(self, mode="train"):
    '''
    Builds the VQ-VAE speech graph.

    Args:
      mode: Either "train" or "eval".
    '''
    # Set flag
    training = True if mode=="train" else False

    # Graph
    # Data Feeding
    ## x: Quantized wav. (B, T, 1) int32
    ## wavs: Raw wav. (B, length) float32
    ## speakers: Speaker ids. (B,). [0, 108]. int32.
    if mode=="train":
        self.x, self.wavs, self.speaker_ids, self.num_batch = get_batch()
        # Targets equal inputs: the decoder reconstructs the waveform.
        self.y = self.x
    else: # test
        self.x = tf.placeholder(tf.int32, shape=(2, 63488, 1))
        self.y = tf.placeholder(tf.int32, shape=(2, 63488, 1))
        self.speaker_ids = tf.placeholder(tf.int32, shape=(2,))

    # inputs: decoder inputs are the targets shifted right one sample
    # (autoregressive teacher forcing).
    self.encoder_inputs = tf.to_float(self.x)
    self.decoder_inputs = tf.to_float(self.y)
    self.decoder_inputs = tf.concat((tf.zeros_like(self.decoder_inputs[:, :1, :]),
                                     self.decoder_inputs[:, :-1, :]), 1)

    # speaker embedding
    self.speakers = tf.one_hot(self.speaker_ids, len(hp.speakers)) # (B, len(speakers))

    # encoder
    self.z_e = encoder(self.encoder_inputs) # (B, T', D)

    # vq: snap continuous encodings to the nearest codebook vectors.
    self.z_q = vq(self.z_e) # (B, T', D)

    # decoder: y -> reconstructed logits.
    self.y_logits = decoder(self.decoder_inputs, self.speakers, self.z_q) # (B, T, Q)
    self.y_hat = tf.argmax(self.y_logits, -1) # (B, T)

    # monitor: mu-law decode the first two predicted waveforms.
    self.sample0 = tf.py_func(mu_law_decode, [self.y_hat[0]], tf.float32)
    self.sample1 = tf.py_func(mu_law_decode, [self.y_hat[1]], tf.float32)

    # speech samples
    # tf.summary.audio('{}/original1'.format(mode), self.wavs[:1], hp.sr, 1)
    # tf.summary.audio('{}/original2'.format(mode), self.wavs[1:], hp.sr, 1)
    tf.summary.audio('{}/sample0'.format(mode), tf.expand_dims(self.sample0, 0), hp.sr, 1)
    tf.summary.audio('{}/sample1'.format(mode), tf.expand_dims(self.sample1, 0), hp.sr, 1)

    self.global_step = tf.Variable(0, name='global_step', trainable=False)

    if training:
        # VQ-VAE losses: reconstruction CE, codebook loss, and the
        # beta-weighted commitment loss (stop_gradient placement per paper).
        self.dec_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.y_logits, labels=tf.squeeze(self.y)))
        self.vq_loss = tf.reduce_mean(tf.squared_difference(tf.stop_gradient(self.z_e), self.z_q))
        self.enc_loss = hp.beta * tf.reduce_mean(tf.squared_difference(self.z_e, tf.stop_gradient(self.z_q)))

        # decoder grads: plain gradients of the reconstruction loss.
        decoder_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "decoder")
        decoder_grads = tf.gradients(self.dec_loss, decoder_vars)
        decoder_grads_vars = list(zip(decoder_grads, decoder_vars))

        # embedding variables grads: codebook receives dec_loss + vq_loss.
        embed_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "vq")
        embed_grads = tf.gradients(self.dec_loss + self.vq_loss, embed_vars)
        embed_grads_vars = list(zip(embed_grads, embed_vars))

        # encoder grads: straight-through estimator — gradients w.r.t. z_q
        # are copied through to z_e, plus the commitment-loss gradient.
        encoder_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoder")
        transferred_grads = tf.gradients(self.dec_loss, self.z_q)
        encoder_grads = [tf.gradients(self.z_e, var, transferred_grads)[0] +
                         tf.gradients(self.enc_loss, var)[0] for var in encoder_vars]
        encoder_grads_vars = list(zip(encoder_grads, encoder_vars))

        # total grads
        self.grads_vars = decoder_grads_vars + embed_grads_vars + encoder_grads_vars

        # Training Scheme
        self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)

        # Summary
        tf.summary.scalar('train/dec_loss', self.dec_loss)
        tf.summary.scalar('train/vq_loss', self.vq_loss)
        tf.summary.scalar('train/enc_loss', self.enc_loss)
        # tf.summary.scalar("lr", self.lr)

        # gradient clipping (by value), then apply with UPDATE_OPS as deps
        # so e.g. batch-norm statistics update with each step.
        self.clipped = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in self.grads_vars]
        with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            self.train_op = self.optimizer.apply_gradients(self.clipped, global_step=self.global_step)

    # Summary
    self.merged = tf.summary.merge_all()
def build_enc_dec_connection(observation, constants):
    """Connect encoder and decoder through a reparameterized latent sample.

    Args:
        observation: input tensor fed to the encoder.
        constants: extra encoder configuration.

    Returns:
        ((mean, logstd), dec_out): the encoder's Gaussian parameters and
        the decoder output for the sampled latent.
    """
    mean, logstd = encoder(observation, constants)
    # Reparameterization trick: z = mean + std * eps with eps ~ N(0, I),
    # keeping the sampling step differentiable w.r.t. mean/logstd.
    noise = tf.random_normal(tf.shape(mean))
    latent = mean + tf.exp(logstd) * noise
    decoded = decoder(latent)
    return (mean, logstd), decoded
def __init__(self, training=True):
    """Build the full synthesis graph (encoder, decoder, converter).

    Args:
        training: True builds the queue-fed training graph with losses and
            the train op; False builds a placeholder-fed inference graph.
    """
    # Load vocabulary
    self.char2idx, self.idx2char = load_vocab()

    # Graph
    self.graph = tf.Graph()
    with self.graph.as_default():
        # Data Feeding
        ## x: Text. (N, Tx), int32
        ## y1: Melspectrogram. (N, Ty, n_mels) float32
        ## y2: Dones. (N, Ty) int32
        ## z: Magnitude. (N, Ty, n_fft//2+1) float32
        if training:
            self.x, self.y1, self.y2, self.z = get_batch()
            self.prev_max_attentions_li = tf.ones(shape=(hp.dec_layers,
                                                         hp.batch_size),
                                                  dtype=tf.int32)
        else:  # Inference
            self.x = tf.placeholder(tf.int32, shape=(hp.batch_size, hp.Tx))
            self.y1 = tf.placeholder(tf.float32,
                                     shape=(hp.batch_size, hp.Ty // hp.r,
                                            hp.n_mels * hp.r))
            self.prev_max_attentions_li = tf.placeholder(tf.int32,
                                                         shape=(
                                                             hp.dec_layers,
                                                             hp.batch_size,
                                                         ))

        # Get decoder inputs: feed last frames only (N, Ty, n_mels)
        # (teacher forcing: mels shifted right by one frame)
        self.decoder_input = tf.concat((tf.zeros_like(
            self.y1[:, :1, -hp.n_mels:]), self.y1[:, :-1, -hp.n_mels:]), 1)

        # Networks
        with tf.variable_scope("encoder"):
            self.keys, self.vals = encoder(self.x,
                                           training=training)  # (N, Tx, e)

        with tf.variable_scope("decoder"):
            # mel_logits: (N, Ty, n_mels)
            # done_output: (N, Ty, 2),
            # decoder_output: (N, Ty, e)
            # alignments_li: dec_layers*(Tx, Ty)
            # max_attentions_li: dec_layers*(N, T_y)
            self.mel_logits, self.done_output, self.decoder_output, self.alignments_li, self.max_attentions_li = decoder(
                self.decoder_input,
                self.keys,
                self.vals,
                self.prev_max_attentions_li,
                training=training)
            self.mel_output = tf.nn.sigmoid(self.mel_logits)

        with tf.variable_scope("converter"):
            # Restore shape
            self.converter_input = tf.reshape(self.decoder_output,
                                              (-1, hp.Ty, hp.embed_size))
            self.converter_input = fc_block(
                self.converter_input,
                hp.converter_channels,
                activation_fn=tf.nn.relu,
                training=training)  # (N, Ty, v)

            # Converter
            self.mag_logits = converter(
                self.converter_input,
                training=training)  # (N, Ty, 1+n_fft//2)
            self.mag_output = tf.nn.sigmoid(self.mag_logits)

        self.global_step = tf.Variable(0,
                                       name='global_step',
                                       trainable=False)

        if training:
            # Loss: L1 on mels and mags + cross-entropy on done flags.
            self.loss_mels = tf.reduce_mean(
                tf.abs(self.mel_output - self.y1))
            self.loss_dones = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.done_output, labels=self.y2))
            self.loss_mags = tf.reduce_mean(
                tf.abs(self.mag_output - self.z))
            self.loss = self.loss_mels + self.loss_dones + self.loss_mags

            # Training Scheme
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)

            ## gradient clipping: by value then by norm, per gradient.
            self.gvs = self.optimizer.compute_gradients(self.loss)
            self.clipped = []
            for grad, var in self.gvs:
                grad = tf.clip_by_value(grad, -1. * hp.max_grad_val,
                                        hp.max_grad_val)
                grad = tf.clip_by_norm(grad, hp.max_grad_norm)
                self.clipped.append((grad, var))
            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

            # Summary
            tf.summary.scalar('Train_Loss/LOSS', self.loss)
            tf.summary.scalar('Train_Loss/mels', self.loss_mels)
            tf.summary.scalar('Train_Loss/dones', self.loss_dones)
            tf.summary.scalar('Train_Loss/mags', self.loss_mags)
            self.merged = tf.summary.merge_all()
def __init__(self, config=None, training=True):
    """Build the train/eval graph: text encoder -> attention decoder -> converter.

    Args:
        config: Passed through to ``get_batch`` when training.
        training: Boolean. True builds the queue-fed training graph plus
            loss/optimizer/summary ops; False builds single-example
            (batch size 1) placeholders for evaluation.
    """
    # Load vocabulary
    self.char2idx, self.idx2char = load_vocab()

    self.graph = tf.Graph()
    with self.graph.as_default():
        # Data Feeding
        ## x: Text. (N, T_x), int32
        ## y1: Reduced melspectrogram. (N, T_y//r, n_mels*r) float32
        ## y2: Reduced dones. (N, T_y//r,) int32
        ## z: Magnitude. (N, T_y, n_fft//2+1) float32
        if training:
            self.origx, self.x, self.y1, self.y2, self.y3, self.num_batch = get_batch(config)
            #self.origx, self.x, self.y1, self.y3, self.num_batch = get_batch(config)
            # One starting attention position per decoder layer, per example.
            self.prev_max_attentions_li = tf.ones(
                shape=(hp.dec_layers, hp.batch_size), dtype=tf.int32)
        else:  # Evaluation
            self.x = tf.placeholder(tf.int32, shape=(1, hp.T_x))
            self.y1 = tf.placeholder(
                tf.float32, shape=(1, hp.T_y // hp.r, hp.n_mels * hp.r))
            self.prev_max_attentions_li = tf.placeholder(
                tf.int32, shape=(hp.dec_layers, 1,))

        # Get decoder inputs: feed last frames only (N, Ty//r, n_mels).
        # Mels shifted right one step so frame t is predicted from <t.
        self.decoder_input = tf.concat(
            (tf.zeros_like(self.y1[:, :1, -hp.n_mels:]),
             self.y1[:, :-1, -hp.n_mels:]), 1)

        # Networks
        with tf.variable_scope("encoder"):
            self.keys, self.vals = encoder(self.x, training=training)  # (N, Tx, e)

        with tf.variable_scope("decoder"):
            #self.mel_logits, self.decoder_output, self.alignments_li, self.max_attentions_li \
            self.mel_logits, self.done_output, self.decoder_output, self.alignments_li, self.max_attentions_li \
                = decoder(self.decoder_input, self.keys, self.vals,
                          self.prev_max_attentions_li, training=training)
            self.mel_output = tf.nn.sigmoid(self.mel_logits)

        with tf.variable_scope("converter"):
            # Restore shape: undo the reduction factor r before the converter.
            self.converter_input = tf.reshape(
                self.decoder_output, (-1, hp.T_y, hp.embed_size // hp.r))
            self.converter_input = fc_block(
                self.converter_input, hp.converter_channels,
                activation_fn=tf.nn.relu, training=training)  # (N, Ty, v)

            # Converter
            #self.mag_logits = converter(self.converter_input, training=training)
            # self.converter_input = tf.reshape(self.mel_output, (-1, hp.T_y, hp.n_mels))
            self.mag_logits = converter(self.converter_input, training=training)
            self.mag_output = tf.nn.sigmoid(self.mag_logits)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        if training:
            # Loss: L1 on mels (loss1) and mags (loss3), cross-entropy on
            # the done flags (loss2).
            self.loss1 = tf.reduce_mean(tf.abs(self.mel_output - self.y1))
            self.loss2 = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.done_output, labels=self.y2))
            self.loss3 = tf.reduce_mean(tf.abs(self.mag_output - self.y3))
            self.loss = self.loss1 + self.loss2 + self.loss3
            #self.loss = self.loss1 + self.loss3

            # Training Scheme
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)

            ## gradient clipping — grad is None for variables outside the
            ## loss, so the clip ops are skipped for those pairs.
            self.gvs = self.optimizer.compute_gradients(self.loss)
            self.clipped = []
            for grad, var in self.gvs:
                grad = grad if grad is None else tf.clip_by_value(
                    grad, -1. * hp.max_grad_val, hp.max_grad_val)
                grad = grad if grad is None else tf.clip_by_norm(
                    grad, hp.max_grad_norm)
                self.clipped.append((grad, var))
            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

            # Summary
            tf.summary.histogram('mel_output', self.mel_output)
            tf.summary.histogram('mel_actual', self.y1)
            tf.summary.histogram('done_output', self.done_output)
            tf.summary.histogram('done_actual', self.y2)
            tf.summary.histogram('mag_output', self.mag_output)
            tf.summary.histogram('mag_actual', self.y3)
            tf.summary.scalar('loss', self.loss)
            tf.summary.scalar('loss1', self.loss1)
            tf.summary.scalar('loss2', self.loss2)
            tf.summary.scalar('loss3', self.loss3)
            self.merged = tf.summary.merge_all()
def __init__(self, mode="train"): training = True if mode == "train" else False self.x = tf.placeholder(tf.int32, shape=[hp.batch_size, hp.T, 1]) self.y = self.x self.encoder_inputs = tf.one_hot(tf.squeeze(self.x, axis=-1), hp.Q, dtype=tf.float32) self.speaker_id = tf.placeholder(tf.int32, shape=[ hp.batch_size, ]) self.speakers = tf.one_hot(self.speaker_id, len(hp.speakers), dtype=tf.float32) # encoder self.z_e = encoder(self.encoder_inputs) # (B, T', D) # vq self.z_q = vq(self.z_e) # (B, T', D) # decoder: y -> reconstructed logits. self.y_logits = decoder(self.encoder_inputs, self.speakers, self.z_q) # (B, T-receptivefield+1, Q) # monitor # self.sample0 = tf.py_func(mu_law_decode, [self.y_hat[0]], tf.float32) # self.sample1 = tf.py_func(mu_law_decode, [self.y_hat[1]], tf.float32) # speech samples # tf.summary.audio('{}/original1'.format(mode), self.wavs[:1], hp.sr, 1) # tf.summary.audio('{}/original2'.format(mode), self.wavs[1:], hp.sr, 1) # tf.summary.audio('{}/sample0'.format(mode), tf.expand_dims(self.sample0, 0), hp.sr, 1) # tf.summary.audio('{}/sample1'.format(mode), tf.expand_dims(self.sample1, 0), hp.sr, 1) self.global_step = tf.Variable(0, name='global_step', trainable=False) if training: self.y = tf.slice(self.y, [0, hp.dilations[-1] * hp.size - 1, 0], [-1, -1, -1]) self.y = tf.squeeze(self.y, axis=2) self.dec_loss = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits( logits=self.y_logits, labels=self.y)) self.vq_loss = tf.reduce_mean( tf.squared_difference(tf.stop_gradient(self.z_e), self.z_q)) self.enc_loss = hp.beta * tf.reduce_mean( tf.squared_difference(self.z_e, tf.stop_gradient(self.z_q))) # decoder grads decoder_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "decoder") decoder_grads = tf.gradients(self.dec_loss, decoder_vars) decoder_grads_vars = list(zip(decoder_grads, decoder_vars)) # embedding variables grads embed_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "vq") embed_grads = tf.gradients(self.vq_loss, 
embed_vars) embed_grads_vars = list(zip(embed_grads, embed_vars)) # encoder grads encoder_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoder") transferred_grads = tf.gradients(self.dec_loss, self.z_q) encoder_grads = [ tf.gradients(self.z_e, var, transferred_grads)[0] + tf.gradients(self.enc_loss, var)[0] for var in encoder_vars ] encoder_grads_vars = list(zip(encoder_grads, encoder_vars)) # total grads grads_vars = decoder_grads_vars + embed_grads_vars + encoder_grads_vars # Training Scheme optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr) # Summary tf.summary.scalar('train/dec_loss', self.dec_loss) tf.summary.scalar('train/vq_loss', self.vq_loss) tf.summary.scalar('train/enc_loss', self.enc_loss) # tf.summary.scalar("lr", self.lr) # gradient clipping for grad, var in grads_vars: if grad is not None: self.clipped = [(tf.clip_by_value(grad, -1., 1.), var)] with tf.control_dependencies( tf.get_collection(tf.GraphKeys.UPDATE_OPS)): self.train_op = optimizer.apply_gradients( self.clipped, global_step=self.global_step) # Summary self.merged = tf.summary.merge_all() self.train_writer = tf.summary.FileWriter(hp.logdir + '/train') if training == False: with tf.variable_scope('decoder'): self.z_q = transposed_conv(self.z_q)
def __init__(self, config=None, training=True, train_form='Both'):
    """Build the graph; ``train_form`` selects which sub-networks exist.

    Args:
        config: Passed through to ``get_batch`` when training.
        training: True builds the queue-fed graph plus loss/optimizer ops;
            False builds single-example placeholders for evaluation.
        train_form: 'Both' (encoder+decoder+converter), 'Encoder'
            (no converter), or 'Converter' (converter only, fed with the
            ground-truth mels y1 instead of decoder output).
    """
    # Load vocabulary
    self.char2idx, self.idx2char = load_vocab()

    self.graph = tf.Graph()
    with self.graph.as_default():
        if training:
            self.origx, self.x, self.y1, self.y2, self.y3, self.num_batch = get_batch(
                config, train_form)
            # NOTE(review): the sibling builders size this dimension with
            # hp.batch_size; using self.num_batch (number of batches) here
            # looks suspicious — confirm against decoder()'s expectation.
            self.prev_max_attentions_li = tf.ones(
                shape=(hp.dec_layers, self.num_batch), dtype=tf.int32)
        else:  # Evaluation
            self.x = tf.placeholder(tf.int32, shape=(1, hp.T_x))
            self.y1 = tf.placeholder(
                tf.float32, shape=(1, hp.T_y // hp.r, hp.n_mels * hp.r))
            self.prev_max_attentions_li = tf.placeholder(
                tf.int32, shape=(hp.dec_layers, 1,))

        # Get decoder inputs: feed last frames only (mels shifted right by
        # one step). Not needed when only the converter is trained.
        if train_form != 'Converter':
            self.decoder_input = tf.concat(
                (tf.zeros_like(self.y1[:, :1, -hp.n_mels:]),
                 self.y1[:, :-1, -hp.n_mels:]), 1)

        # Networks
        if train_form != 'Converter':
            with tf.variable_scope("encoder"):
                self.encoded = encoder(self.x, training=training)
            with tf.variable_scope("decoder"):
                self.mel_logits, self.done_output, self.max_attentions_li = decoder(
                    self.decoder_input, self.encoded,
                    self.prev_max_attentions_li, training=training)
                #self.mel_output = self.mel_logits
                self.mel_output = tf.nn.sigmoid(self.mel_logits)

        if train_form == 'Both':
            with tf.variable_scope("converter"):
                # Converter input is the decoder's predicted mels.
                #self.converter_input = tf.reshape(self.mel_output, (-1, hp.T_y, hp.n_mels))
                self.converter_input = self.mel_output
                self.mag_logits = converter(self.converter_input,
                                            training=training)
                self.mag_output = tf.nn.sigmoid(self.mag_logits)
        elif train_form == 'Converter':
            with tf.variable_scope("converter"):
                # Converter-only training: feed ground-truth mels instead.
                #self.converter_input = tf.reshape(self.mel_output, (-1, hp.T_y, hp.n_mels))
                self.converter_input = self.y1
                self.mag_logits = converter(self.converter_input,
                                            training=training)
                self.mag_output = tf.nn.sigmoid(self.mag_logits)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        if training:
            # Loss: loss1 = mel L1, loss2 = done cross-entropy (optional),
            # loss3 = magnitude L1; combined according to train_form.
            if train_form != 'Converter':
                self.loss1 = tf.reduce_mean(tf.abs(self.mel_output - self.y1))
                if hp.include_dones:
                    self.loss2 = tf.reduce_mean(
                        tf.nn.sparse_softmax_cross_entropy_with_logits(
                            logits=self.done_output, labels=self.y2))
            if train_form != 'Encoder':
                self.loss3 = tf.reduce_mean(tf.abs(self.mag_output - self.y3))
            if train_form == 'Both':
                if hp.include_dones:
                    self.loss = self.loss1 + self.loss2 + self.loss3
                else:
                    self.loss = self.loss1 + self.loss3
            elif train_form == 'Encoder':
                if hp.include_dones:
                    self.loss = self.loss1 + self.loss2
                else:
                    self.loss = self.loss1
            else:
                self.loss = self.loss3

            # Training Scheme
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)

            ## gradient clipping — grad is None for variables outside the
            ## loss (e.g. converter vars under train_form='Encoder'), so the
            ## clip ops are skipped for those pairs.
            self.gvs = self.optimizer.compute_gradients(self.loss)
            self.clipped = []
            for grad, var in self.gvs:
                grad = grad if grad is None else tf.clip_by_value(
                    grad, -1. * hp.max_grad_val, hp.max_grad_val)
                grad = grad if grad is None else tf.clip_by_norm(
                    grad, hp.max_grad_norm)
                self.clipped.append((grad, var))
            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

            # Summary
            tf.summary.scalar('loss', self.loss)
            if train_form != 'Converter':
                tf.summary.histogram('mel_output', self.mel_output)
                tf.summary.histogram('mel_actual', self.y1)
                tf.summary.scalar('loss1', self.loss1)
                if hp.include_dones:
                    tf.summary.histogram('done_output', self.done_output)
                    tf.summary.histogram('done_actual', self.y2)
                    tf.summary.scalar('loss2', self.loss2)
            if train_form != 'Encoder':
                tf.summary.histogram('mag_output', self.mag_output)
                tf.summary.histogram('mag_actual', self.y3)
                tf.summary.scalar('loss3', self.loss3)
            self.merged = tf.summary.merge_all()
def __init__(self, training=True):
    """Build the reduced-frame (r-factor) graph: encoder -> decoder -> converter.

    Args:
        training: Boolean. If True, reads batches from the input pipeline and
            builds loss/optimizer/summary ops; otherwise creates placeholders
            for inference.
    """
    # Load vocabulary
    self.char2idx, self.idx2char = load_vocab()

    # Graph
    self.graph = tf.Graph()
    with self.graph.as_default():
        # Data Feeding
        ## x: Text. (N, Tx), int32
        ## y1: Reduced melspectrogram. (N, Ty//r, n_mels*r) float32
        ## y2: Reduced dones. (N, Ty//r,) int32
        ## z: Magnitude. (N, Ty, n_fft//2+1) float32
        if training:
            self.x, self.y1, self.y2, self.z, self.num_batch = get_batch()
            self.prev_max_attentions_li = tf.ones(
                shape=(hp.dec_layers, hp.batch_size), dtype=tf.int32)
        else:  # Inference
            self.x = tf.placeholder(tf.int32, shape=(hp.batch_size, hp.Tx))
            self.y1 = tf.placeholder(
                tf.float32,
                shape=(hp.batch_size, hp.Ty // hp.r, hp.n_mels * hp.r))
            self.prev_max_attentions_li = tf.placeholder(
                tf.int32, shape=(hp.dec_layers, hp.batch_size,))

        # Get decoder inputs: feed last frames only (N, Ty//r, n_mels).
        # Shift mels right by one step so frame t is predicted from <t.
        self.decoder_input = tf.concat(
            (tf.zeros_like(self.y1[:, :1, -hp.n_mels:]),
             self.y1[:, :-1, -hp.n_mels:]), 1)

        # Networks
        with tf.variable_scope("encoder"):
            self.keys, self.vals = encoder(self.x, training=training)  # (N, Tx, e)

        with tf.variable_scope("decoder"):
            # mel_logits: (N, Ty/r, n_mels*r)
            # done_output: (N, Ty/r, 2),
            # decoder_output: (N, Ty/r, e)
            # alignments_li: dec_layers*(Tx, Ty/r)
            # max_attentions_li: dec_layers*(N, T_y/r)
            self.mel_logits, self.done_output, self.decoder_output, \
                self.alignments_li, self.max_attentions_li = decoder(
                    self.decoder_input,
                    self.keys,
                    self.vals,
                    self.prev_max_attentions_li,
                    training=training)
            self.mel_output = tf.nn.sigmoid(self.mel_logits)

        with tf.variable_scope("converter"):
            # Restore shape: undo the reduction factor r before the converter.
            self.converter_input = tf.reshape(
                self.decoder_output, (-1, hp.Ty, hp.embed_size // hp.r))
            self.converter_input = fc_block(
                self.converter_input,
                hp.converter_channels,
                activation_fn=tf.nn.relu,
                training=training)  # (N, Ty, v)

            # Converter
            self.mag_logits = converter(
                self.converter_input, training=training)  # (N, Ty, 1+n_fft//2)
            self.mag_output = tf.nn.sigmoid(self.mag_logits)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        if training:
            # Loss: L1 on mels/mags, cross-entropy on done flags.
            self.loss_mels = tf.reduce_mean(tf.abs(self.mel_output - self.y1))
            self.loss_dones = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.done_output, labels=self.y2))
            self.loss_mags = tf.reduce_mean(tf.abs(self.mag_output - self.z))
            self.loss = self.loss_mels + self.loss_dones + self.loss_mags

            # Training Scheme
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)

            ## gradient clipping
            self.gvs = self.optimizer.compute_gradients(self.loss)
            self.clipped = []
            for grad, var in self.gvs:
                # FIX: compute_gradients returns grad=None for variables that
                # do not feed the loss; clipping None raises. Guard it, as the
                # other model builders in this file already do.
                if grad is not None:
                    grad = tf.clip_by_value(grad, -1. * hp.max_grad_val,
                                            hp.max_grad_val)
                    grad = tf.clip_by_norm(grad, hp.max_grad_norm)
                self.clipped.append((grad, var))
            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

            # Summary
            tf.summary.scalar('Train_Loss/LOSS', self.loss)
            tf.summary.scalar('Train_Loss/mels', self.loss_mels)
            tf.summary.scalar('Train_Loss/dones', self.loss_dones)
            tf.summary.scalar('Train_Loss/mags', self.loss_mags)
            self.merged = tf.summary.merge_all()
def __init__(self, mode="train"): # Load vocabulary self.char2idx, self.idx2char = load_vocab() # Set phase is_training = True if mode == "train" else False # Graph # Data Feeding # x: Text. (N, Tx) # y: Reduced melspectrogram. (N, Ty//r, n_mels*r) # z: Magnitude. (N, Ty, n_fft//2+1) if mode == "train": self.x, self.y, self.z, self.fnames, self.num_batch = get_batch() elif mode == "eval": self.x = tf.placeholder(tf.int32, shape=(None, None)) self.y = tf.placeholder(tf.float32, shape=(None, None, hp.n_mels * hp.r)) self.z = tf.placeholder(tf.float32, shape=(None, None, 1 + hp.n_fft // 2)) self.fnames = tf.placeholder(tf.string, shape=(None, )) else: # Synthesize self.x = tf.placeholder(tf.int32, shape=(None, None)) self.y = tf.placeholder(tf.float32, shape=(None, None, hp.n_mels * hp.r)) # Get encoder/decoder inputs self.encoder_inputs = embed(self.x, len(hp.vocab), hp.embed_size) # (N, T_x, E) self.decoder_inputs = tf.concat( (tf.zeros_like(self.y[:, :1, :]), self.y[:, :-1, :]), 1) # (N, Ty/r, n_mels*r) self.decoder_inputs = self.decoder_inputs[:, :, -hp. 
n_mels:] # feed last frames only (N, Ty/r, n_mels) # Networks with tf.variable_scope("net"): # Encoder self.memory = encoder(self.encoder_inputs, is_training=is_training) # (N, T_x, E) # Decoder1 self.y_hat, self.alignments = decoder1( self.decoder_inputs, self.memory, is_training=is_training) # (N, T_y//r, n_mels*r) # Decoder2 or postprocessing self.z_hat = decoder2( self.y_hat, is_training=is_training) # (N, T_y//r, (1+n_fft//2)*r) # monitor self.audio = tf.py_func(spectrogram2wav, [self.z_hat[0]], tf.float32) if mode in ("train", "eval"): # Loss self.loss1 = tf.reduce_mean(tf.abs(self.y_hat - self.y)) self.loss2 = tf.reduce_mean(tf.abs(self.z_hat - self.z)) self.loss = self.loss1 + self.loss2 # Training Scheme self.global_step = tf.Variable(0, name='global_step', trainable=False) self.lr = learning_rate_decay(hp.lr, global_step=self.global_step) self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr) ## gradient clipping self.gvs = self.optimizer.compute_gradients(self.loss) self.clipped = [] for grad, var in self.gvs: grad = tf.clip_by_norm(grad, 5.) self.clipped.append((grad, var)) self.train_op = self.optimizer.apply_gradients( self.clipped, global_step=self.global_step) # Summary tf.summary.scalar('{}/loss1'.format(mode), self.loss1) tf.summary.scalar('{}/loss'.format(mode), self.loss) tf.summary.scalar('{}/lr'.format(mode), self.lr) tf.summary.image("{}/mel_gt".format(mode), tf.expand_dims(self.y, -1), max_outputs=1) tf.summary.image("{}/mel_hat".format(mode), tf.expand_dims(self.y_hat, -1), max_outputs=1) tf.summary.image("{}/mag_gt".format(mode), tf.expand_dims(self.z, -1), max_outputs=1) tf.summary.image("{}/mag_hat".format(mode), tf.expand_dims(self.z_hat, -1), max_outputs=1) tf.summary.audio("{}/sample".format(mode), tf.expand_dims(self.audio, 0), hp.sr) self.merged = tf.summary.merge_all()
features_quickdraw_dict = feature_gen( model=model_sketchy, loader=loader_quick_draw, dump_location=params.path_quickdraw_features) else: features_quickdraw_dict = pickle_load(params.path_quickdraw_features) print('quickdraw features file found. Loading completed') ## Generate a file containing glove vector corresponding to the classes being used. if (os.path.isfile(params.path_glove_vector) == False): generate_glove_vector() ## Load the z_encoder for both image and sketch. If saved model are not found, train and saved the model. if (not os.path.isfile(params.path_z_encoder_image)): z_encoder_image = encoder(in_dim=params.x_dim, z_dim=params.glove_dim) cuda(z_encoder_image) z_encoder_image = train_z_encoder( encoder_model=z_encoder_image, feature_dict=features_image_dict, dump_location=params.path_z_encoder_image) else: z_encoder_image = torch.load(params.path_z_encoder_image) cuda(z_encoder_image) if (not os.path.isfile(params.path_z_encoder_sketchy)): z_encoder_sketchy = encoder(in_dim=params.x_dim, z_dim=params.glove_dim) cuda(z_encoder_sketchy) z_encoder_sketchy = train_z_encoder(