def initialize_networks(args, device):
    """Create both encoder/decoder/discriminator triples and move them to `device`.

    Args:
        args: Object exposing `in_ngc`, `out_ngc`, `ngf`, `in_ndc`, `out_ndc`,
            `ndf` and `img_size`, which are forwarded to the constructors.
        device: Target handed to every network's `.to()` call.

    Returns:
        list: `[En_A, En_B, De_A, De_B, Disc_A, Disc_B]`.
    """
    encoder_kwargs = dict(in_nc=args.in_ngc, nf=args.ngf, img_size=args.img_size)
    decoder_kwargs = dict(out_nc=args.out_ngc, nf=args.ngf)
    disc_kwargs = dict(in_nc=args.in_ndc, out_nc=args.out_ndc, nf=args.ndf,
                       img_size=args.img_size)

    # Construction order is kept as A/B encoders, A/B decoders, A/B discriminators.
    En_A = networks.encoder(**encoder_kwargs).to(device)
    En_B = networks.encoder(**encoder_kwargs).to(device)
    De_A = networks.decoder(**decoder_kwargs).to(device)
    De_B = networks.decoder(**decoder_kwargs).to(device)
    Disc_A = networks.discriminator(**disc_kwargs).to(device)
    Disc_B = networks.discriminator(**disc_kwargs).to(device)
    all_networks = [En_A, En_B, De_A, De_B, Disc_A, Disc_B]

    print('---------- Networks initialized -------------')
    for net in all_networks:
        utils.print_network(net)
    print('-----------------------------------------------')

    return all_networks
def convert_decoder_to_pb(X):
    """Freeze the decoder for feature level `X` and write it as a `.pb` file.

    The decoder is rebuilt in a fresh graph, its variables are restored from
    `CHECKPOINT[X]`, converted to constants, and the resulting graph def is
    serialized to `inference/decoder_<X>.pb`.

    Args:
        X: Key used to format the feature name (`Relu_<X>_1`) and the output
            path, and to index `NUM_FEATURES` and `CHECKPOINT`.
    """
    features_to_use = 'Relu_{}_1'.format(X)
    pb_file_path = 'inference/decoder_{}.pb'.format(X)

    session_config = tf.ConfigProto()
    session_config.gpu_options.visible_device_list = GPU_TO_USE

    graph = tf.Graph()
    with graph.as_default():
        features = tf.placeholder(
            dtype=tf.float32, name='features',
            shape=[None, None, None, NUM_FEATURES[X]])
        # The identity op gives the output a stable, known node name.
        tf.identity(decoder(features, features_to_use), 'restored_images')
        saver = tf.train.Saver()

    with tf.Session(graph=graph, config=session_config) as sess:
        saver.restore(sess, CHECKPOINT[X])

        # output ops
        output_names = ['restored_images']
        frozen_graph_def = tf.graph_util.convert_variables_to_constants(
            sess, graph.as_graph_def(), output_node_names=output_names)
        output_graph_def = tf.graph_util.remove_training_nodes(
            frozen_graph_def, protected_nodes=output_names)

        with tf.gfile.GFile(pb_file_path, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))
def train_autoencoder(X_dir, Y_dir, batch_size, dim, X_channels, Y_channels,
                      log_dir, shuffle, **kwargs):
    """Build and fit an image-to-label encoder/decoder Keras model.

    Args:
        X_dir: Passed to `load_dataset` as the first argument.
        Y_dir: Passed to `load_dataset` as the second argument.
        batch_size: Batch size handed to both data generators.
        dim: Iterable unpacked into the leading dimensions of the input shape.
        X_channels: Channel count of the model input.
        Y_channels: Number of output features of the decoder.
        log_dir: Directory given to both TensorBoard callbacks.
        shuffle: Shuffle flag handed to both data generators.
        **kwargs: Accepted but not used by this function.
    """
    # Dataset
    partition = partition_dataset(load_dataset(X_dir, Y_dir))

    # Generators
    generator_args = (batch_size, dim, X_channels, Y_channels, shuffle)
    training_generator = DataGenerator(partition['train'], *generator_args)
    validation_generator = DataGenerator(partition['validation'], *generator_args)

    # Design model
    input_img = Input(shape=(*dim, X_channels))
    encoder_img = encoder(n_features=8)
    decoder_lbl = decoder(n_output_features=Y_channels, n_features=8)
    # TODO Put res_net here for image to label translation
    # (for now the latent code is fed to the decoder unchanged).
    restored_lbl = decoder_lbl(encoder_img(input_img))

    img2lbl = Model(input_img, restored_lbl)
    img2lbl.compile(optimizer='adadelta', loss='mean_squared_error')

    # Print summary. The count printed below is len(trainable_weights).
    img2lbl.summary()
    n_trainable = len(img2lbl.trainable_weights)
    print('Model contains a total of %d trainable layers.\n' % n_trainable)

    # Train model
    tbi_callback = TensorBoardImage(log_dir=log_dir,
                                    validation_data=validation_generator)
    tb_callback = TensorBoard(log_dir=log_dir)
    img2lbl.fit_generator(
        generator=training_generator,
        validation_data=validation_generator,
        epochs=50,
        callbacks=[tb_callback, tbi_callback],
        use_multiprocessing=True,
        workers=2)
def generator(speaker_embedding, inputs, is_training=True, scope_name='generator', reuse=None):
    '''Generate features.

    Args:
      speaker_embedding: A `Tensor` with type `float32` contains speaker information.
        Either [N, E] (a time axis is inserted automatically) or [N, 1, E].
      inputs: A `Tensor` with type `float32` contains speech features.
      is_training: Boolean, whether to train or inference.
      scope_name: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A decoded `Tensor` with aim speaker.
      vae mu vector.
      vae log_var vector.
    '''
    with tf.variable_scope(scope_name, reuse=reuse):
        sample, mu, log_var = encoder(inputs, is_training=is_training, scope='vae_encoder')  # [N, T, E]

        # tf.tile needs one multiple per input dimension, so a rank-2
        # embedding (the documented [N, E] form) must get a time axis first.
        # Rank-3 input is passed through unchanged.
        speaker_embedding = tf.convert_to_tensor(speaker_embedding)
        if speaker_embedding.shape.ndims == 2:
            speaker_embedding = tf.expand_dims(speaker_embedding, axis=1)  # [N, 1, E]

        speaker_embedding = tf.tile(speaker_embedding, [1, tf.shape(sample)[1], 1])  # [N, T, E]
        encoded = tf.concat((speaker_embedding, sample), axis=-1)  # [N, T, E+G]
        outputs = decoder(encoded, is_training=is_training, scope='vae_decoder')
    return outputs, mu, log_var  # [N, T, C]
def sample_x_from_prior(self, noise):
    """Decode `noise` with the already-built decoder and return its first output.

    The decoder is called with `reuse=True` and `is_training=False`; its
    second return value is discarded.
    """
    decoded = decoder(self.opts,
                      input=noise,
                      output_dim=self.output_dim,
                      scope='decoder',
                      reuse=True,
                      is_training=False)
    sample_x, _ = decoded
    return sample_x
def generator(inputs, is_training=True, scope_name='generator', reuse=None):
    """Encode `inputs` with the VAE encoder and decode the sampled latent.

    Unlike a speaker-conditioned variant, no speaker embedding is tiled and
    concatenated onto the latent here; the sample goes straight to the decoder.

    Args:
      inputs: Tensor handed to `encoder`.
      is_training: Boolean forwarded to both `encoder` and `decoder`.
      scope_name: Name of the enclosing `variable_scope`.
      reuse: Forwarded to `tf.variable_scope`.

    Returns:
      Tuple `(outputs, mu, log_var)`.
    """
    with tf.variable_scope(scope_name, reuse=reuse):
        encoded = encoder(inputs, is_training=is_training, scope='vae_encoder')
        latent, mu, log_var = encoded  # latent: [N, T, E]
        # Disabled speaker conditioning (kept for reference):
        #   speaker_embedding = tf.tile(speaker_embedding, [1, tf.shape(sample)[1], 1])  # [N, T, E]
        #   (tf.tile() expands a tensor; the multiples say how many times each
        #   dimension is replicated)
        #   encoded = tf.concat((speaker_embedding, sample), axis=-1)  # [N, T, E+G]
        outputs = decoder(latent, is_training=is_training, scope='vae_decoder')
    return outputs, mu, log_var  # [N, T, C]
def forward_pass(self, inputs, is_training, reuse=False):
    """Run the encoder on `inputs`, then the decoder on the encoded sample.

    Returns:
        Tuple `(enc_z, enc_mean, enc_Sigma, dec_x, dec_mean)`.
    """
    shared_kwargs = dict(reuse=reuse, is_training=is_training)

    # The encoder is asked for 2 * zdim outputs.
    enc_z, enc_mean, enc_Sigma = encoder(
        self.opts,
        input=inputs,
        output_dim=2 * self.opts['zdim'],
        scope='encoder',
        **shared_kwargs)

    dec_x, dec_mean = decoder(
        self.opts,
        input=enc_z,
        output_dim=self.output_dim,
        scope='decoder',
        **shared_kwargs)

    return enc_z, enc_mean, enc_Sigma, dec_x, dec_mean
def forward_pass(self, inputs, is_training, reuse=False):
    """Encode, draw one latent per mixture component, and decode them all.

    inputs:                                 [batch,imgdim]
    return: enc_cat_logits:                 [batch,K]
            enc_z/enc_gauss_mean/enc_gauss_Sigma: [batch,K,zdim]
            dec_mean, dec_Sigma:            [batch,K,imgdim]
    """
    n_mixtures = self.opts['nmixtures']
    zdim = self.opts['zdim']
    per_mixture_shape = [-1, n_mixtures, zdim]

    # Encode
    enc_cat_logits, flat_mean, flat_Sigma = encoder(
        self.opts,
        input=inputs,
        cat_output_dim=n_mixtures,
        gaus_output_dim=2 * zdim,
        scope='encoder',
        reuse=reuse,
        is_training=is_training)
    enc_gauss_mean = tf.reshape(flat_mean, per_mixture_shape)
    enc_gauss_Sigma = tf.reshape(flat_Sigma, per_mixture_shape)
    enc_z = sample_all_gmm(self.opts, enc_gauss_mean,
                           enc_gauss_Sigma)  # [batch,nmixtures,zdim]
    # Fold the mixture axis into the batch axis so the decoder sees 2-D input.
    enc_z_flat = tf.reshape(enc_z, [-1, zdim])

    # Decode
    flat_dec_mean, flat_dec_Sigma = decoder(
        self.opts,
        input=enc_z_flat,
        output_dim=self.output_dim,
        scope='decoder',
        reuse=reuse,
        is_training=is_training)
    outshape = [-1, n_mixtures, np.prod(datashapes[self.opts['dataset']])]
    dec_mean = tf.reshape(flat_dec_mean, outshape)
    dec_Sigma = tf.reshape(flat_dec_Sigma, outshape)

    return (enc_cat_logits, enc_z, enc_gauss_mean, enc_gauss_Sigma,
            dec_mean, dec_Sigma)
def __init__(self, options):
    """Build the sub-networks, losses, optimizers and schedulers.

    Args:
        options: Configuration object. This method reads `options.lr` and
            `options.init` and forwards the whole object to the sub-network
            constructors and to `get_scheduler`.
    """
    super(ArtGAN, self).__init__()

    # build model
    self.encoder = encoder(options)
    self.decoder = decoder(options)
    self.discriminator = discriminator(options)
    self.discriminator_weight = {
        name: 1.
        for name in ("pred_1", "pred_2", "pred_4", "pred_6", "pred_7")
    }

    self.loss = nn.BCEWithLogitsLoss(reduction='mean')
    self.mse = nn.MSELoss(reduction='mean')
    self.abs = nn.L1Loss(reduction='mean')

    # Setup the optimizers: one for the discriminator, one for the
    # encoder+decoder pair, both with identical Adam settings.
    adam_kwargs = dict(lr=options.lr,
                       betas=(0.5, 0.999),
                       weight_decay=0.0001,
                       amsgrad=True)
    dis_params = list(self.discriminator.parameters())
    gen_params = list(self.encoder.parameters()) + list(
        self.decoder.parameters())
    trainable_dis = [p for p in dis_params if p.requires_grad]
    trainable_gen = [p for p in gen_params if p.requires_grad]
    self.dis_opt = torch.optim.Adam(trainable_dis, **adam_kwargs)
    self.gen_opt = torch.optim.Adam(trainable_gen, **adam_kwargs)
    self.dis_scheduler = get_scheduler(self.dis_opt, options)
    self.gen_scheduler = get_scheduler(self.gen_opt, options)

    # Network weight initialization: everything uses options.init first,
    # then the discriminator is re-initialized with 'gaussian'.
    self.apply(weights_init(options.init))
    self.discriminator.apply(weights_init('gaussian'))

    self.gener_loss = torch.tensor(0.)
    self.discr_loss = torch.tensor(0.)
def fnet(self, mel, is_training=True, reuse=None):
    """Map mel features to PPG logits and a CTC beam-search decoding.

    Args:
        mel: Mel feature tensor fed to the prenet; also used to derive the
            per-example sequence length (frames whose feature sum is non-zero).
        is_training: Forwarded to the sub-networks; also sets `trainable`
            on the final dense layer.
        reuse: Forwarded to the prenet, the CBHG and the dense layer.

    Returns:
        Tuple `(mid, logits, ppgs, preds, decoded)`.
    """
    half_units = hp.hidden_units // 2

    prenet_out = prenet(mel,
                        num_units=[hp.hidden_units, half_units],
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        reuse=reuse)  # (N, T, E/2)

    # CBHG1: mel-scale
    out, _ = cbhg(prenet_out,
                  hp.num_banks,
                  half_units,
                  hp.num_highway_blocks,
                  hp.norm_type,
                  is_training,
                  scope="fnet_cbhg",
                  reuse=reuse)

    # Only the first of the six decoder outputs is used.
    out, _, _, _, _, _ = networks.decoder(self.x_mel, out, training=is_training)
    mid = out

    # Final linear projection
    logits = tf.layers.dense(out,
                             hp.len_chinese_ppgs,
                             trainable=is_training,
                             reuse=reuse)  # (N, T, V)
    ppgs = tf.nn.softmax(logits / hp.t, name='ppgs')  # (N, T, V)
    preds = tf.to_int32(tf.argmax(logits, axis=-1))  # (N, T)

    # The CTC decoder receives the logits with the first two axes swapped.
    time_major_logits = tf.transpose(logits, perm=[1, 0, 2])
    frame_is_used = tf.not_equal(tf.reduce_sum(mel, reduction_indices=2), 0.)
    sequence_len = tf.reduce_sum(tf.cast(frame_is_used, tf.int32),
                                 reduction_indices=1)
    beams, _ = tf.nn.ctc_beam_search_decoder(time_major_logits,
                                             sequence_len,
                                             merge_repeated=False)
    best_beam = beams[0]
    decoded = tf.sparse_to_dense(best_beam.indices,
                                 best_beam.dense_shape,
                                 best_beam.values)

    return mid, logits, ppgs, preds, decoded
def sample_x_from_prior(self, noise):
    """Decode prior noise and return the decoder's mean parameters as samples.

    The sample is taken to be the mean parameter of the decoder. For WAE this
    corresponds to a deterministic decoder; for VAE there is a discrepancy
    between the decoder and the samples, since the mean parameter is treated
    as the sample from the model.

    noise:    [batch,K,zdim]
    return:   sample_x: [batch,K,imgdim]
    """
    decoder_mean, _ = decoder(self.opts,
                              input=noise,
                              output_dim=self.output_dim,
                              scope='decoder',
                              reuse=True,
                              is_training=False)
    # Reshape to [-1] + the dataset's configured data shape.
    data_shape = datashapes[self.opts['dataset']]
    return tf.reshape(decoder_mean, [-1] + data_shape)
def __init__(self, config=None, training=True):
    """Build the encoder / decoder / converter graph in a private `tf.Graph`.

    Data feeding (from the original notes):
      x:  Text. (N, T_x), int32
      y1: Reduced melspectrogram. (N, T_y//r, n_mels*r) float32
      y2: Reduced dones. (N, T_y//r,) int32
      y3: Magnitude. (N, T_y, n_fft//2+1) float32

    Args:
        config: Forwarded to `get_batch` when `training` is true.
        training: If true, inputs come from `get_batch` and the losses,
            train op and summaries are created; otherwise batch-size-1
            placeholders are created for `x`, `y1` and the previous
            max-attention positions.
    """
    # Load vocabulary
    self.char2idx, self.idx2char = load_vocab()

    self.graph = tf.Graph()
    with self.graph.as_default():
        if training:
            (self.origx, self.x, self.y1, self.y2, self.y3,
             self.num_batch) = get_batch(config)
            self.prev_max_attentions_li = tf.ones(
                shape=(hp.dec_layers, hp.batch_size), dtype=tf.int32)
        else:  # Evaluation
            self.x = tf.placeholder(tf.int32, shape=(1, hp.T_x))
            self.y1 = tf.placeholder(
                tf.float32, shape=(1, hp.T_y // hp.r, hp.n_mels * hp.r))
            self.prev_max_attentions_li = tf.placeholder(
                tf.int32, shape=(hp.dec_layers, 1,))

        # Get decoder inputs: feed last frames only (N, Ty//r, n_mels),
        # shifted right by one step with a zero frame in front.
        go_frame = tf.zeros_like(self.y1[:, :1, -hp.n_mels:])
        shifted_frames = self.y1[:, :-1, -hp.n_mels:]
        self.decoder_input = tf.concat((go_frame, shifted_frames), 1)

        # Networks
        with tf.variable_scope("encoder"):
            self.keys, self.vals = encoder(self.x, training=training)  # (N, Tx, e)

        with tf.variable_scope("decoder"):
            (self.mel_logits, self.done_output, self.decoder_output,
             self.alignments_li, self.max_attentions_li) = decoder(
                 self.decoder_input,
                 self.keys,
                 self.vals,
                 self.prev_max_attentions_li,
                 training=training)
            self.mel_output = tf.nn.sigmoid(self.mel_logits)

        with tf.variable_scope("converter"):
            # Restore shape
            self.converter_input = tf.reshape(
                self.decoder_output, (-1, hp.T_y, hp.embed_size // hp.r))
            self.converter_input = fc_block(
                self.converter_input,
                hp.converter_channels,
                activation_fn=tf.nn.relu,
                training=training)  # (N, Ty, v)

            # Converter
            self.mag_logits = converter(self.converter_input, training=training)
            self.mag_output = tf.nn.sigmoid(self.mag_logits)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        if training:
            # Loss: L1 on mels, cross-entropy on dones, L1 on magnitudes.
            self.loss1 = tf.reduce_mean(tf.abs(self.mel_output - self.y1))
            done_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.done_output, labels=self.y2)
            self.loss2 = tf.reduce_mean(done_xent)
            self.loss3 = tf.reduce_mean(tf.abs(self.mag_output - self.y3))
            self.loss = self.loss1 + self.loss2 + self.loss3

            # Training Scheme
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)

            def _clip(grad):
                # None gradients are passed through untouched.
                if grad is None:
                    return grad
                grad = tf.clip_by_value(grad, -1. * hp.max_grad_val,
                                        hp.max_grad_val)
                return tf.clip_by_norm(grad, hp.max_grad_norm)

            ## gradient clipping
            self.gvs = self.optimizer.compute_gradients(self.loss)
            self.clipped = [(_clip(grad), var) for grad, var in self.gvs]
            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

            # Summary
            for tag, tensor in (('mel_output', self.mel_output),
                                ('mel_actual', self.y1),
                                ('done_output', self.done_output),
                                ('done_actual', self.y2),
                                ('mag_output', self.mag_output),
                                ('mag_actual', self.y3)):
                tf.summary.histogram(tag, tensor)
            for tag, tensor in (('loss', self.loss),
                                ('loss1', self.loss1),
                                ('loss2', self.loss2),
                                ('loss3', self.loss3)):
                tf.summary.scalar(tag, tensor)

            self.merged = tf.summary.merge_all()
def build_enc_dec_connection(observation, constants):
    """Wire the encoder to the decoder through a reparameterized sample.

    Args:
        observation: Input handed to `encoder`.
        constants: Second argument handed to `encoder`.

    Returns:
        Tuple `((mean, logstd), dec_out)`.
    """
    mean, logstd = encoder(observation, constants)
    # z = mean + exp(logstd) * eps, with eps drawn from a standard normal.
    noise = tf.random_normal(tf.shape(mean))
    latent = mean + tf.exp(logstd) * noise
    return (mean, logstd), decoder(latent)
def __init__(self, mode="train"):
    '''Build the encoder -> vq -> decoder graph.

    Data feeding (from the original notes):
      x: Quantized wav. (B, T, 1) int32
      wavs: Raw wav. (B, length) float32
      speakers: Speaker ids. (B,). [0, 108]. int32.

    Args:
      mode: Either "train" or "eval". Any value other than "train" creates
        placeholders with a fixed batch size of 2 and skips the losses and
        the train op.

    NOTE(review): block nesting was reconstructed from whitespace-collapsed
    source; `self.merged` is assumed to be built in every mode. Confirm.
    '''
    # Set flag
    training = (mode == "train")

    # Graph inputs
    if training:
        self.x, self.wavs, self.speaker_ids, self.num_batch = get_batch()
        self.y = self.x
    else:  # test
        self.x = tf.placeholder(tf.int32, shape=(2, 63488, 1))
        self.y = tf.placeholder(tf.int32, shape=(2, 63488, 1))
        self.speaker_ids = tf.placeholder(tf.int32, shape=(2,))

    # inputs: the decoder sees y shifted right by one step, zero-padded in front.
    self.encoder_inputs = tf.to_float(self.x)
    self.decoder_inputs = tf.to_float(self.y)
    self.decoder_inputs = tf.concat(
        (tf.zeros_like(self.decoder_inputs[:, :1, :]),
         self.decoder_inputs[:, :-1, :]), 1)

    # speaker embedding
    self.speakers = tf.one_hot(self.speaker_ids, len(hp.speakers))  # (B, len(speakers))

    # encoder
    self.z_e = encoder(self.encoder_inputs)  # (B, T', D)

    # vq
    self.z_q = vq(self.z_e)  # (B, T', D)

    # decoder: y -> reconstructed logits.
    self.y_logits = decoder(self.decoder_inputs, self.speakers, self.z_q)  # (B, T, Q)
    self.y_hat = tf.argmax(self.y_logits, -1)  # (B, T)

    # monitor: the first two predictions are passed through mu_law_decode.
    self.sample0 = tf.py_func(mu_law_decode, [self.y_hat[0]], tf.float32)
    self.sample1 = tf.py_func(mu_law_decode, [self.y_hat[1]], tf.float32)

    # speech samples
    tf.summary.audio('{}/sample0'.format(mode),
                     tf.expand_dims(self.sample0, 0), hp.sr, 1)
    tf.summary.audio('{}/sample1'.format(mode),
                     tf.expand_dims(self.sample1, 0), hp.sr, 1)

    self.global_step = tf.Variable(0, name='global_step', trainable=False)

    if training:
        token_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.y_logits, labels=tf.squeeze(self.y))
        self.dec_loss = tf.reduce_mean(token_xent)
        self.vq_loss = tf.reduce_mean(
            tf.squared_difference(tf.stop_gradient(self.z_e), self.z_q))
        self.enc_loss = hp.beta * tf.reduce_mean(
            tf.squared_difference(self.z_e, tf.stop_gradient(self.z_q)))

        # decoder grads
        decoder_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "decoder")
        decoder_grads = tf.gradients(self.dec_loss, decoder_vars)
        decoder_grads_vars = list(zip(decoder_grads, decoder_vars))

        # embedding variables grads
        embed_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "vq")
        embed_grads = tf.gradients(self.dec_loss + self.vq_loss, embed_vars)
        embed_grads_vars = list(zip(embed_grads, embed_vars))

        # encoder grads: the gradient of dec_loss w.r.t. z_q is used as the
        # upstream gradient of z_e, then the enc_loss gradient is added.
        encoder_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoder")
        transferred_grads = tf.gradients(self.dec_loss, self.z_q)
        encoder_grads = []
        for var in encoder_vars:
            through_z_e = tf.gradients(self.z_e, var, transferred_grads)[0]
            from_enc_loss = tf.gradients(self.enc_loss, var)[0]
            encoder_grads.append(through_z_e + from_enc_loss)
        encoder_grads_vars = list(zip(encoder_grads, encoder_vars))

        # total grads
        self.grads_vars = decoder_grads_vars + embed_grads_vars + encoder_grads_vars

        # Training Scheme
        self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)

        # Summary
        tf.summary.scalar('train/dec_loss', self.dec_loss)
        tf.summary.scalar('train/vq_loss', self.vq_loss)
        tf.summary.scalar('train/enc_loss', self.enc_loss)

        # gradient clipping
        self.clipped = []
        for grad, var in self.grads_vars:
            self.clipped.append((tf.clip_by_value(grad, -1., 1.), var))

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

    # Summary
    self.merged = tf.summary.merge_all()
def __init__(self, mode="train"):
    """
    Build the model graph for the given mode.

    :param mode: "train" reads inputs from `get_batch`; "synthesize" creates
        text and mel placeholders only; any other value is treated as eval
        and additionally creates magnitude and file-name placeholders.

    NOTE(review): block nesting was reconstructed from whitespace-collapsed
    source; loss, optimizer and summary setup are assumed to all run only
    for "train" and "eval". Confirm.
    """
    print("Loading your model...")

    # Attributes that later setup steps are expected to fill in.
    self.mode = mode
    for attr in ("global_step", "mel_loss", "mag_loss", "learning_rate",
                 "optimizer", "merged", "gradients", "clipped", "gvs",
                 "opt_train"):
        setattr(self, attr, None)

    self.is_training = (mode == "train")

    print("Loading inputs...")
    # Load inputs
    if self.is_training:
        (self.txt, self.mels, self.mags, self.file_names,
         self.num_batch) = get_batch()
    elif mode == "synthesize":
        self.txt = tf.placeholder(tf.int32, shape=(None, None))
        self.mels = tf.placeholder(
            tf.float32, shape=(None, None, N_MELS * REDUCTION_FACTOR))
    else:  # eval
        self.txt = tf.placeholder(tf.int32, shape=(None, None))
        self.mels = tf.placeholder(
            tf.float32, shape=(None, None, N_MELS * REDUCTION_FACTOR))
        self.mags = tf.placeholder(
            tf.float32, shape=(None, None, 1 + N_FFT // 2))
        self.file_names = tf.placeholder(tf.string, shape=(None, ))

    # decoder inputs: mels shifted right by one step (zero frame in front),
    # keeping only the last N_MELS channels.
    shifted_mels = tf.concat(
        (tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]), 1)
    self.decoder_inputs = shifted_mels
    self.decoder_inputs = self.decoder_inputs[:, :, -N_MELS:]

    # Networks
    with tf.variable_scope("Networks"):
        print("Loading the encoder...")
        # encoder
        self.memory = encoder(self.txt, is_training=self.is_training)

        print("Loading the decoder...")
        # decoder
        self.mel_hat, self.alignments = decoder(
            self.decoder_inputs, self.memory, is_training=self.is_training)

        print("Loading the post CBHG module...")
        # CBHG Module
        self.mags_hat = cbhg_helper(self.mel_hat,
                                    N_MELS,
                                    is_training=self.is_training,
                                    post=True)

    print("Audio out")
    # audio: the first predicted magnitude is handed to spectrogram2wav.
    self.audio_out = tf.py_func(spectrogram2wav, [self.mags_hat[0]],
                                tf.float32)

    # Training and evaluation
    if mode in ("train", "eval"):
        print("Generating Loss...")
        # Loss
        self.loss = self.get_loss()

        print("Getting the optimizer ready...")
        # Training Scheme
        self.optimize()

        print("Setting up your summary...")
        self.summarize()
def __init__(self, training=True):
    """Build the encoder / decoder / converter graph in a private `tf.Graph`.

    Data feeding (from the original notes):
      x:  Text. (N, T_x), int32
      y1: Reduced melspectrogram. (N, T_y//r, n_mels*r) float32
      y2: Reduced dones. (N, T_y//r,) int32
      z:  Magnitude. (N, T_y, n_fft//2+1) float32

    Args:
        training: If true, inputs come from `get_batch` and the losses,
            train op and summaries are created; otherwise placeholders are
            created for `x`, `y1` and the previous max-attention positions.
    """
    # Load vocabulary
    self.char2idx, self.idx2char = load_vocab()

    self.graph = tf.Graph()
    with self.graph.as_default():
        if training:
            self.x, self.y1, self.y2, self.z, self.num_batch = get_batch()
            self.prev_max_attentions = tf.constant([0] * hp.batch_size)
        else:  # Evaluation
            self.x = tf.placeholder(tf.int32, shape=(hp.batch_size, hp.T_x))
            self.y1 = tf.placeholder(
                tf.float32,
                shape=(hp.batch_size, hp.T_y // hp.r, hp.n_mels * hp.r))
            self.prev_max_attentions = tf.placeholder(
                tf.int32, shape=(hp.batch_size, ))

        # Get decoder inputs: feed last frames only (N, T_y//r, n_mels),
        # shifted right by one step with a zero frame in front.
        self.decoder_input = tf.concat(
            (tf.zeros_like(self.y1[:, :1, -hp.n_mels:]),
             self.y1[:, :-1, -hp.n_mels:]), 1)

        # Networks
        with tf.variable_scope("net"):
            # Encoder. keys: (N, T_x, e), vals: (N, T_x, e)
            self.keys, self.vals, self.masks = encoder(
                self.x, training=training, scope="encoder")

            # Decoder. mel_output: (N, T_y/r, n_mels*r), done_output: (N, T_y/r, 2),
            # decoder_output: (N, T_y/r, e), alignments: (N, T_y, T_x)
            (self.mel_output, self.done_output, self.decoder_output,
             self.alignments, self.max_attentions) = decoder(
                 self.decoder_input,
                 self.keys,
                 self.vals,
                 self.masks,
                 self.prev_max_attentions,
                 training=training,
                 scope="decoder",
                 reuse=None)

            # Restore shape. converter_input: (N, T_y, e/r)
            self.converter_input = tf.reshape(self.decoder_output,
                                              (hp.batch_size, hp.T_y, -1))
            self.converter_input = normalize(self.converter_input,
                                             type=hp.norm_type,
                                             training=training,
                                             activation_fn=tf.nn.relu)

            # Converter. mag_output: (N, T_y, 1+n_fft//2)
            self.mag_output = converter(self.converter_input,
                                        training=training,
                                        scope="converter")

        if training:
            # Loss: L1 on mels, cross-entropy on dones, L1 on magnitudes.
            self.loss1_mae = tf.reduce_mean(
                tf.abs(self.mel_output - self.y1))
            self.loss1_ce = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.done_output, labels=self.y2))
            self.loss2 = tf.reduce_mean(tf.abs(self.mag_output - self.z))
            self.loss = self.loss1_mae + self.loss1_ce + self.loss2

            # Training Scheme
            self.global_step = tf.Variable(0,
                                           name='global_step',
                                           trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)

            ## gradient clipping
            self.gvs = self.optimizer.compute_gradients(self.loss)
            self.clipped = []
            for grad, var in self.gvs:
                # compute_gradients yields None for variables the loss does
                # not depend on; the clip ops reject None, so pass it through
                # and let apply_gradients skip it.
                if grad is not None:
                    grad = tf.clip_by_value(grad, -1. * hp.max_grad_val,
                                            hp.max_grad_val)
                    grad = tf.clip_by_norm(grad, hp.max_grad_norm)
                self.clipped.append((grad, var))
            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

            # Summary
            tf.summary.scalar('loss', self.loss)
            tf.summary.scalar('loss1_mae', self.loss1_mae)
            tf.summary.scalar('loss1_ce', self.loss1_ce)
            tf.summary.scalar('loss2', self.loss2)

            self.merged = tf.summary.merge_all()
]) train_loader_A = utils.data_load(os.path.join('data', args.dataset), 'trainA', transform, args.batch_size, shuffle=True, drop_last=True) train_loader_B = utils.data_load(os.path.join('data', args.dataset), 'trainB', transform, args.batch_size, shuffle=True, drop_last=True) test_loader_A = utils.data_load(os.path.join('data', args.dataset), 'testA', transform, 1, shuffle=True, drop_last=True) test_loader_B = utils.data_load(os.path.join('data', args.dataset), 'testB', transform, 1, shuffle=True, drop_last=True) print('------------ Datasets -------------') print('TrainA:', len(train_loader_A)) print('TrainB:', len(train_loader_B)) print('TestA:', len(test_loader_A)) print('TestB:', len(test_loader_B)) print('-------------- End ----------------') # network En_A = networks.encoder(in_nc=args.in_ngc, nf=args.ngf, img_size=args.img_size).to(device) En_B = networks.encoder(in_nc=args.in_ngc, nf=args.ngf, img_size=args.img_size).to(device) De_A = networks.decoder(out_nc=args.out_ngc, nf=args.ngf).to(device) De_B = networks.decoder(out_nc=args.out_ngc, nf=args.ngf).to(device) Disc_A = networks.discriminator(in_nc=args.in_ndc, out_nc=args.out_ndc, nf=args.ndf, img_size=args.img_size).to(device) Disc_B = networks.discriminator(in_nc=args.in_ndc, out_nc=args.out_ndc, nf=args.ndf, img_size=args.img_size).to(device) En_A.train() En_B.train() De_A.train() De_B.train() Disc_A.train() Disc_B.train() print('---------- Networks initialized -------------') utils.print_network(En_A) utils.print_network(En_B) utils.print_network(De_A) utils.print_network(De_B) utils.print_network(Disc_A)
def __init__(self, training=True):
    """Build the encoder / decoder / converter graph in a private `tf.Graph`.

    Data feeding (from the original notes):
      x:  Text. (N, Tx), int32
      y1: Reduced melspectrogram. (N, Ty//r, n_mels*r) float32
      y2: Reduced dones. (N, Ty//r,) int32
      z:  Magnitude. (N, Ty, n_fft//2+1) float32

    Args:
        training: If true, inputs come from `get_batch` and the losses,
            train op and summaries are created; otherwise placeholders are
            created for `x`, `y1` and the previous max-attention positions.
    """
    # Load vocabulary
    self.char2idx, self.idx2char = load_vocab()

    # Graph
    self.graph = tf.Graph()
    with self.graph.as_default():
        if training:
            self.x, self.y1, self.y2, self.z, self.num_batch = get_batch()
            self.prev_max_attentions_li = tf.ones(
                shape=(hp.dec_layers, hp.batch_size), dtype=tf.int32)
        else:  # Inference
            self.x = tf.placeholder(tf.int32, shape=(hp.batch_size, hp.Tx))
            self.y1 = tf.placeholder(
                tf.float32,
                shape=(hp.batch_size, hp.Ty // hp.r, hp.n_mels * hp.r))
            self.prev_max_attentions_li = tf.placeholder(
                tf.int32, shape=(hp.dec_layers, hp.batch_size,))

        # Get decoder inputs: feed last frames only (N, Ty//r, n_mels),
        # shifted right by one step with a zero frame in front.
        self.decoder_input = tf.concat(
            (tf.zeros_like(self.y1[:, :1, -hp.n_mels:]),
             self.y1[:, :-1, -hp.n_mels:]), 1)

        # Networks
        with tf.variable_scope("encoder"):
            self.keys, self.vals = encoder(self.x, training=training)  # (N, Tx, e)

        with tf.variable_scope("decoder"):
            # mel_logits: (N, Ty/r, n_mels*r)
            # done_output: (N, Ty/r, 2),
            # decoder_output: (N, Ty/r, e)
            # alignments_li: dec_layers*(Tx, Ty/r)
            # max_attentions_li: dec_layers*(N, T_y/r)
            (self.mel_logits, self.done_output, self.decoder_output,
             self.alignments_li, self.max_attentions_li) = decoder(
                 self.decoder_input,
                 self.keys,
                 self.vals,
                 self.prev_max_attentions_li,
                 training=training)
            self.mel_output = tf.nn.sigmoid(self.mel_logits)

        with tf.variable_scope("converter"):
            # Restore shape
            self.converter_input = tf.reshape(
                self.decoder_output, (-1, hp.Ty, hp.embed_size // hp.r))
            self.converter_input = fc_block(
                self.converter_input,
                hp.converter_channels,
                activation_fn=tf.nn.relu,
                training=training)  # (N, Ty, v)

            # Converter
            self.mag_logits = converter(self.converter_input,
                                        training=training)  # (N, Ty, 1+n_fft//2)
            self.mag_output = tf.nn.sigmoid(self.mag_logits)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        if training:
            # Loss: L1 on mels, cross-entropy on dones, L1 on magnitudes.
            self.loss_mels = tf.reduce_mean(tf.abs(self.mel_output - self.y1))
            self.loss_dones = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.done_output, labels=self.y2))
            self.loss_mags = tf.reduce_mean(tf.abs(self.mag_output - self.z))
            self.loss = self.loss_mels + self.loss_dones + self.loss_mags

            # Training Scheme
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)

            ## gradient clipping
            self.gvs = self.optimizer.compute_gradients(self.loss)
            self.clipped = []
            for grad, var in self.gvs:
                # compute_gradients yields None for variables the loss does
                # not depend on; the clip ops reject None, so pass it through
                # and let apply_gradients skip it.
                if grad is not None:
                    grad = tf.clip_by_value(grad, -1. * hp.max_grad_val,
                                            hp.max_grad_val)
                    grad = tf.clip_by_norm(grad, hp.max_grad_norm)
                self.clipped.append((grad, var))
            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

            # Summary
            tf.summary.scalar('Train_Loss/LOSS', self.loss)
            tf.summary.scalar('Train_Loss/mels', self.loss_mels)
            tf.summary.scalar('Train_Loss/dones', self.loss_dones)
            tf.summary.scalar('Train_Loss/mags', self.loss_mags)

            self.merged = tf.summary.merge_all()
def __init__(self, training=True):
    """Build the encoder / decoder / converter graph in a private `tf.Graph`.

    Data feeding (from the original notes):
      x:  Text. (N, Tx), int32
      y1: Melspectrogram. (N, Ty, n_mels) float32
      y2: Dones. (N, Ty) int32
      z:  Magnitude. (N, Ty, n_fft//2+1) float32

    Args:
        training: If true, inputs come from `get_batch` (four tensors, no
            batch count) and the losses, train op and summaries are created;
            otherwise placeholders are created for `x`, `y1` and the previous
            max-attention positions.
    """
    # Load vocabulary
    self.char2idx, self.idx2char = load_vocab()

    # Graph
    self.graph = tf.Graph()
    with self.graph.as_default():
        if training:
            self.x, self.y1, self.y2, self.z = get_batch()
            self.prev_max_attentions_li = tf.ones(
                shape=(hp.dec_layers, hp.batch_size), dtype=tf.int32)
        else:  # Inference
            self.x = tf.placeholder(tf.int32, shape=(hp.batch_size, hp.Tx))
            # NOTE(review): this placeholder is shaped with the reduction
            # factor r while the notes above describe unreduced
            # (N, Ty, n_mels) mels; the two agree only when hp.r == 1. Confirm.
            self.y1 = tf.placeholder(
                tf.float32,
                shape=(hp.batch_size, hp.Ty // hp.r, hp.n_mels * hp.r))
            self.prev_max_attentions_li = tf.placeholder(
                tf.int32, shape=(hp.dec_layers, hp.batch_size,))

        # Get decoder inputs: feed last frames only (N, Ty, n_mels),
        # shifted right by one step with a zero frame in front.
        self.decoder_input = tf.concat(
            (tf.zeros_like(self.y1[:, :1, -hp.n_mels:]),
             self.y1[:, :-1, -hp.n_mels:]), 1)

        # Networks
        with tf.variable_scope("encoder"):
            self.keys, self.vals = encoder(self.x, training=training)  # (N, Tx, e)

        with tf.variable_scope("decoder"):
            # mel_logits: (N, Ty, n_mels)
            # done_output: (N, Ty, 2),
            # decoder_output: (N, Ty, e)
            # alignments_li: dec_layers*(Tx, Ty)
            # max_attentions_li: dec_layers*(N, T_y)
            (self.mel_logits, self.done_output, self.decoder_output,
             self.alignments_li, self.max_attentions_li) = decoder(
                 self.decoder_input,
                 self.keys,
                 self.vals,
                 self.prev_max_attentions_li,
                 training=training)
            self.mel_output = tf.nn.sigmoid(self.mel_logits)

        with tf.variable_scope("converter"):
            # Restore shape
            self.converter_input = tf.reshape(self.decoder_output,
                                              (-1, hp.Ty, hp.embed_size))
            self.converter_input = fc_block(
                self.converter_input,
                hp.converter_channels,
                activation_fn=tf.nn.relu,
                training=training)  # (N, Ty, v)

            # Converter
            self.mag_logits = converter(
                self.converter_input,
                training=training)  # (N, Ty, 1+n_fft//2)
            self.mag_output = tf.nn.sigmoid(self.mag_logits)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        if training:
            # Loss: L1 on mels, cross-entropy on dones, L1 on magnitudes.
            self.loss_mels = tf.reduce_mean(
                tf.abs(self.mel_output - self.y1))
            self.loss_dones = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.done_output, labels=self.y2))
            self.loss_mags = tf.reduce_mean(
                tf.abs(self.mag_output - self.z))
            self.loss = self.loss_mels + self.loss_dones + self.loss_mags

            # Training Scheme
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)

            ## gradient clipping
            self.gvs = self.optimizer.compute_gradients(self.loss)
            self.clipped = []
            for grad, var in self.gvs:
                # compute_gradients yields None for variables the loss does
                # not depend on; the clip ops reject None, so pass it through
                # and let apply_gradients skip it.
                if grad is not None:
                    grad = tf.clip_by_value(grad, -1. * hp.max_grad_val,
                                            hp.max_grad_val)
                    grad = tf.clip_by_norm(grad, hp.max_grad_norm)
                self.clipped.append((grad, var))
            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

            # Summary
            tf.summary.scalar('Train_Loss/LOSS', self.loss)
            tf.summary.scalar('Train_Loss/mels', self.loss_mels)
            tf.summary.scalar('Train_Loss/dones', self.loss_dones)
            tf.summary.scalar('Train_Loss/mags', self.loss_mags)

            self.merged = tf.summary.merge_all()
def create_networks(self):
    """Create all placeholders and networks used by the agent.

    Builds, in order: the input placeholders, the trajectory decoder, the
    low-level policy / critic or Q-function (selected by
    `self.low_level_alg`), the per-agent high-level Q networks, and the
    main and target mixing networks.
    """
    hidden = self.nn

    # Placeholders
    self.state = tf.placeholder(tf.float32, [None, self.l_state], 'state')
    self.obs = tf.placeholder(tf.float32, [None, self.l_obs], 'obs')
    self.z = tf.placeholder(tf.float32, [None, self.l_z], 'z')

    # Decoder p(z|tau). The last trajectory dimension is the truncated
    # observation length when one is configured, else the full l_obs.
    if self.obs_truncate_length:
        traj_shape = [None, self.traj_length_downsampled,
                      self.obs_truncate_length]
    else:
        traj_shape = [None, self.traj_length_downsampled, self.l_obs]
    self.traj = tf.placeholder(dtype=tf.float32, shape=traj_shape)
    with tf.variable_scope("Decoder"):
        self.decoder_out, self.decoder_probs = networks.decoder(
            self.traj, self.traj_length_downsampled,
            hidden['n_h_decoder'], self.l_z)

    # Low-level policy
    if self.low_level_alg in ('reinforce', 'iac'):
        self.epsilon = tf.placeholder(tf.float32, None, 'epsilon')
        with tf.variable_scope("Policy_main"):
            probs = networks.actor(self.obs, self.z, hidden['n_h1_low'],
                                   hidden['n_h2_low'], self.l_action)
        # Mix the policy with a uniform distribution, weighted by epsilon.
        self.probs = (1 - self.epsilon) * probs + self.epsilon / float(
            self.l_action)
        self.action_samples = tf.multinomial(tf.log(self.probs), 1)

    if self.low_level_alg == 'iac':
        with tf.variable_scope("V_main"):
            self.V = networks.critic(self.obs, self.z, hidden['n_h1_low'],
                                     hidden['n_h2_low'])
        with tf.variable_scope("V_target"):
            self.V_target = networks.critic(self.obs, self.z,
                                            hidden['n_h1_low'],
                                            hidden['n_h2_low'])

    # Low-level Q-functions
    if self.low_level_alg == 'iql':
        with tf.variable_scope("Qlow_main"):
            self.Q_low = networks.Q_low(self.obs, self.z,
                                        hidden['n_h1_low'],
                                        hidden['n_h2_low'], self.l_action)
        with tf.variable_scope("Qlow_target"):
            self.Q_low_target = networks.Q_low(self.obs, self.z,
                                               hidden['n_h1_low'],
                                               hidden['n_h2_low'],
                                               self.l_action)
        self.argmax_Q_low = tf.argmax(self.Q_low, axis=1)
        self.actions_low_1hot = tf.placeholder(
            tf.float32, [None, self.l_action], 'actions_low_1hot')

    # High-level QMIX
    # Individual agent networks
    # output dimension is [time * n_agents, q-values]
    with tf.variable_scope("Agent_main"):
        self.agent_qs = networks.Qmix_single(self.obs, hidden['n_h1'],
                                             hidden['n_h2'], self.l_z)
    with tf.variable_scope("Agent_target"):
        self.agent_qs_target = networks.Qmix_single(
            self.obs, hidden['n_h1'], hidden['n_h2'], self.l_z)

    self.argmax_Q = tf.argmax(self.agent_qs, axis=1)
    self.argmax_Q_target = tf.argmax(self.agent_qs_target, axis=1)

    # To extract Q-value from agent_qs and agent_qs_target
    # [batch*n_agents, N_roles]
    self.actions_1hot = tf.placeholder(tf.float32, [None, self.l_z],
                                       'actions_1hot')
    # [batch*n_agents, 1]
    masked_qs = tf.multiply(self.agent_qs, self.actions_1hot)
    self.q_selected = tf.reduce_sum(masked_qs, axis=1)
    # [batch, n_agents]
    self.mixer_q_input = tf.reshape(self.q_selected, [-1, self.n_agents])

    masked_target_qs = tf.multiply(self.agent_qs_target, self.actions_1hot)
    self.q_target_selected = tf.reduce_sum(masked_target_qs, axis=1)
    self.mixer_target_q_input = tf.reshape(self.q_target_selected,
                                           [-1, self.n_agents])

    # Mixing network
    with tf.variable_scope("Mixer_main"):
        self.mixer = networks.Qmix_mixer(self.mixer_q_input, self.state,
                                         self.l_state, self.n_agents,
                                         hidden['n_h_mixer'])
    with tf.variable_scope("Mixer_target"):
        self.mixer_target = networks.Qmix_mixer(
            self.mixer_target_q_input, self.state, self.l_state,
            self.n_agents, hidden['n_h_mixer'])
def model_fn(features, labels, mode, params, config):
    """Create the encode -> decode -> re-encode graph for ``tf.estimator``.

    The function follows the ``model_fn`` signature required by
    ``tf.estimator``.

    Args:
        features: Batch of input images; used directly as the images tensor.
        labels: Not used by this function.
        mode: A ``tf.estimator.ModeKeys`` value.  Only TRAIN and EVAL are
            supported; PREDICT trips an assertion.
        params: Dict read for the keys 'feature_to_use',
            'pretrained_checkpoint', 'weight_decay', 'lambda',
            'initial_learning_rate', 'num_steps' and 'end_learning_rate'.
        config: Not used by this function.

    Returns:
        A ``tf.estimator.EstimatorSpec`` carrying the total loss plus either
        evaluation metrics (EVAL) or the training op (TRAIN).
    """
    mode_keys = tf.estimator.ModeKeys
    images = features

    # build the main graph
    layer_name = params['feature_to_use']  # Relu_X_1
    encoding = encoder(images)[layer_name]
    restored_images = decoder(encoding, layer_name)
    restored_encoding = encoder(restored_images)[layer_name]

    # use a pretrained backbone network
    if mode == mode_keys.TRAIN:
        with tf.name_scope('init_from_checkpoint'):
            tf.train.init_from_checkpoint(
                params['pretrained_checkpoint'],
                {'vgg_19/': 'encoder/'}
            )

    assert mode != mode_keys.PREDICT

    # add L2 regularization
    with tf.name_scope('weight_decay'):
        add_weight_decay(params['weight_decay'])
        regularization_loss = tf.losses.get_regularization_loss()

    # Both L2 terms are divided by 255 times the dynamic batch size.
    batch_size = tf.to_float(tf.shape(images)[0])
    normalizer = 255.0 * batch_size
    pixel_residual = images - restored_images
    reconstruction_loss = tf.nn.l2_loss(pixel_residual) / normalizer
    feature_residual = encoding - restored_encoding
    features_loss = tf.nn.l2_loss(feature_residual) / normalizer

    # The feature term enters the total loss weighted by params['lambda'].
    tf.losses.add_loss(reconstruction_loss)
    tf.losses.add_loss(params['lambda'] * features_loss)

    tf.summary.scalar('regularization_loss', regularization_loss)
    tf.summary.scalar('reconstruction_loss', reconstruction_loss)
    tf.summary.scalar('features_loss', features_loss)
    total_loss = tf.losses.get_total_loss(add_regularization_losses=True)

    if mode == mode_keys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode,
            loss=total_loss,
            eval_metric_ops={
                'val_reconstruction_loss': tf.metrics.mean(
                    reconstruction_loss),
                'val_features_loss': tf.metrics.mean(features_loss),
            },
        )

    assert mode == mode_keys.TRAIN

    with tf.variable_scope('learning_rate'):
        global_step = tf.train.get_global_step()
        # power=1.0 makes the polynomial schedule a linear decay
        learning_rate = tf.train.polynomial_decay(
            params['initial_learning_rate'],
            global_step,
            params['num_steps'],
            params['end_learning_rate'],
            power=1.0
        )
        tf.summary.scalar('learning_rate', learning_rate)

    with tf.variable_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate,
                                           beta1=0.9, beta2=0.999)
        train_op = optimizer.minimize(total_loss, global_step=global_step)

    return tf.estimator.EstimatorSpec(mode, loss=total_loss,
                                      train_op=train_op)
def __init__(self, config=None, training=True, train_form='Both'):
    """Build the TensorFlow graph for the requested sub-networks.

    Args:
        config: Forwarded to ``get_batch`` when ``training`` is true; not
            read otherwise.
        training: When true, inputs come from ``get_batch`` and the loss,
            optimizer, gradient-clipped train op and summaries are built.
            When false, ``x``, ``y1`` and ``prev_max_attentions_li`` are
            placeholders with a batch dimension of 1.
        train_form: 'Both' builds encoder, decoder and converter;
            'Encoder' builds only encoder and decoder; 'Converter' builds
            only the converter, fed directly from ``y1``.  The value is
            not validated here.
    """
    # Load vocabulary
    self.char2idx, self.idx2char = load_vocab()

    # Anything other than 'Converter' builds the encoder/decoder path.
    builds_mel_path = train_form != 'Converter'

    self.graph = tf.Graph()
    with self.graph.as_default():
        if training:
            batch = get_batch(config, train_form)
            (self.origx, self.x, self.y1,
             self.y2, self.y3, self.num_batch) = batch
            self.prev_max_attentions_li = tf.ones(
                shape=(hp.dec_layers, self.num_batch), dtype=tf.int32)
        else:
            # Evaluation
            self.x = tf.placeholder(tf.int32, shape=(1, hp.T_x))
            self.y1 = tf.placeholder(
                tf.float32,
                shape=(1, hp.T_y // hp.r, hp.n_mels * hp.r))
            self.prev_max_attentions_li = tf.placeholder(
                tf.int32, shape=(hp.dec_layers, 1,))

        # Get decoder inputs: feed last frames only
        if builds_mel_path:
            # A zero step is prepended and the final step dropped, keeping
            # only the last hp.n_mels channels of each step of y1.
            leading_zeros = tf.zeros_like(self.y1[:, :1, -hp.n_mels:])
            shifted_frames = self.y1[:, :-1, -hp.n_mels:]
            self.decoder_input = tf.concat(
                (leading_zeros, shifted_frames), 1)

        # Networks
        if builds_mel_path:
            with tf.variable_scope("encoder"):
                self.encoded = encoder(self.x, training=training)

            with tf.variable_scope("decoder"):
                (self.mel_logits,
                 self.done_output,
                 self.max_attentions_li) = decoder(
                    self.decoder_input,
                    self.encoded,
                    self.prev_max_attentions_li,
                    training=training)
                self.mel_output = tf.nn.sigmoid(self.mel_logits)

        if train_form in ('Both', 'Converter'):
            with tf.variable_scope("converter"):
                # 'Both' feeds the converter with the predicted mel output;
                # 'Converter' feeds it with y1 directly.
                if train_form == 'Both':
                    self.converter_input = self.mel_output
                else:
                    self.converter_input = self.y1
                self.mag_logits = converter(self.converter_input,
                                            training=training)
                self.mag_output = tf.nn.sigmoid(self.mag_logits)

        self.global_step = tf.Variable(0, name='global_step',
                                       trainable=False)

        if training:
            # Loss
            if builds_mel_path:
                self.loss1 = tf.reduce_mean(
                    tf.abs(self.mel_output - self.y1))
                if hp.include_dones:
                    done_xent = \
                        tf.nn.sparse_softmax_cross_entropy_with_logits(
                            logits=self.done_output, labels=self.y2)
                    self.loss2 = tf.reduce_mean(done_xent)
            if train_form != 'Encoder':
                self.loss3 = tf.reduce_mean(
                    tf.abs(self.mag_output - self.y3))

            # Total loss: mel term, plus the done term when enabled, plus
            # the magnitude term for 'Both'; any form other than 'Both' or
            # 'Encoder' uses the magnitude term alone.
            if train_form in ('Both', 'Encoder'):
                total = self.loss1
                if hp.include_dones:
                    total = total + self.loss2
                if train_form == 'Both':
                    total = total + self.loss3
            else:
                total = self.loss3
            self.loss = total

            # Training Scheme
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)

            ## gradient clipping
            def _clip(gradient):
                # Variables without a gradient are passed through as None.
                if gradient is None:
                    return None
                bounded = tf.clip_by_value(
                    gradient, -1. * hp.max_grad_val, hp.max_grad_val)
                return tf.clip_by_norm(bounded, hp.max_grad_norm)

            self.gvs = self.optimizer.compute_gradients(self.loss)
            self.clipped = [(_clip(gradient), variable)
                            for gradient, variable in self.gvs]
            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

            # Summary
            tf.summary.scalar('loss', self.loss)

            if builds_mel_path:
                tf.summary.histogram('mel_output', self.mel_output)
                tf.summary.histogram('mel_actual', self.y1)
                tf.summary.scalar('loss1', self.loss1)
                if hp.include_dones:
                    tf.summary.histogram('done_output', self.done_output)
                    tf.summary.histogram('done_actual', self.y2)
                    tf.summary.scalar('loss2', self.loss2)
            if train_form != 'Encoder':
                tf.summary.histogram('mag_output', self.mag_output)
                tf.summary.histogram('mag_actual', self.y3)
                tf.summary.scalar('loss3', self.loss3)

            self.merged = tf.summary.merge_all()
z_encoder_sketchy = encoder(in_dim=params.x_dim, z_dim=params.glove_dim) cuda(z_encoder_sketchy) z_encoder_sketchy = train_z_encoder( encoder_model=z_encoder_sketchy, feature_dict=features_sketchy_dict, dump_location=params.path_z_encoder_sketchy) else: z_encoder_sketchy = torch.load(params.path_z_encoder_sketchy) cuda(z_encoder_sketchy) if (not os.path.isfile(params.path_s_encoder_sketchy)): s_encoder_sketchy = encoder(in_dim=params.x_dim, z_dim=params.glove_dim) decoder_sketchy = decoder(params.glove_dim) adv_sketchy = adv_classifier(feat_dim=params.glove_dim, num_classes=params.num_class) cuda(s_encoder_sketchy) cuda(adv_sketchy) cuda(decoder_sketchy) s_encoder_sketchy = train_s_encoder( z_encoder=z_encoder_sketchy, s_encoder=s_encoder_sketchy, decoder=decoder_sketchy, adv_classifier=adv_sketchy, feature_dict=features_sketchy_dict, dump_location=params.path_s_encoder_sketchy) else: s_encoder_sketchy = torch.load(params.path_s_encoder_sketchy)