def __init__(self, lr=0.1, use_locking=False, name="ExponentialMappingOptimizer"):
    """Set up the optimizer.

    lr          -- step size for the Riemannian (exponential-map) update.
    use_locking -- forwarded to the base tf Optimizer.
    name        -- optimizer name, forwarded to the base class.
    """
    super(ExponentialMappingOptimizer, self).__init__(use_locking, name)
    # Euclidean parameters are delegated to a stock Adam optimizer.
    self.euclidean_optimizer = AdamOptimizer()
    self.lr = lr
def _build(self, n_inputs):
    """Build the TF graph: model f(x) = sigmoid(w(x) . phi(x)), its losses,
    regularizer, training ops, session, and checkpoint saver.

    n_inputs -- dimensionality of the input placeholder x.
    """
    # Build the input/output variables
    x = tf.placeholder(shape=[None, n_inputs], name='x', dtype=tf.float32)
    y = tf.placeholder(shape=[None, 1], name='y', dtype=tf.float32)
    # Build the model: w(x) and phi(x) are produced by subnetworks.
    w, phi = self._build_subnets(x)
    n_hidden = int(w.shape[1])
    assert phi.shape[1] == n_hidden, \
        'w(x) and phi(x) have incompatible shapes: {} {}'.format(w.shape, phi.shape)
    # Prediction is the sigmoid of the per-example dot product <w, phi>.
    dot = tf.reduce_sum(w * phi, axis=1, keepdims=True, name='dot')
    f = tf.sigmoid(dot, name='f')
    # z: target values for w(x) (supervision on the explanation weights).
    z = tf.placeholder(shape=[None, n_hidden], dtype=tf.float32, name='z')
    # Build the losses: label loss plus squared distance between z and w(x).
    loss_y = log_loss(y, f)
    loss_z = tf.reduce_mean(tf.reduce_sum((z - w) * (z - w), axis=1))
    # Build the regularizer: penalize the mismatch between the true gradient
    # of f wrt x and the linearized w^T * Jacobian(phi) term.
    # XXX remove bias?
    grad_f = tf.gradients(f, [x])[0]
    jacob_phi = batch_jacobian(phi, x)
    w_times_jacob_phi = tf.einsum('boi,bo->bi', jacob_phi, w)
    reg_z = tf.reduce_sum(tf.squared_difference(grad_f, w_times_jacob_phi))
    # Build the optimizers; lambdas weight the three loss terms.
    l0, l1, l2 = 1 - sum(self.lambdas), self.lambdas[0], self.lambdas[1]
    self.train_op_y = AdamOptimizer(self.eta) \
        .minimize(l0 * loss_y + l2 * reg_z)
    self.train_op_z = AdamOptimizer(self.eta) \
        .minimize(l1 * loss_z + l2 * reg_z)
    self.train_op_y_z = AdamOptimizer(self.eta) \
        .minimize(l0 * loss_y + l1 * loss_z + l2 * reg_z)
    # Build the tensorflow session and save an initial checkpoint so the
    # model can later be reset to its untrained state.
    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())
    self._saver = tf.train.Saver()
    self._saver.save(self.session, _CHECKPOINT)
    # Expose graph tensors by name for use elsewhere in the class.
    self.tf_vars = {
        'x': x,
        'z': z,
        'y': y,
        'w': w,
        'phi': phi,
        'dot': dot,
        'f': f,
        'loss_y': loss_y,
        'loss_z': loss_z,
        'reg_z': reg_z,
    }
def model_fn(features, labels, mode):
    """Estimator model_fn: TF-Hub image encoder followed by three dense heads.

    features -- dict with key 'x' holding the input image batch.
    labels   -- target embedding vectors (TRAIN/EVAL only).
    mode     -- one of tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}.
    Returns an EstimatorSpec for the given mode.
    """
    # Silence hub's module-download chatter, then restore verbosity.
    tf.logging.set_verbosity(tf.logging.WARN)
    model = hub.Module(IMG_ENCODER, trainable=True)
    tf.logging.set_verbosity(tf.logging.INFO)
    model = model(features['x'])
    # BUG FIX: the original fed `model` to all three dense layers, so the
    # first two were dead code. Chain them so the layers are actually stacked.
    output = tf.layers.dense(model, VEC_SPACE_DIMENSIONS, activation=tf.nn.relu)
    output = tf.layers.dense(output, VEC_SPACE_DIMENSIONS, activation=tf.nn.relu)
    output = tf.layers.dense(output, VEC_SPACE_DIMENSIONS, activation=tf.nn.tanh)
    if mode == ModeKeys.TRAIN or mode == ModeKeys.EVAL:
        loss = mean_squared_error(labels, output)
        # L2 penalties registered by the dense layers / hub module.
        regularizer = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        loss = loss + 0.25 * sum(regularizer)
    if mode == ModeKeys.TRAIN:
        train_op = AdamOptimizer(learning_rate=0.00001).minimize(
            loss=loss, global_step=get_global_step())
        return EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
    elif mode == ModeKeys.EVAL:
        eval_metric_ops = {
            'accuracy': tf.metrics.mean_cosine_distance(labels, output, 0)
        }
        return EstimatorSpec(mode=mode, loss=loss,
                             eval_metric_ops=eval_metric_ops)
    elif mode == ModeKeys.PREDICT:
        return EstimatorSpec(mode=mode, predictions=output)
def __init__(self, state_size, action_size, lr=0.001):
    """Build a supervised policy graph: softmax policy network trained with
    negative log-likelihood of the demonstrated actions plus L2 regularization.
    """
    self.init = xavier_initializer()
    with tf.variable_scope('supervised_policy'):
        # Placeholders for states and the expert actions taken in them.
        self.st = tf.placeholder(tf.float32, [None, state_size], name='st')
        self.acts_prob = self.sl_policy_nn(self.st, state_size, action_size,
                                           self.init)
        self.act = tf.placeholder(tf.int32, [None], name='act')
        # Pick out the probability assigned to each demonstrated action.
        chosen = tf.cast(tf.one_hot(self.act, depth=action_size), tf.bool)
        self.act_prob = tf.boolean_mask(self.acts_prob, chosen)
        # Loss = sum of L2 penalties in this scope + NLL of chosen actions.
        reg_terms = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES,
                                      scope='supervised_policy')
        nll = tf.reduce_sum(-tf.log(self.act_prob))
        self.loss = sum(reg_terms) + nll
        self.optimizer = AdamOptimizer(learning_rate=lr)
        self.training_op = self.optimizer.minimize(self.loss)
class Grad_policy(object):
    """REINFORCE-style policy-gradient network: a 3-layer softmax policy whose
    loss is reward-weighted negative log-likelihood plus L2 regularization."""

    def __init__(self, state_size, action_size, lr=0.001):
        """Build the policy graph inside the 'supervised_policy' scope."""
        self.init = xavier_initializer()
        with tf.variable_scope('supervised_policy'):
            self.st = tf.placeholder(tf.float32, [None, state_size], name='st')
            self.acts_prob = self.sl_policy_nn(self.st, state_size,
                                               action_size, self.init)
            self.act = tf.placeholder(tf.int32, [None], name='act')
            # Scalar (or broadcastable) reward weighting of each log-prob.
            self.reward = tf.placeholder(tf.float32, name='reward')
            # Select the probability assigned to each taken action.
            act_mask = tf.cast(tf.one_hot(self.act, depth=action_size),
                               tf.bool)
            self.act_prob = tf.boolean_mask(self.acts_prob, act_mask)
            # Loss = L2 penalties in scope + reward-weighted NLL.
            self.loss = sum(
                tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES,
                                  scope='supervised_policy')) + tf.reduce_sum(
                                      -tf.log(self.act_prob) * self.reward)
            self.optimizer = AdamOptimizer(learning_rate=lr)
            self.training_op = self.optimizer.minimize(self.loss)

    def sl_policy_nn(self, state, state_size, action_size, init):
        """Three-layer MLP (512 -> 1024 -> action_size) with ReLU hiddens and
        a softmax output; all weights carry an L2(0.01) regularizer."""
        w1 = tf.get_variable('W1', [state_size, 512],
                             initializer=init,
                             regularizer=l2_regularizer(0.01))
        b1 = tf.get_variable('b1', [512],
                             initializer=tf.constant_initializer(0.0))
        h1 = tf.nn.relu(tf.matmul(state, w1) + b1)
        w2 = tf.get_variable('w2', [512, 1024],
                             initializer=init,
                             regularizer=l2_regularizer(0.01))
        b2 = tf.get_variable('b2', [1024],
                             initializer=tf.constant_initializer(0.0))
        h2 = tf.nn.relu(tf.matmul(h1, w2) + b2)
        w3 = tf.get_variable('w3', [1024, action_size],
                             initializer=init,
                             regularizer=l2_regularizer(0.01))
        b3 = tf.get_variable('b3', [action_size],
                             initializer=tf.constant_initializer(0.0))
        acts_prob = tf.nn.softmax(tf.matmul(h2, w3) + b3)
        return acts_prob

    def get_act_probs(self, st, sess=None):
        """Return action probabilities for a batch of states."""
        sess = sess or tf.get_default_session()
        return sess.run(self.acts_prob, {self.st: st})

    def train_batch(self, st, act, reward, sess=None):
        """Run one optimization step on a batch; return the batch loss."""
        sess = sess or tf.get_default_session()
        _, loss = sess.run([self.training_op, self.loss], {
            self.st: st,
            self.act: act,
            self.reward: reward
        })
        return loss
def build(self) -> None:
    """Construct PPO-clip losses, Adam training ops, and the approximate-KL
    diagnostic for the most recent model instance, then initialize variables."""
    self.model = self.model_instances[-1]
    # Importance ratio pi_new/pi_old, computed in log space for stability.
    prob_ratio = tf.exp(self.model.logp - self.act_prob_ph)
    bounded_ratio = tf.clip_by_value(prob_ratio,
                                     1 - self.clip_range,
                                     1 + self.clip_range)
    # PPO clipped surrogate objective (maximized, hence the leading minus).
    surrogate = tf.minimum(prob_ratio * self.advantage_ph,
                           bounded_ratio * self.advantage_ph)
    self.pi_loss = -tf.reduce_mean(surrogate)
    # Value-function regression loss and a cheap KL estimate for monitoring.
    self.v_loss = tf.reduce_mean((self.value_ph - self.model.v)**2)
    self.approx_kl = tf.reduce_mean(self.act_prob_ph - self.model.logp)
    self.train_pi = AdamOptimizer(learning_rate=self.pi_lr).minimize(
        self.pi_loss)
    self.train_v = AdamOptimizer(learning_rate=self.vf_lr).minimize(
        self.v_loss)
    # Initialize variables
    self.model.sess.run(tf.global_variables_initializer())
def build(self) -> None:
    """Wire up the DQN training graph: MSE loss between policy-network values
    and targets, an Adam train op, and an initial policy->target sync."""
    self.policy_model, self.target_model = self.model_instances[:2]
    # Squared TD-error against the externally supplied target values.
    td_error = self.policy_model.values - self.target_ph
    self.loss = tf.reduce_mean(td_error**2)
    self.train_q = AdamOptimizer(learning_rate=self.lr).minimize(self.loss)
    # Both models own separate sessions; initialize each, then copy weights.
    for m in (self.policy_model, self.target_model):
        m.sess.run(tf.global_variables_initializer())
    self.update_target_model()
def train():
    """Eager-mode training loop: minimize per-image cross-entropy with Adam,
    checkpointing the classifier weights every 1000 images and at the end."""
    classifier = get_model()
    opt = AdamOptimizer(1e-5)
    images_data = get_classification_data("../data/data_classification_train.json")
    print("Training started")
    shuffle(images_data)
    step = 0
    for (i, label) in images_data:
        img = get_img("../pictures/pictures_classification_train/{}.png".format(i))

        def get_loss():
            # Loss closure re-evaluated by the optimizer: forward pass,
            # cross-entropy for this single image, plus logging via save_data.
            img_vector = tf.convert_to_tensor([img], dtype=np.float32)
            logits = classifier(img_vector)
            entropy = sparse_softmax_cross_entropy_with_logits(labels=[label],
                                                               logits=logits)
            entropy = tf.gather(entropy, 0)
            save_data(label, logits[0].numpy().tolist(),
                      entropy.numpy().tolist())
            return entropy

        opt.minimize(get_loss)
        step += 1
        if step % 1000 == 0:
            classifier.save_weights(weights_path)
            print("Weights saved")
    classifier.save_weights(weights_path)
    print("Weights saved")
def __init__(self, batch_size=100, epochs=50, verbose=1):
    """Configure the binary image classifier and compile its Keras model.

    batch_size, epochs, verbose -- standard Keras fit() knobs, stored for later.
    """
    self.batch_size = batch_size
    self.epochs = epochs
    self.verbose = verbose
    self.num_classes = 2          # binary task
    # Test data / tracking slots are populated elsewhere.
    self.test_data = None
    self.test_labels = None
    self.tracker = None
    self.img_shape = (224, 224, 1)  # grayscale 224x224 input
    self.model = self.create_model()
    self.model.compile(optimizer=AdamOptimizer(),
                       loss='binary_crossentropy',
                       metrics=['accuracy'])
def train(_seed, minibatch_size, no_iterations, lrate, show_training_info,
          net=None, n_data=1e-4, X_test=None, y_test=None,
          y_train_std=None, y_train_mean=None):
    """Fit `net` with Adam, converting an iteration budget into epochs.

    Returns the wall-clock CPU time spent in net.train (seconds).
    NOTE(review): the default n_data=1e-4 looks like a typo for 1e4 —
    confirm with callers before relying on the default.
    """
    if not minibatch_size:
        minibatch_size = min(1e4, n_data)
    # Epochs needed to reach `no_iterations` minibatch updates.
    batches_per_epoch = np.ceil(n_data / minibatch_size)
    n_epochs = int(np.ceil(no_iterations / batches_per_epoch))
    started = time.process_time()
    net.train(AdamOptimizer(lrate), n_epochs,
              minibatch_size=minibatch_size,
              X_test=X_test,
              show_training_info=show_training_info,
              y_test=y_test,
              y_train_std=y_train_std,
              y_train_mean=y_train_mean,
              log_every=100)
    return time.process_time() - started
def train_optimizer(self):
    """Build the training op: select an optimizer by name, clip each gradient
    element-wise to [-clip_grad_, clip_grad_], and apply the clipped updates
    while advancing global_step_."""
    with tf.variable_scope('train_step'):
        self.global_step_ = tf.Variable(0, name='global_step_',
                                        trainable=False)
        # Dispatch table replaces the original if/elif chain; unknown names
        # fall back to plain gradient descent, as before.
        factories = {
            'Adam': lambda: AdamOptimizer(
                learning_rate=self.learning_rate_ph_),
            'Adagrad': lambda: AdagradOptimizer(
                learning_rate=self.learning_rate_ph_),
            'Adadelta': lambda: AdadeltaOptimizer(
                learning_rate=self.learning_rate_ph_),
            'RMSProp': lambda: RMSPropOptimizer(
                learning_rate=self.learning_rate_ph_),
            'Momentum': lambda: MomentumOptimizer(
                learning_rate=self.learning_rate_ph_, momentum=0.9),
        }
        default = lambda: GradientDescentOptimizer(
            learning_rate=self.learning_rate_ph_)
        opt = factories.get(self.optimizer_, default)()
        # Alternative (not used): clip by *global norm* instead —
        #   tf_vars = tf.trainable_variables()
        #   tf_grads = tf.gradients(self.loss_, tf_vars)
        #   tf_grads, _ = tf.clip_by_global_norm(tf_grads, self.clip_grad_)
        #   self.train_optimizer_ = opt.apply_gradients(zip(tf_grads, tf_vars))
        # Compute gradients, clip each element by value, then apply.
        grads_and_vars = opt.compute_gradients(self.loss_)
        clipped = [[tf.clip_by_value(g, -self.clip_grad_, self.clip_grad_), v]
                   for g, v in grads_and_vars]
        self.train_optimizer_ = opt.apply_gradients(
            clipped, global_step=self.global_step_)
def __init__(self, num_states, action_size=None, *, num_actions=None):
    """DQN-style agent state: replay memory plus main/target Q-networks.

    num_states, num_actions -- environment observation/action dimensions.
    """
    # (signature note: parameter names unchanged below)
    raise NotImplementedError
def load_model(self, filename='model'):
    """ Loads the model and trained weights created by train_model.py.

    filename -- path prefix; architecture is read from <filename>.json and
    weights from <filename>.h5. Returns the compiled model.
    """
    with open(filename + '.json', 'r') as arch_file:
        architecture_json = arch_file.read()
    restored = model_from_json(architecture_json)
    restored.load_weights(filename + '.h5')
    print('Loaded model')
    # Recompile: compile state is not stored in the JSON/H5 pair.
    restored.compile(optimizer=AdamOptimizer(),
                     loss='binary_crossentropy',
                     metrics=['accuracy'])
    print(restored.summary())
    return restored
def model(self, input_shape, label_shape, opt, lr=1e-4, training=True):
    """Assemble the network: inputs, encoder, decode head, loss, optimizer
    and the one-step training op.

    opt -- 'adam' or 'sgd' (any other value leaves the optimizer unset and
    minimize() below will raise AttributeError, as in the original).
    """
    # Build the graph pieces in order.
    self.set_inputs(input_shape, label_shape)
    self.create_encoder()
    self.create_decode_head()
    if not training:
        # Inference graphs additionally reshape the raw output.
        self.reshape_output()
    self.define_loss()
    # Pick the optimizer. NOTE(review): attribute name 'optimzer' (sic) is
    # kept as-is — other code may reference it; confirm before renaming.
    if opt == 'adam':
        self.optimzer = AdamOptimizer(learning_rate=lr)
    elif opt == 'sgd':
        self.optimzer = GradientDescentOptimizer(learning_rate=lr)
    # One training iteration per run; advances self.global_step.
    self.train_op = self.optimzer.minimize(self.loss,
                                           global_step=self.global_step)
def train(_seed, minibatch_size, no_iterations, lrate, show_training_info,
          net=None, n_data=1e-4):
    """Fit `net` with Adam for enough epochs to cover `no_iterations`
    minibatch updates; return CPU seconds spent training.

    NOTE(review): default n_data=1e-4 looks like a typo for 1e4 — confirm.
    """
    if not minibatch_size:
        minibatch_size = min(1e4, n_data)
    per_epoch = np.ceil(n_data / minibatch_size)
    n_epochs = int(np.ceil(no_iterations / per_epoch))
    tic = time.process_time()
    net.train(AdamOptimizer(lrate), n_epochs,
              minibatch_size=minibatch_size,
              show_training_info=show_training_info)
    toc = time.process_time()
    return toc - tic
def fit_model(X_train, Y_train, model, checkpoint_dir, imgtup):
    """Compile and fit `model` on augmented images, unless checkpoints already
    exist in checkpoint_dir (in which case training is skipped entirely).

    imgtup -- (name, preprocessing_function) pair used for augmentation and
    for naming the loss plot. Returns the (possibly untrained) model.
    """
    imgname, imgfunc = imgtup
    chk = os.listdir(checkpoint_dir)
    if len(chk) > 1:
        # Checkpoints present: skip training. Weight restoration is
        # intentionally disabled (commented out) for now.
        # latest = tf.train.latest_checkpoint(checkpoint_dir)
        # model.load_weights(latest)
        pass
    else:
        datagen = ImageDataGenerator(
            preprocessing_function=imgfunc)
        # Transform all training images
        datagen.fit(X_train)
        # Compile model
        learning_rate = 1e-3
        opt = AdamOptimizer(learning_rate=learning_rate)
        model.compile(optimizer=opt, loss=mean_absolute_error,
                      metrics=['accuracy'])
        model.summary()
        # Fit model on the augmented stream; one pass = whole train set.
        history = model.fit_generator(
            datagen.flow(X_train, Y_train, batch_size=32),
            steps_per_epoch=X_train.shape[0] / 32,
            epochs=100)
        plot_loss('review/train_val_loss_021_{}.png'.format(imgname), history)
    return model
#Y_test = X_test X_train = X_train[0:m, ...] Y_train = X_train[0:m, ...] X_test = X_test[0:m, ...] Y_test = X_test[0:m, ...] logger.debug("X_train default shape: {}".format(X_train.shape)) logger.debug("Y_train default shape: {}".format(Y_train.shape)) # Compiling model using Keras learning_rate = 1e-3 model = simple_sony() #model = full_sony() #opt = Adam(lr=1e-4) opt = AdamOptimizer(learning_rate=learning_rate) model.compile(optimizer=opt, loss=mean_absolute_error, metrics=['accuracy']) # Fitting the model history = model.fit(X_train, Y_train, validation_split=0.25, epochs=100, batch_size=32, callbacks=[cp_callback]) plot_loss('review/train_val_loss.png', history) # Predicting with the model
from layers.capsule_max_pool import CapsMaxPool
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.train import AdamOptimizer
import numpy as np
import tensorflow as tf

# Smoke-test script for the CapsMaxPool layer in eager mode: build a tiny
# model, fit it for one batch on hand-written capsule data.
tf.enable_eager_execution()

shape = (1, 2, 2, 2, 3)
x = Input(shape=shape[1:])
maxpool = CapsMaxPool()(x)
model = Model(inputs=x, outputs=maxpool)

# One 2x2 grid of 2 capsules of dim 3 each; target is the max-pooled cell.
input_x = np.array([[
    [[[1, 2, 3], [1, 2, 3]], [[4, 5, 6], [4, 5, 6]]],
    [[[7, 8, 9], [7, 8, 9]], [[10, 11, 12], [10, 11, 12]]],
]], dtype=np.float32)
input_y = np.array([[[
    [[10, 11, 12], [10, 11, 12]],
]]], dtype=np.float32)
# Tensor versions (currently unused below; kept for interactive inspection).
tensor_x = tf.cast(input_x, dtype=tf.float32)
tensor_y = tf.cast(input_y, dtype=tf.float32)

opt = AdamOptimizer()
model.compile(optimizer=opt, loss='mean_squared_error')
print(model.fit(x=input_x, y=input_y, batch_size=1))
# BUG FIX: removed a duplicated model.compile(...) that followed fit() — it
# was dead code and would have discarded the optimizer state for no benefit.
# Linear soft-margin SVM trained with hinge loss + L2 penalty in raw TF1.
tf.reset_default_graph()
X_data = tf.placeholder(tf.float32, shape=[None, x_vals.shape[1]])
y_target = tf.placeholder(tf.float32, shape=[None, 1])
W = tf.get_variable(shape=[x_vals.shape[1], 1], name="W",
                    initializer=xavier_initializer())
b = tf.get_variable(shape=[1, 1], name="b", initializer=xavier_initializer())
# Decision function X.W - b
output = tf.matmul(X_data, W) - b
# NOTE(review): named l2_norm but computed as MSE against the targets —
# the formula below suggests ||X.W - b||^2 was intended; confirm.
l2_norm = mean_squared_error(output, y_target)
# -
# $$ Loss = \max(0, 1 - \hat{y(i)} \cdot y(i)) + \alpha ||X \cdot W - b||^2 $$
loss = tf.reduce_mean(tf.maximum(0., 1. - output * y_target)) + 0.01 * l2_norm
optimizer = AdamOptimizer(0.01).minimize(loss)
# +
batch_size = 1024
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # 20k minibatch steps on randomly sampled batches.
    for i in range(20000):
        rand_index = np.random.choice(len(X_train), size=batch_size)
        rand_x = X_train[rand_index]
        rand_y = np.transpose([y_train[rand_index]])
        sess.run(optimizer, feed_dict={X_data: rand_x, y_target: rand_y})
    # Extract the learned parameters as Python floats.
    [[a1], [a2]] = sess.run(W)
    # NOTE: rebinds `b` from the tf.Variable to its scalar value.
    [[b]] = sess.run(b)
# -
noisyLabels.append(labels) ########################## ## BUILD the Neural net ## ########################## model = [] for netNum in range(0, 2): #for each net with hidden neuron numbers 5, 15, ... 25 model.append(keras.Sequential()) for i in range(0, len(options.layerSizes)): #add layers to the net model[netNum].add( keras.layers.Dense(options.layerSizes[i], activation="sigmoid", kernel_initializer=options.initializer)) model[netNum].compile(optimizer=AdamOptimizer(0.001), loss='categorical_crossentropy', metrics=['accuracy']) # ### TRAIN ### #net trained on only clean model[0].fit(cleanData, labels, epochs=500, batch_size=5, verbose=1) # net trained on both types for trainingCycles in range(0, 500): if (trainingCycles % 10 == 0): print("Training noise set, cycle: ", trainingCycles, "/500") noisyData = [] for i in range(0, 7): # noise levels {0.0, 0.5, 1.0 ... 3.0} noisyData.append(helper.makeNoisy(cleanData, i / 2)) for i in range(0, 7): ## for all noisy and clean data
def build_headnet(N, features, embedding_dim, num_negative_samples,
                  num_hidden=128, identity_variance=False):
    """Build the HEADNet embedder and its trainable wrapper.

    Maps nodes (one-hot ids, or attribute vectors when `features` is given)
    to a hyperboloid embedding plus a per-dimension variance, and wraps the
    embedder in a model trained with an asymmetric hyperbolic KL loss over
    1 + num_negative_samples sample pairs.
    Returns (embedder_model, trainable_model).
    NOTE(review): `reg` is a module-level constant not visible here.
    """
    if features is not None:
        # HEADNet with attributes
        print("training using node attributes")
        input_layer = Input((features.shape[1], ),
                            name="attributed_input_layer")
        input_transform = Dense(
            num_hidden,
            # activation="relu",
            # kernel_initializer=initializer,
            kernel_regularizer=regularizers.l2(reg),
            bias_regularizer=regularizers.l2(reg),
            name="euclidean_transform",
        )(input_layer)
    else:
        print("training without using attributes")
        input_layer = Input((1, ), name="unattributed_input_layer")
        input_transform = Embedding(N, num_hidden)(input_layer)
        # NOTE(review): in the collapsed source it is ambiguous whether this
        # ReLU applies to both branches or only the embedding branch; it is
        # kept in the embedding branch here — confirm against upstream repo.
        input_transform = Activation("relu")(input_transform)
    # Project the hidden representation to the tangent space at the origin...
    hyperboloid_embedding_layer = Dense(
        embedding_dim,
        # kernel_initializer=initializer,
        kernel_regularizer=regularizers.l2(reg),
        bias_regularizer=regularizers.l2(reg),
        name="dense_to_hyperboloid",
    )(input_transform)
    # ...then exponential-map onto the hyperboloid.
    to_hyperboloid = Lambda(exp_map_0,
                            name="to_hyperboloid")(hyperboloid_embedding_layer)
    # Positive per-dimension variance via shifted ELU; frozen (and gradient-
    # stopped) when identity_variance is requested.
    sigma_layer = Dense(
        embedding_dim,
        activation=lambda x: K.elu(x) + 1.,
        kernel_initializer="zeros",
        kernel_regularizer=regularizers.l2(reg),
        bias_regularizer=regularizers.l2(reg),
        name="dense_to_sigma",
        trainable=not identity_variance,
    )(input_transform)
    if identity_variance:
        sigma_layer = Lambda(K.stop_gradient,
                             name="variance_stop_gradient")(sigma_layer)
    embedder_model = Model(input_layer, [to_hyperboloid, sigma_layer],
                           name="embedder_model")
    # Trainable wrapper consumes (1 + num_negative_samples) pairs per example.
    if features is not None:
        trainable_input = Input((
            1 + num_negative_samples,
            2,
            features.shape[1],
        ), name="trainable_input_attributed")
    else:
        trainable_input = Input((
            1 + num_negative_samples,
            2,
        ), name="trainable_input_non_attributed")
    mus, sigmas = embedder_model(trainable_input)
    assert len(mus.shape) == len(sigmas.shape) == 4
    # Move means back to the tangent space at mu=0 before computing KL.
    mus = Lambda(map_to_tangent_space_mu_zero,
                 name="to_tangent_space_mu_zero")(mus)
    kds = Lambda(kullback_leibler_divergence,
                 name="kullback_leibler_layer")([mus, sigmas])
    trainable_model = Model(trainable_input, kds, name="trainable_model")
    optimizer = AdamOptimizer(1e-3, )
    trainable_model.compile(optimizer=optimizer,
                            loss=asym_hyperbolic_loss,
                            target_tensors=[
                                tf.placeholder(dtype=tf.int64,
                                               shape=(None, 1)),
                            ])
    return embedder_model, trainable_model
def build(self, word_length, num_labels, num_intent_labels, word_vocab_size,
          char_vocab_size, word_emb_dims=100, char_emb_dims=30,
          char_lstm_dims=30, tagger_lstm_dims=100, dropout=0.2):
    """Build a joint intent-classification / slot-tagging model.

    Word embeddings feed a BiLSTM whose final states classify the intent;
    its output sequence, concatenated with char-BiLSTM features, feeds a
    second BiLSTM and a CRF for per-token slot labels. Stores the compiled
    model on self.model.
    """
    self.word_length = word_length
    self.num_labels = num_labels
    self.num_intent_labels = num_intent_labels
    self.word_vocab_size = word_vocab_size
    self.char_vocab_size = char_vocab_size
    # Word-level input and embeddings.
    words_input = Input(shape=(None, ), name='words_input')
    embedding_layer = Embedding(word_vocab_size, word_emb_dims,
                                name='word_embedding')
    word_embeddings = embedding_layer(words_input)
    word_embeddings = Dropout(dropout)(word_embeddings)
    # Character-level input: per-word char BiLSTM features.
    word_chars_input = Input(shape=(None, word_length),
                             name='word_chars_input')
    char_embedding_layer = Embedding(char_vocab_size, char_emb_dims,
                                     input_length=word_length,
                                     name='char_embedding')
    char_embeddings = char_embedding_layer(word_chars_input)
    char_embeddings = TimeDistributed(Bidirectional(
        LSTM(char_lstm_dims)))(char_embeddings)
    char_embeddings = Dropout(dropout)(char_embeddings)
    # first BiLSTM layer (used for intent classification)
    first_bilstm_layer = Bidirectional(
        LSTM(tagger_lstm_dims, return_sequences=True, return_state=True))
    first_lstm_out = first_bilstm_layer(word_embeddings)
    lstm_y_sequence = first_lstm_out[:1][
        0]  # save y states of the LSTM layer
    states = first_lstm_out[1:]
    # Last hidden states of the forward and backward LSTMs (cell states
    # discarded) form the sentence representation for intent classification.
    hf, _, hb, _ = states  # extract last hidden states
    h_state = concatenate([hf, hb], axis=-1)
    intents = Dense(num_intent_labels,
                    activation='softmax',
                    name='intent_classifier_output')(h_state)
    # create the 2nd feature vectors: word-BiLSTM outputs + char features
    combined_features = concatenate([lstm_y_sequence, char_embeddings],
                                    axis=-1)
    # 2nd BiLSTM layer (used for entity/slots classification)
    second_bilstm_layer = Bidirectional(
        LSTM(tagger_lstm_dims, return_sequences=True))(combined_features)
    second_bilstm_layer = Dropout(dropout)(second_bilstm_layer)
    bilstm_out = Dense(num_labels)(second_bilstm_layer)
    # feed BiLSTM vectors into CRF for structured slot prediction
    crf = CRF(num_labels, name='intent_slot_crf')
    entities = crf(bilstm_out)
    model = Model(inputs=[words_input, word_chars_input],
                  outputs=[intents, entities])
    # Per-head losses/metrics: softmax CE for intents, CRF loss for slots.
    loss_f = {
        'intent_classifier_output': 'categorical_crossentropy',
        'intent_slot_crf': crf.loss
    }
    metrics = {
        'intent_classifier_output': 'categorical_accuracy',
        'intent_slot_crf': crf.viterbi_accuracy
    }
    model.compile(loss=loss_f, optimizer=AdamOptimizer(), metrics=metrics)
    self.model = model
(x0**3) - 60 * (x0**2) - 4 * x0 + 6 # # Gradient Descent # # $$ f(x)=x^3-60x^2-4x+6 $$ import tensorflow as tf from tensorflow.train import AdamOptimizer start = time() x = tf.get_variable('x', initializer=tf.constant(100.0)) y = x * x * x - 60 * x * x - 4 * x + 6 # + optimizer = AdamOptimizer(learning_rate=1e-2).minimize(y) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) for _ in range(50000): sess.run(optimizer) print(sess.run(x)) # - print("Gradient Descent for y is about : %0.2f seconds" % (time() - start)) # # Binary Search # # similiar to Newton method from [0, 100]. Somehow simple, just skip it # # MCTS
def neural_transfer(content_image, style_image, output_dirpath, epochs=1000,
                    epoch_length=100, alpha=1, beta=10):
    """ Main function to execute neural transfer algorithm using tensorflow
    eager execution.

    content_image, style_image -- input images (assumed scaled to [0, 1] —
    TODO confirm; the output write multiplies by 255).
    output_dirpath -- directory receiving one PNG per epoch.
    alpha, beta -- content/style loss weights passed to calc_total_loss.
    """
    tf.enable_eager_execution()
    optimizer = AdamOptimizer(learning_rate=0.003)
    # Layers for loss calculations
    content_layers = ['block4_conv2']
    style_layers = [
        'block1_conv1', 'block2_conv1', 'block3_conv1', 'block4_conv1',
        'block5_conv1'
    ]
    model = init_model(content_layers, style_layers)
    # Get target featuremaps tensors from the content and style images
    content_featuremaps = model(np.expand_dims(content_image,
                                               axis=0))[:len(content_layers)]
    style_featuremaps = model(np.expand_dims(style_image,
                                             axis=0))[len(content_layers):]
    # Starting point of combination image: random noise.
    image_zero = np.expand_dims(np.random.random(np.shape(content_image)),
                                axis=0)
    combined_image_tensor = tf.Variable(image_zero,
                                        name='combined_image_tensor',
                                        dtype=tf.float32)
    for epoch in range(epochs):
        print('\nEpoch: ', epoch)
        # Convert tensor to array then save image to output directory for
        # viewing (written at the *start* of each epoch).
        combined_image = np.squeeze(combined_image_tensor.numpy(), axis=0)
        output_filepath = os.path.join(output_dirpath,
                                       'epoch_{}.png'.format(epoch))
        cv2.imwrite(output_filepath, combined_image * 255)
        content_losses_array_avg = np.zeros(len(content_layers),
                                            dtype=np.float32)
        style_losses_array_avg = np.zeros(len(style_layers), dtype=np.float32)
        for _ in tqdm(range(epoch_length)):
            # Operations here are recorded to "GradientTape" for
            # backpropagation into the combination image.
            with tf.GradientTape() as tape:
                combination_featuremaps = model(combined_image_tensor)
                total_loss, content_losses, style_losses = calc_total_loss(
                    content_featuremaps, style_featuremaps,
                    combination_featuremaps, alpha, beta)
            gradients = tape.gradient(total_loss, combined_image_tensor)
            optimizer.apply_gradients([[gradients, combined_image_tensor]])
            # Ensure output image/tensor is bounded between 0 and 1
            clipped = tf.clip_by_value(combined_image_tensor,
                                       clip_value_min=0,
                                       clip_value_max=1)
            combined_image_tensor.assign(clipped)
            # Record the average losses for the epoch
            content_losses_array_avg += content_losses / epoch_length
            style_losses_array_avg += style_losses / epoch_length
        # Display individual losses for analysis
        print('Content loss: ', content_losses_array_avg)
        print('Style loss: ', style_losses_array_avg)
        print(
            'Total loss: ',
            np.sum(style_losses_array_avg) + np.sum(content_losses_array_avg))
# MNIST training script: flatten/normalize images, build tf.data pipelines,
# train an eager-mode model and show one prediction.
num_classes = 10
batch_size = 32
epochs = 10
(x_train, y_train), (x_test, y_test) = load_data()
# Flatten 28x28 images and scale pixels to [0, 1].
x_train = x_train.reshape(60000, 784) / 255
x_test = x_test.reshape(10000, 784) / 255
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)
train_ds = Dataset.from_tensor_slices(
    (x_train, y_train)).shuffle(60000).batch(batch_size)
test_ds = Dataset.from_tensor_slices(
    (x_test, y_test)).shuffle(10000).batch(batch_size)
# NOTE(review): this optimizer is never passed to train() below — confirm
# whether train() constructs its own or this was meant to be forwarded.
optimizer = AdamOptimizer()
model = Net()
# NOTE(review): trains for 2 epochs, ignoring `epochs = 10` above — confirm.
train(model, train_ds, epochs=2)
test(model, test_ds)
class_name = [str(i) for i in range(num_classes)]
# Show the model's prediction for one test image.
x, y = iter(test_ds).next()
pred = predict(model, x, class_name)
plt.imshow(x[0].numpy().reshape(28, 28))
plt.title(pred[0])
plt.show()
def train(dataset_path, checkpoint_path, logdir, batch_size, epochs):
    """Train a variational autoencoder on 8000-sample audio records.

    dataset_path -- AudioNet TFRecord dataset location.
    checkpoint_path, logdir -- created if missing; receive weights/TB logs.
    """
    ''' Load the data '''
    tf.enable_eager_execution()
    dataset = load_audionet_dataset(dataset_path)

    def make_tuple(record):
        # Autoencoder targets: (input, input) pairs of flattened audio.
        return (tf.reshape(record['data'], (8000,)),
                tf.reshape(record['data'], (8000,)))

    ''' Split the dataset '''
    train_dataset = dataset.filter(split('digit', 'train')) \
        .map(make_tuple) \
        .shuffle(18000, seed=42) \
        .batch(batch_size) \
        .repeat()
    # 500 recordings per speaker/digit combination — presumably; confirm.
    train_nb_samples = len(splits['digit']['train'][0]) * 500
    test_dataset = dataset.filter(split('digit', 'test')) \
        .map(make_tuple) \
        .shuffle(10000, seed=42) \
        .batch(batch_size)
    test_nb_samples = len(splits['digit']['test'][0]) * 500
    ''' Neural Net model '''
    x = Input(shape=(8000,))
    latent_dim = 500
    intermediate_dim = 2000
    original_dim = 8000
    # Encoder: one hidden layer to mean and log-variance of the latent code.
    h = Dense(intermediate_dim, activation='relu')(x)
    z_mean = Dense(latent_dim, activation='linear')(h)
    z_log_sigma = Dense(latent_dim, activation='linear', \
                        kernel_initializer='zeros', \
                        bias_initializer='zeros')(h)

    def sampling(args):
        # Reparameterization trick: z = mu + exp(log_sigma / 2) * eps.
        z_mean, z_log_sigma = args
        epsilon = K.random_normal(shape=(batch_size, latent_dim))
        return z_mean + K.exp(z_log_sigma / 2) * epsilon

    z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_sigma])
    # Decoder layers are shared between the VAE and the stand-alone generator.
    decoder_h = Dense(intermediate_dim, activation='relu')
    decoder_mean = Dense(original_dim, activation='sigmoid')
    h_decoded = decoder_h(z)
    x_decoded_mean = decoder_mean(h_decoded)
    # end-to-end autoencoder
    vae = Model(x, x_decoded_mean)
    # encoder, from inputs to latent space
    encoder = Model(x, z_mean)
    # generator, from latent space to reconstructed inputs
    decoder_input = Input(shape=(latent_dim,))
    _h_decoded = decoder_h(decoder_input)
    _x_decoded_mean = decoder_mean(_h_decoded)
    generator = Model(decoder_input, _x_decoded_mean)

    def vae_loss(x, x_decoded_mean):
        # Reconstruction (binary cross-entropy) + KL divergence to N(0, I).
        xent_loss = K.mean(K.binary_crossentropy(x, x_decoded_mean), axis=1)
        kl_loss = - 0.5 * K.mean(
            1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma), axis=1)
        return xent_loss + kl_loss

    adam = AdamOptimizer(learning_rate=0.001)
    vae.compile(optimizer=adam, loss=vae_loss, metrics=['accuracy'])
    ''' Callbacks '''
    if not os.path.isdir(logdir):
        os.mkdir(logdir)
    tb_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir,
                                                 batch_size=batch_size)
    if not os.path.isdir(checkpoint_path):
        os.mkdir(checkpoint_path)
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(checkpoint_path, "model.{epoch:02d}-{val_acc:.2f}"),
        save_weights_only=True)
    # NOTE(review): gc_callback is built but not in the callbacks list below —
    # confirm whether per-batch garbage collection was meant to be enabled.
    gc_callback = tf.keras.callbacks.LambdaCallback(
        on_batch_end=lambda batch, _: gc.collect())
    ''' Fit the model '''
    vae.fit(train_dataset, \
            epochs=epochs, \
            steps_per_epoch=math.ceil(train_nb_samples / batch_size), \
            batch_size=batch_size, \
            shuffle=True, \
            validation_data=test_dataset, \
            validation_steps=math.ceil(test_nb_samples / batch_size), \
            callbacks=[tb_callback, checkpoint_callback])
class ExponentialMappingOptimizer(optimizer.Optimizer):
    """Optimizer that updates hyperbolic variables with an exponential map.

    Variables whose name contains ``"hyperbolic"`` are treated as points
    on a manifold: the last coordinate of the gradient is negated
    (presumably the Minkowski-metric sign flip for a hyperboloid model
    -- confirm against `project_onto_tangent_space`), the result is
    projected onto the tangent space at the current point, and the
    variable is moved along the manifold via `exponential_mapping`.
    All other variables fall back to a stock `AdamOptimizer`.
    """

    def __init__(self, lr=0.1, use_locking=False, name="ExponentialMappingOptimizer"):
        # lr applies only to the manifold (exponential-map) updates; the
        # Euclidean fallback uses AdamOptimizer's own default rate.
        super(ExponentialMappingOptimizer, self).__init__(use_locking, name)
        self.lr = lr
        self.euclidean_optimizer = AdamOptimizer()

    def _apply_dense(self, grad, var):
        # Dense gradients are not expected here; the assert makes any
        # dense-path use fail fast. The unreachable code below mirrors
        # the sparse branch and appears to be kept for reference.
        # NOTE(review): `assert` is stripped under `python -O`.
        assert False
        spacial_grad = grad[..., :-1]
        t_grad = -1 * grad[..., -1:]  # negate the last (time-like) coord
        ambient_grad = tf.concat([spacial_grad, t_grad], axis=-1)
        tangent_grad = project_onto_tangent_space(var, ambient_grad)
        exp_map = exponential_mapping(var, -self.lr * tangent_grad)
        return tf.assign(var, exp_map)

    def _apply_sparse(self, grad, var):
        # Sparse (IndexedSlices) update, e.g. from embedding lookups:
        # only the rows actually touched by the gradient are moved.
        if "hyperbolic" in var.name:
            indices = grad.indices
            values = grad.values
            p = tf.gather(var, indices, name="gather_apply_sparse")
            spacial_grad = values[..., :-1]
            t_grad = -1 * values[..., -1:]  # same sign flip as dense path
            ambient_grad = K.concatenate(
                [spacial_grad, t_grad],
                axis=-1,
            )
            tangent_grad = project_onto_tangent_space(p, ambient_grad)
            exp_map = exponential_mapping(p, -self.lr * tangent_grad)
            # Write the mapped points back into the touched rows only.
            return tf.scatter_update(ref=var,
                                     indices=indices,
                                     updates=exp_map,
                                     name="scatter_update")
        else:
            # euclidean update using Adam optimizer
            return self.euclidean_optimizer.apply_gradients([
                (grad, var),
            ])


# Commented-out copy of tf.train.AdamOptimizer kept for reference;
# it continues through the following comment block.
# class MyAdamOptimizer(optimizer.Optimizer):
#     """Optimizer that implements the Adam algorithm.
#     See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
#     ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
#     """
#     def __init__(self,
#                  learning_rate=1e-3,
#                  beta1=0.9,
#                  beta2=0.999,
#                  epsilon=1e-8,
#                  use_locking=False,
#                  name="Adam"):
#         r"""Construct a new Adam optimizer.
# Initialization: # $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$ # $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$ # $$t := 0 \text{(Initialize timestep)}$$ # The update rule for `variable` with gradient `g` uses an optimization # described at the end of section 2 of the paper: # $$t := t + 1$$ # $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$ # $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$ # $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$ # $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$ # The default value of 1e-8 for epsilon might not be a good default in # general. For example, when training an Inception network on ImageNet a # current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the # formulation just before Section 2.1 of the Kingma and Ba paper rather than # the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon # hat" in the paper. # The sparse implementation of this algorithm (used when the gradient is an # IndexedSlices object, typically because of `tf.gather` or an embedding # lookup in the forward pass) does apply momentum to variable slices even if # they were not used in the forward pass (meaning they have a gradient equal # to zero). Momentum decay (beta1) is also applied to the entire momentum # accumulator. This means that the sparse behavior is equivalent to the dense # behavior (in contrast to some momentum implementations which ignore momentum # unless a variable slice was actually used). # Args: # learning_rate: A Tensor or a floating point value. The learning rate. # beta1: A float value or a constant float tensor. The exponential decay # rate for the 1st moment estimates. # beta2: A float value or a constant float tensor. The exponential decay # rate for the 2nd moment estimates. # epsilon: A small constant for numerical stability. 
This epsilon is # "epsilon hat" in the Kingma and Ba paper (in the formula just before # Section 2.1), not the epsilon in Algorithm 1 of the paper. # use_locking: If True use locks for update operations. # name: Optional name for the operations created when applying gradients. # Defaults to "Adam". @compatibility(eager) When eager execution is # enabled, `learning_rate`, `beta1`, `beta2`, and `epsilon` can each be a # callable that takes no arguments and returns the actual value to use. # This can be useful for changing these values across different # invocations of optimizer functions. @end_compatibility # """ # super(MyAdamOptimizer, self).__init__(use_locking, name) # self._lr = learning_rate # self._beta1 = beta1 # self._beta2 = beta2 # self._epsilon = epsilon # # Tensor versions of the constructor arguments, created in _prepare(). # self._lr_t = None # self._beta1_t = None # self._beta2_t = None # self._epsilon_t = None # def _get_beta_accumulators(self): # with ops.init_scope(): # if context.executing_eagerly(): # graph = None # else: # graph = ops.get_default_graph() # return (self._get_non_slot_variable("beta1_power", graph=graph), # self._get_non_slot_variable("beta2_power", graph=graph)) # def _create_slots(self, var_list): # # Create the beta1 and beta2 accumulators on the same device as the first # # variable. Sort the var_list to make sure this device is consistent across # # workers (these need to go on the same PS, otherwise some updates are # # silently ignored). # first_var = min(var_list, key=lambda x: x.name) # self._create_non_slot_variable( # initial_value=self._beta1, name="beta1_power", colocate_with=first_var) # self._create_non_slot_variable( # initial_value=self._beta2, name="beta2_power", colocate_with=first_var) # # Create slots for the first and second moments. 
# for v in var_list: # self._zeros_slot(v, "m", self._name) # self._zeros_slot(v, "v", self._name) # def _prepare(self): # lr = self._call_if_callable(self._lr) # beta1 = self._call_if_callable(self._beta1) # beta2 = self._call_if_callable(self._beta2) # epsilon = self._call_if_callable(self._epsilon) # self._lr_t = ops.convert_to_tensor(lr, name="learning_rate") # self._beta1_t = ops.convert_to_tensor(beta1, name="beta1") # self._beta2_t = ops.convert_to_tensor(beta2, name="beta2") # self._epsilon_t = ops.convert_to_tensor(epsilon, name="epsilon") # def _apply_dense(self, grad, var): # assert False # m = self.get_slot(var, "m") # v = self.get_slot(var, "v") # beta1_power, beta2_power = self._get_beta_accumulators() # return training_ops.apply_adam( # var, # m, # v, # math_ops.cast(beta1_power, var.dtype.base_dtype), # math_ops.cast(beta2_power, var.dtype.base_dtype), # math_ops.cast(self._lr_t, var.dtype.base_dtype), # math_ops.cast(self._beta1_t, var.dtype.base_dtype), # math_ops.cast(self._beta2_t, var.dtype.base_dtype), # math_ops.cast(self._epsilon_t, var.dtype.base_dtype), # grad, # use_locking=self._use_locking).op # def _resource_apply_dense(self, grad, var): # assert False # m = self.get_slot(var, "m") # v = self.get_slot(var, "v") # beta1_power, beta2_power = self._get_beta_accumulators() # return training_ops.resource_apply_adam( # var.handle, # m.handle, # v.handle, # math_ops.cast(beta1_power, grad.dtype.base_dtype), # math_ops.cast(beta2_power, grad.dtype.base_dtype), # math_ops.cast(self._lr_t, grad.dtype.base_dtype), # math_ops.cast(self._beta1_t, grad.dtype.base_dtype), # math_ops.cast(self._beta2_t, grad.dtype.base_dtype), # math_ops.cast(self._epsilon_t, grad.dtype.base_dtype), # grad, # use_locking=self._use_locking) # def _apply_sparse_shared(self, grad, var, indices, # scatter_add): # beta1_power, beta2_power = self._get_beta_accumulators() # beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype) # beta2_power = 
math_ops.cast(beta2_power, var.dtype.base_dtype) # lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) # beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) # beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) # epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) # lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)) # grad = tf.verify_tensor_all_finite(grad, "fail in grad") # # if "hyperbolic" in var.name: # # grad = K.concatenate([grad[:,:-1], -grad[:,-1:]], # # axis=-1) # # m_t = beta1 * m + (1 - beta1) * g_t # m = self.get_slot(var, "m") # m_scaled_g_values = grad * (1 - beta1_t) # m_t = state_ops.assign(m, m * beta1_t, # use_locking=self._use_locking) # with ops.control_dependencies([m_t]): # m_t = scatter_add(m, indices, m_scaled_g_values) # # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) # v = self.get_slot(var, "v") # v_scaled_g_values = (grad * grad) * (1 - beta2_t) # v_t = state_ops.assign(v, v * beta2_t, # use_locking=self._use_locking) # with ops.control_dependencies([v_t]): # v_t = scatter_add(v, indices, v_scaled_g_values) # v_sqrt = math_ops.sqrt(K.maximum(v_t, 0.)) # if "hyperbolic" in var.name: # m_t = tf.verify_tensor_all_finite(m_t, "fail in m_t") # v_sqrt = tf.verify_tensor_all_finite(v_sqrt, # "fail in v_sqrt") # gr = m_t / (v_sqrt + epsilon_t) # gr = tf.verify_tensor_all_finite(gr, "fail in gr") # gr = K.concatenate( # [gr[...,:-1], -gr[...,-1:]], # axis=-1) # gr_tangent = project_onto_tangent_space(var, gr) # gr_tangent = tf.verify_tensor_all_finite(gr_tangent, # "fail in tangent") # exp_map = exponential_mapping(var, -lr * gr_tangent) # exp_map = tf.verify_tensor_all_finite(exp_map, # "fail in exp_map") # var_update = state_ops.assign( # var, # exp_map, # use_locking=self._use_locking) # else: # var_update = state_ops.assign_sub( # var, # lr * m_t / (v_sqrt + epsilon_t), # use_locking=self._use_locking) # return control_flow_ops.group(*[var_update, m_t, v_t]) # def _apply_sparse(self, grad, var): 
# return self._apply_sparse_shared( # grad.values, # var, # grad.indices, # lambda x, i, v: state_ops.scatter_add( # x, # i, # v, # use_locking=self._use_locking)) # def _resource_scatter_add(self, x, i, v): # with ops.control_dependencies( # [resource_variable_ops.resource_scatter_add(x.handle, i, v)]): # return x.value() # def _resource_apply_sparse(self, grad, var, indices): # return self._apply_sparse_shared(grad, var, indices, # self._resource_scatter_add) # def _finish(self, update_ops, name_scope): # # Update the power accumulators. # with ops.control_dependencies(update_ops): # beta1_power, beta2_power = self._get_beta_accumulators() # with ops.colocate_with(beta1_power): # update_beta1 = beta1_power.assign( # beta1_power * self._beta1_t, use_locking=self._use_locking) # update_beta2 = beta2_power.assign( # beta2_power * self._beta2_t, use_locking=self._use_locking) # return control_flow_ops.group( # *update_ops + [update_beta1, update_beta2], name=name_scope)
# One positively-constrained Gaussian noise parameter per output.
# NOTE(review): assumes N is the number of outputs/tasks -- confirm
# where N is defined earlier in the file.
likelihoods = [Param(0.01, transforms.positive,
                     name="gaussian_noise_{}".format(n))
               for n in range(N)]
model = LKM(data, additive_kernels, likelihoods)

# GP hyperparameters live in their own variable scope; everything else
# (IBP variational parameters etc.) is collected by set difference and
# trained separately.
gp_train_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                  scope="gp_hyperparameters")
ibp_train_vars = list(set(tf.global_variables()) - set(gp_train_vars))

update_tau = model.closed_form_update_tau()  # closed-form tau update op
elbo = model.build_marginal_loglikelihood()
z, nll_gp_refined = model.refine()
# NOTE(review): `K` here shadows the Keras backend alias `K` used by
# other code in this file -- consider renaming.
t_test, K, K_star, K_star_star, noise = model.prepare_for_postprocess()

# train IBP parameters with Adam
adam = AdamOptimizer(0.01)
# train_ibp = adam.minimize(-elbo, var_list=ibp_train_vars)
train_ibp = adam.minimize(-elbo, var_list=ibp_train_vars)
# GP hyperparameters are optimised with SciPy's L-BFGS-B: a short run
# on the ELBO here, a longer run on the refined objective below.
train_gp = ScipyOptimizerInterface(-elbo,
                                   var_list=gp_train_vars,
                                   method='L-BFGS-B',
                                   options={"maxiter": 10})
# refined train
train_gp_refine = ScipyOptimizerInterface(nll_gp_refined,
                                          var_list=gp_train_vars,
                                          method='L-BFGS-B',
                                          options={"maxiter": 300}
                                          )