def get_optimizer(network_config,
                  default_optimizer=train.AdadeltaOptimizer(learning_rate=1.0)):
    """
    Return the optimizer given by the input network configuration, or a default optimizer.
    :param network_config: network configuration
    :param default_optimizer: default optimization algorithm
    :return: configured optimizer
    """
    try:
        optimizer = network_config.optimizer
    except KeyError:
        logging.info("Using Adadelta as default optimizer.")
        return default_optimizer

    if isinstance(optimizer.lr, numbers.Number):
        lr = optimizer.lr
    else:
        optimizer.lr.num_train_steps = network_config.max_steps
        optimizer.lr.steps_per_epoch = network_config.steps_per_epoch
        lr = get_learning_rate(optimizer.lr, train.get_global_step())

    name = optimizer.name
    params = optimizer.params
    if "Adadelta" == name:
        opt = train.AdadeltaOptimizer(lr, **params)
    elif "Adam" == name:
        opt = train.AdamOptimizer(lr, **params)
    elif "LazyAdam" == name:
        opt = LazyAdamOptimizer(lr, **params)
    elif "LazyNadam" == name:
        opt = LazyNadamOptimizer(lr, **params)
    elif "SGD" == name:
        opt = train.GradientDescentOptimizer(lr)
    elif "Momentum" == name:
        opt = train.MomentumOptimizer(lr, **params)
    elif "Nadam" == name:
        opt = NadamOptimizerSparse(lr, **params)
    elif "bert" == name:
        opt = AdamWeightDecayOptimizer(
            lr,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
    else:
        raise ValueError("Invalid optimizer name: {}".format(name))
    return opt
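# Usage sketch (not from the original project): the real `network_config` is a
# project-specific configuration object, so a SimpleNamespace stands in here just to
# exercise the "LazyAdam" branch of get_optimizer above. All field values are
# illustrative assumptions.
from types import SimpleNamespace

_lazy_adam_config = SimpleNamespace(
    optimizer=SimpleNamespace(
        name="LazyAdam",
        lr=1e-3,  # a plain number, so get_learning_rate() is bypassed
        params={"beta1": 0.9, "beta2": 0.999, "epsilon": 1e-8}),
    max_steps=100000,
    steps_per_epoch=1000)

opt = get_optimizer(_lazy_adam_config)  # -> LazyAdamOptimizer(1e-3, beta1=0.9, ...)
# train_op = opt.minimize(loss, global_step=train.get_global_step())  # `loss` defined elsewhere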
def get_opt(name, learning_rate, decay_steps=None):
    if name == 'momentum':
        optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
    elif name == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate, beta2=0.98, epsilon=1e-9)
    elif name == 'sgd':
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    elif name == 'rms':
        optimizer = tf.train.RMSPropOptimizer(learning_rate)
    elif name == 'adagrad':
        optimizer = tf.train.AdagradOptimizer(learning_rate)
    elif name == 'lazyadam':
        optimizer = LazyAdamOptimizer(learning_rate)
    elif name == 'powersign':
        optimizer = PowerSignOptimizer(learning_rate)
    elif name == 'powersign-ld':
        optimizer = PowerSignOptimizer(
            learning_rate, sign_decay_fn=get_linear_decay_fn(decay_steps))
    elif name == 'powersign-cd':
        optimizer = PowerSignOptimizer(
            learning_rate, sign_decay_fn=get_cosine_decay_fn(decay_steps))
    elif name == 'powersign-rd':
        optimizer = PowerSignOptimizer(
            learning_rate, sign_decay_fn=get_restart_decay_fn(decay_steps))
    elif name == 'addsign':
        optimizer = AddSignOptimizer(learning_rate)
    elif name == 'addsign-ld':
        optimizer = AddSignOptimizer(
            learning_rate, sign_decay_fn=get_linear_decay_fn(decay_steps))
    elif name == 'addsign-cd':
        optimizer = AddSignOptimizer(
            learning_rate, sign_decay_fn=get_cosine_decay_fn(decay_steps))
    elif name == 'addsign-rd':
        optimizer = AddSignOptimizer(
            learning_rate, sign_decay_fn=get_restart_decay_fn(decay_steps))
    else:
        optimizer = None
    return optimizer
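# Hedged usage sketch (the names below are assumptions, not from the original file):
# build a decayed learning rate and let get_opt pick LazyAdam, which applies Adam
# updates only to the rows actually touched by sparse gradients (e.g. embedding lookups).
global_step = tf.train.get_or_create_global_step()
decayed_lr = tf.train.exponential_decay(1e-3, global_step,
                                        decay_steps=10000, decay_rate=0.96)
opt = get_opt('lazyadam', decayed_lr)
# train_op = opt.minimize(loss, global_step=global_step)  # `loss` defined elsewhere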
imgs, filenames = prepare_imgs('test_data_4', 'test_data_4_25')

model_name = 'inception_v3'
beta = 1
learning_rate = 0.01
iteration_num = 50000
checkpoint_iter = 50000

sess, graph, mask_var, sig_mask_op, masked_input = build_masking_graph(model_name, 4)
# list_tensors()
cost_op, last_feat_map_op, loss_terms = masking_graph_cost(sig_mask_op)

optimizer = LazyAdamOptimizer(learning_rate)
opt_op = optimizer.minimize(cost_op, var_list=[mask_var])

iter = Iteration(max=iteration_num, log=50, checkpoint=checkpoint_iter)

# tensorboard
# loss_terms_placeholder = tf.placeholder(tf.float32)
# tf.summary.scalar('loss_terms', loss_terms_placeholder)
# writers = tensorboard_writers(experiment_file.save_directory, loss_terms)
# merged_summary = tf.summary.merge_all()

# AM_LOSS_THRESHOLD = 1
MASK_CONVERGENCE_THRESHOLD = 10
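# A minimal sketch of the optimisation loop that would typically follow this setup;
# it is an assumption, not the project's own loop (the original presumably drives this
# through the Iteration helper above). Only the freshly created variables are
# initialised so any pretrained inception_v3 weights loaded by build_masking_graph
# are left untouched.
new_vars = [mask_var] + optimizer.variables()  # mask plus LazyAdam slot/beta variables
sess.run(tf.variables_initializer(new_vars))
for step in range(1, iteration_num + 1):
    _, cost = sess.run([opt_op, cost_op])
    if step % 50 == 0:  # mirrors Iteration(log=50)
        print('step {}: masking cost {:.4f}'.format(step, cost))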
class Self_Basket_Completion_Model(object):
    def __init__(self, model):
        self.model_params = model
        self.train_data, self.test_data, self.X_train, self.Y_train, self.X_test, self.Y_test = \
            list(), list(), list(), list(), list(), list()
        self.LSTM_labels_train, self.LSTM_labels_test = list(), list()
        self.index, self.index_words = 0, 0
        self.neg_sampled = model.neg_sampled
        self.neg_sampled_pretraining = 1 if self.neg_sampled < 1 else self.neg_sampled
        self.training_data, self.test_data = model.training_data, model.test_data
        self.num_epochs, self.batch_size, self.vocabulary_size, self.vocabulary_size2 = \
            model.epoch, model.batch_size, model.vocabulary_size, model.vocabulary_size2
        self.seq_length, self.epoch = model.seq_length, model.epoch
        self.embedding_size, self.embedding_matrix, self.use_pretrained_embeddings = \
            model.embedding_size, model.embedding_matrix, model.use_pretrained_embeddings
        self.adv_generator_loss, self.adv_discriminator_loss = \
            model.adv_generator_loss, model.adv_discriminator_loss
        self.negD = model.negD
        self.discriminator_type = model.D_type
        self.one_guy_sample = np.random.choice(self.vocabulary_size - 1)
        self.dataD = [list(), list(), list(), list(), list(), list(), list()]
        self.Gen_loss1, self.Gen_loss2, self.Disc_loss1, self.Disc_loss2, self.pic_number = 0, 0, 0, 0, 0

    def create_graph(self):
        create_placeholders(self)
        create_discriminator(self, size=1)

        if (self.model_params.model_type == "SS") or (self.model_params.model_type == "BCE"):
            self.d_loss2 = -discriminator_adversarial_loss(self)
            self.disc_optimizer_adv, self.adv_grad = \
                LazyAdamOptimizer(1.5e-3, beta1=0.8, beta2=0.9, epsilon=1e-5), \
                tf.gradients(self.d_loss2, self.d_weights)
            self.d_train_adversarial = self.disc_optimizer_adv.minimize(
                self.d_loss2, var_list=self.d_weights)

        lr = 1e-3
        global_step = tf.Variable(0, trainable=False)
        rate = tf.train.exponential_decay(lr, global_step, 3, 0.9999)
        self.disc_optimizer = LazyAdamOptimizer(lr, beta1=0.8, beta2=0.9, epsilon=1e-5)
        self.d_baseline = self.disc_optimizer.minimize(self.d_loss1,
                                                       var_list=self.d_weights,
                                                       global_step=global_step)
        self.d_softmax, self.d_mle = \
            self.disc_optimizer.minimize(self.softmax_loss,
                                         var_list=self.d_weights,
                                         global_step=global_step), \
            self.disc_optimizer.minimize(-self.mle_lossD,
                                         var_list=self.d_weights,
                                         global_step=global_step)
        self.softmax_grad = tf.gradients(self.softmax_loss, self.d_weights)

    def train_model_with_tensorflow(self):
        self.create_graph()
        self._sess = tf.Session()
        self._sess.run(tf.global_variables_initializer())
        self.options, self.run_metadata = create_options_and_metadata(self)

        step, cont = 0, True
        disc_loss1, disc_loss2 = 0, 0
        timee = time.time()
        while cont:
            try:
                if (self.model_params.model_type == "baseline"):
                    _, disc_loss1 = training_step(self, [self.d_baseline, self.d_loss1])
                elif (self.model_params.model_type == "softmax"):
                    _, disc_loss1 = training_step(self, [self.d_softmax, self.softmax_loss])
                elif (self.model_params.model_type == "MLE"):
                    _, disc_loss1 = training_step(self, [self.d_mle, -self.mle_lossD])
                else:
                    _, disc_loss1, disc_loss2 = training_step(
                        self, [self.d_train_adversarial, self.d_loss1, self.d_loss2])

                self.Disc_loss1, self.Disc_loss2 = (self.Disc_loss1 + disc_loss1,
                                                    self.Disc_loss2 + disc_loss2)

                if (math.isnan(disc_loss1)) or (math.isnan(disc_loss2)):
                    cont = False
                if ((step > self.model_params.min_steps) and
                        (early_stopping(self.dataD[0], 4) or early_stopping(self.dataD[2], 4))) or \
                        (step > self.model_params.max_steps):
                    cont = False

                if (step % self.model_params.printing_step == 0):
                    print(time.time() - timee)
                self.save_data(step)
                testing_step(self, step)
                if (step < 10):
                    create_timeline_object(self)
                if (step % self.model_params.printing_step == 0):
                    timee = time.time()
                step += 1

            except KeyboardInterrupt:
                cont = False

        self.save_data(step)
        tf.reset_default_graph()

    def save_data(self, step):
        if (step % self.model_params.saving_step == 0):
            data = np.array(self.dataD)
            np.save(self.model_params.name, data)
        embedded_a = embedded_attentive_a
        out_embeddings = embedding_tables[out_node_type]["out"]
        # out_embeddings = attentive_out_embeddings  # embedding_tables[out_node_type]["out"]
        embedded_b = tf.nn.embedding_lookup(out_embeddings, b_placeholder)
        embedded_neg_b = tf.nn.embedding_lookup(out_embeddings, neg_b_placeholder)
        # embedded_b = embedded_attentive_b
        # embedded_neg_b = embedded_attentive_neg_b
        meta_losses = build_line_losses(embedded_a, embedded_b, embedded_neg_b,
                                        drop_rate_placeholder) + attention_func.l2_loss() * l2_coe
        # optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(
        #     meta_losses, var_list=attention_func.variables)
        var_list = None if out_node_type == TYPE_LABEL else attention_func.variables
        # optimizer = tf.train.AdamOptimizer(learning_rate=attention_decayed_learning_rate).minimize(
        #     meta_losses, var_list=var_list, global_step=global_step)
        optimizer = LazyAdamOptimizer(learning_rate=attention_learning_rate).minimize(
            meta_losses, var_list=var_list, global_step=global_step)
        # optimizer = tf.train.GradientDescentOptimizer(learning_rate=1e-2).minimize(
        #     meta_losses, var_list=var_list, global_step=global_step)
        # optimizer = tf.train.AdamOptimizer(learning_rate=5e-3).minimize(
        #     meta_losses, var_list=var_list)
        #     # var_list=attention_func.variables + [label_out_embeddings]
        # tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(
        #     meta_losses, var_list=embedding_vars)
        # optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
        #     meta_losses)  # , var_list=attention_func.variables
    else:
        in_node_type, out_node_type = node_types
        in_embeddings = embedding_tables[in_node_type]["in"]
        out_embeddings = embedding_tables[out_node_type]["out"]
        meta_losses = build_line_losses_by_indices(
            in_embeddings, out_embeddings, a_placeholder, b_placeholder,
            neg_b_placeholder, drop_rate_placeholder)  # + tf.nn.l2_loss(in_embeddings) * l2_coe
def estimator_model_fn(features, labels, mode, params):
    """
    Parameters
    ----------
    features : list
        Each entry is a dict sent to one of the towers. Keys are {uid, iid, delta_t,
        seq_lens, user_ids}. Vals are tf.float32/tf.int32 tensors with first dimension
        of size batch_size_for_one_tower.
    labels : list
        Each entry is a tensor sent to one of the towers. The tf.float32 tensor is of
        the shape batch_size_for_one_tower x timesteps.
    mode : tf.estimator.ModeKeys object
        Passed by Estimator - either TRAIN, EVAL or PREDICT
    params : tf.contrib.training.HParams object
        Contains all parameters for the run - extracted from json by
        init_basic_argument_parser

    Returns
    -------
    tf.estimator.EstimatorSpec
        Object containing the built model
    """
    # Hacky fix for model_fn accepting lists as features whereas serving_input_receiver_fn
    # requires a dict. Assumes predictions are served with only one tower
    if type(features) != list:
        features = [features]
        labels = [labels]

    # Flag whether weights are provided as a part of the inputs
    use_weights = "weights" in features[0].keys()

    # tower_features and labels are lists of dicts. Each item in the list goes to one tower,
    # each entry in a dict is a pair in {uid, iid, delta_t, seq_lens, user_ids} and {y} and its batch
    tower_features = features
    tower_labels = labels
    num_gpus = params.num_gpu if mode != tf.estimator.ModeKeys.PREDICT else 0
    # When not 1 GPU then always all results combined on CPU, if 1 GPU then combined on
    # device according to param
    variable_strategy = "CPU" if (params.variable_strategy_CPU
                                  or mode == tf.estimator.ModeKeys.PREDICT) else "GPU"

    # Outputs of all towers
    tower_losses = []
    tower_gradvars = []
    tower_preds = []

    # Devices on which towers are built are either CPU if no GPUs or GPU if any available
    if num_gpus == 0:
        num_devices = 1
        device_type = "cpu"
    else:
        num_devices = num_gpus
        device_type = "gpu"

    # Build towers
    for i in range(num_devices):
        worker_device = "/{}:{}".format(device_type, i)
        # Strategy of instantiating variables on appropriate devices
        if variable_strategy == "CPU":
            device_setter = estimator_utils.local_device_setter(
                worker_device=worker_device)
        elif variable_strategy == "GPU":
            device_setter = estimator_utils.local_device_setter(
                ps_device_type='gpu',
                worker_device=worker_device,
                ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                    num_gpus, tf.contrib.training.byte_size_load_fn))
        # Reuse variables between towers - only init once on the first tower
        with tf.variable_scope("model", reuse=bool(i != 0)):
            with tf.name_scope("tower_%d" % i) as name_scope:
                with tf.device(device_setter):
                    # No labels available for PREDICT
                    tower_labs_or_None = tower_labels[i] if tower_labels else None
                    # Parameters for regularisation
                    regularization = {
                        "user_reg_weight": params.user_reg_weight,
                        "user_related_weights": params.user_related_weights
                    }
                    # Dict of outputs - always tower_predictions, gradvars and loss during training
                    tower_outputs = _tower_fn(
                        features=tower_features[i],
                        labels=tower_labs_or_None,
                        params=params,
                        num_towers=num_devices,
                        variable_strategy=variable_strategy,
                        regularization=regularization,
                        mode=mode)
                    if mode == tf.estimator.ModeKeys.TRAIN:
                        tower_gradvars.append(tower_outputs["gradvars"])
                        tower_losses.append(tower_outputs["tower_loss"])
                    if mode == tf.estimator.ModeKeys.EVAL:
                        tower_losses.append(tower_outputs["tower_loss"])
                        tower_preds.append(tower_outputs["tower_predictions"])
                    if mode == tf.estimator.ModeKeys.PREDICT:
                        tower_preds.append(tower_outputs["tower_predictions"])

    # Combine the outputs on the master node
    consolidation_device = "/gpu:0" if variable_strategy == "GPU" else "/cpu:0"
    with tf.device(consolidation_device):
        if mode != tf.estimator.ModeKeys.TRAIN:
            preds = {
                k: tf.concat([x[k] for x in tower_preds], axis=0)
                for k in tower_preds[0].keys()
            }

        # Combine non-feature inputs from all towers
        with tf.name_scope("merge_tower_inputs"):
            stacked_seq_lens = tf.concat([t["seq_lens"] for t in tower_features], axis=0)
            stacked_batch_user_ids = tf.concat([t["uid"][:, 0] for t in tower_features], axis=0)
            stacked_weights = None
            if use_weights:
                stacked_weights = tf.concat([t["weights"] for t in tower_features], axis=0)

        if mode == tf.estimator.ModeKeys.PREDICT:
            # If only interested in the last prediction (e.g. real recommendation)
            if params.last_prediction_only:
                # For each sequence slice the last real timestep
                # preds = {k: tf.gather_nd(v, stacked_seq_lens-1) for k, v in preds.items()}
                batch_size = tf.shape(stacked_seq_lens)[0]
                slices = tf.concat([
                    tf.expand_dims(tf.range(batch_size), 1),
                    tf.expand_dims(stacked_seq_lens, 1) - 1
                ], axis=1)
                preds = {k: tf.gather_nd(v, slices) for k, v in preds.items()}
            # If want recommendations to be traceable back to specific users
            if params.prediction_include_uid:
                preds = merge_dicts(preds, {"user_ids": stacked_batch_user_ids})
            return tf.estimator.EstimatorSpec(
                mode, predictions=preds,
                export_outputs=None)  # TODO Specify my own outputs

        # Counts of individual user interactions per tower -
        # used to offset effects of differing sequence lens on things like metrics
        with tf.name_scope("total_interactions_count"):
            # If using weights: sequence mask's 0 and weight non-1 values have to be accounted for
            if use_weights:
                sequence_mask = tf.sequence_mask(
                    stacked_seq_lens,
                    params.timesteps,
                    dtype=tf.float32,
                    name="total_interactions_seq_mask")
                total_interactions_op = tf.reduce_sum(
                    tf.multiply(sequence_mask, stacked_weights),
                    name="total_interactions_op_weights")
            else:
                total_interactions_op = tf.reduce_sum(
                    stacked_seq_lens, name="total_interactions_op_no_weights")

        # Combine all labels from all towers
        with tf.name_scope("merge_tower_labels"):
            stacked_labels = tf.concat(labels, axis=0)

        # Calculate total batch loss
        with tf.name_scope("merge_tower_losses"):
            loss = reduce_tower_losses(tower_losses, total_interactions_op)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # Calculate total gradients to apply (scaled by number of interactions in each batch)
        with tf.name_scope('average_gradients'):
            gradvars = average_gradients(tower_gradvars, total_interactions_op)

        with tf.device(consolidation_device):
            # Apply gradients
            with tf.name_scope("apply_gradients"):
                optimizer = LazyAdamOptimizer(params.learning_rate)
                # TODO Check if need params.sync
                train_op = optimizer.apply_gradients(
                    gradvars, global_step=tf.train.get_global_step())
        metrics = None
    else:
        with tf.device(consolidation_device):
            # Create a dict of metric_name: (metric_var, metric_update_op)
            with tf.name_scope("build_metrics"):
                metrics = build_metrics(labels=stacked_labels,
                                        predictions=preds["top_k"],
                                        seq_lens=stacked_seq_lens,
                                        batch_user_ids=stacked_batch_user_ids,
                                        params=params,
                                        input_top_k=True,
                                        weights=stacked_weights)
        train_op = None
        # Due to memory constraints loss is not recorded during evaluation,
        # though it needs to be set to a tensor
        if params.zero_loss:
            with tf.name_scope("zero_loss"):
                loss = tf.constant(0)

    with tf.device(consolidation_device):
        # Count processing speed
        batch_size = (params.train_batch_size if mode == tf.estimator.ModeKeys.TRAIN
                      else params.validation_batch_size)
        training_hooks = [
            estimator_utils.ExamplesPerSecondHook(batch_size, every_n_steps=10)
        ]
    return tf.estimator.EstimatorSpec(mode=mode,
                                      loss=loss,
                                      train_op=train_op,
                                      eval_metric_ops=metrics,
                                      training_chief_hooks=training_hooks)
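# A hedged wiring sketch (not from the source): how estimator_model_fn might be handed
# to a TF 1.x Estimator. Only a representative subset of the HParams fields read by the
# model_fn is shown, and their values, the model_dir path and train_input_fn are
# assumptions.
hparams = tf.contrib.training.HParams(
    num_gpu=1,
    variable_strategy_CPU=True,
    learning_rate=1e-3,
    timesteps=50,
    train_batch_size=128,
    validation_batch_size=128,
    user_reg_weight=0.0,
    user_related_weights=["user_embeddings"],  # hypothetical variable-name filter
    last_prediction_only=False,
    prediction_include_uid=False,
    zero_loss=False)

estimator = tf.estimator.Estimator(model_fn=estimator_model_fn,
                                   model_dir="/tmp/lazyadam_recs",  # hypothetical path
                                   params=hparams)
# estimator.train(input_fn=train_input_fn, max_steps=100000)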