def train_and_evaluate(train_model_spec, eval_model_spec, params):
    """Train the model and evaluate every epoch.

    Args:
        train_model_spec: (dict) contains the graph operations or nodes needed for training
        eval_model_spec: (dict) contains the graph operations or nodes needed for evaluation
        params: (dict) contains hyperparameters of the model. Must define:
            model_dir, eval_model_step, restore_from, num_epochs, train_size, batch_size

    Returns:
        The normalized embedding matrix produced by the final evaluation pass
        (whatever `evaluate_sess` returns for this model spec).
    """
    model_dir = params['model_dir']
    eval_step = params['eval_model_step']
    restore_from = params['restore_from']

    # Initialize tf.Saver instance to save weights during training
    last_saver = tf.train.Saver()  # will keep last 5 epochs
    begin_at_epoch = 0

    with tf.Session() as sess:
        # Initialize model variables
        sess.run(train_model_spec['variable_init_op'])

        # Reload weights from directory if it exists and contains a checkpoint.
        # BUG FIX: the original called os.listdir(restore_from) unguarded, which
        # raises FileNotFoundError when the directory does not exist yet.
        if os.path.isdir(restore_from) and os.listdir(restore_from):
            logging.info("Restoring parameters from {}".format(restore_from))
            restore_from = tf.train.latest_checkpoint(restore_from)
            # Checkpoint files are named "...after-epoch-<N>"; resume from N.
            begin_at_epoch = int(restore_from.split('-')[-1])
            last_saver.restore(sess, restore_from)

        # For tensorboard (takes care of writing summaries to files)
        train_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'train_summaries'), sess.graph)

        # BUG FIX: the original loop was
        #   range(begin_at_epoch, params['num_epochs'] - begin_at_epoch)
        # which shrinks the schedule by 2 * begin_at_epoch after a restore
        # (and can become empty). Train for num_epochs additional epochs
        # starting at the restored epoch, consistent with the other
        # train_and_evaluate variants in this file.
        for epoch in range(begin_at_epoch, begin_at_epoch + params['num_epochs']):
            # Run one epoch
            logging.info("Epoch {}/{}".format(
                epoch + 1, begin_at_epoch + params['num_epochs']))
            # Compute number of batches in one epoch
            # (one full pass over the training set)
            num_steps = int(params['train_size'] / params['batch_size'])
            train_sess(sess, train_model_spec, num_steps, train_writer, params)

            # Save weights
            last_save_path = model_dir + '/last_weights/after-epoch'
            last_saver.save(sess, last_save_path, global_step=epoch + 1)

        # Evaluate for one step on validation set
        normalized_embedding_matrix = evaluate_sess(
            sess, eval_model_spec, eval_step, params)
    return normalized_embedding_matrix
def train_and_evaluation(train_model_spec, eval_model_spec, model_dir, params, restore_from=None):
    """Train the model and evaluate every epoch, keeping the best checkpoint.

    Args:
        train_model_spec: (dict) contains graph operations or nodes needed for training
        eval_model_spec: (dict) contains graph operations or nodes needed for evaluation
        model_dir: (string) path containing the trained model
        params: (Params) hyperparameters; must define num_epochs, train_size, batch_size
        restore_from: (string) dir or file containing weights to restore the graph
    """
    last_saver = tf.train.Saver()  # keeps the most recent checkpoints
    best_saver = tf.train.Saver(max_to_keep=1)  # only keep 1 best checkpoint (best on eval)
    begin_epoch = 0

    # BUG FIX: the original used "with tf.Session as sess" — the class object
    # itself, not an instance. It must be instantiated.
    with tf.Session() as sess:
        sess.run(train_model_spec['variable_init_op'])

        # BUG FIX: "if restore_from not None" is a syntax error -> "is not None".
        if restore_from is not None:
            logging.info('Restore parameter from {}'.format(restore_from))
            if os.path.isdir(restore_from):
                restore_from = tf.train.latest_checkpoint(restore_from)
                begin_epoch = int(restore_from.split('-')[-1])
            last_saver.restore(sess, restore_from)

        # For tensorboard (takes care of writing summaries to files)
        train_writer = tf.summary.FileWriter(os.path.join(model_dir, 'train_summary'), sess.graph)
        eval_writer = tf.summary.FileWriter(os.path.join(model_dir, 'eval_summary'), sess.graph)

        best_eval_acc = 0.0
        for epoch in range(begin_epoch, begin_epoch + params.num_epochs):
            logging.info('Epoch {}/{}'.format(epoch + 1, begin_epoch + params.num_epochs))

            # Number of batches in one full pass over the training set.
            # BUG FIX: "params.batches_size" -> "params.batch_size", consistent
            # with every other train_and_evaluate variant in this file.
            num_steps = (params.train_size + params.batch_size - 1) // params.batch_size

            # BUG FIX: the original never trained — it only evaluated, despite
            # computing train-set steps and creating last_saver. Run one
            # training epoch and checkpoint the latest weights.
            train_sess(sess, train_model_spec, num_steps, train_writer, params)
            last_save_path = os.path.join(model_dir, 'last_weights', 'after_epoch')
            last_saver.save(sess, last_save_path, global_step=epoch + 1)

            # BUG FIX: evaluate_sess was called with undefined "num_step".
            metrics = evaluate_sess(sess, eval_model_spec, num_steps, eval_writer)

            # If this is the best validation accuracy so far, checkpoint it separately.
            eval_acc = metrics['accuracy']
            if eval_acc >= best_eval_acc:
                best_eval_acc = eval_acc
                # Save weights
                # BUG FIX: "model_dit" -> "model_dir" (undefined name).
                best_save_path = os.path.join(model_dir, 'best_weights', 'after_epoch')
                best_save_path = best_saver.save(sess, best_save_path, global_step=epoch + 1)
                logging.info('- Found new best accuracy, saving in {}'.format(best_save_path))
                # save best eval metrics
                best_json_path = os.path.join(model_dir, 'metrics_eval_best_weights.json')
                save_dict_to_json(metrics, best_json_path)

            # Save latest eval metrics in a json file in the model directory.
            # BUG FIX: the original wrote "save_dict_to_json =(metrics, ...)",
            # which assigned a tuple and shadowed the helper instead of calling it.
            last_json_path = os.path.join(model_dir, 'metrics_eval_last_weights.json')
            save_dict_to_json(metrics, last_json_path)
def train_and_evaluate(train_model_spec, eval_model_spec, model_dir, params, restore_from=None):
    """Train the model and evaluate every epoch.

    Args:
        train_model_spec: (dict) contains the graph operations or nodes needed for training
        eval_model_spec: (dict) contains the graph operations or nodes needed for evaluation
        model_dir: (string) directory containing config, weights and log
        params: (Params) contains hyperparameters of the model.
                Must define: num_epochs, train_size, batch_size, eval_size, save_summary_steps
        restore_from: (string) directory or file containing weights to restore the graph
    """
    # Initialize tf.Saver instances to save weights during training
    last_saver = tf.train.Saver()  # will keep last 5 epochs
    best_saver = tf.train.Saver(
        max_to_keep=1)  # only keep 1 best checkpoint (best on eval)
    begin_at_epoch = 0

    with tf.Session() as sess:
        # Initialize model variables
        sess.run(train_model_spec['variable_init_op'])

        # Reload weights from directory if specified
        if restore_from is not None:
            logging.info("Restoring parameters from {}".format(restore_from))
            if os.path.isdir(restore_from):
                restore_from = tf.train.latest_checkpoint(restore_from)
                begin_at_epoch = int(restore_from.split('-')[-1])
            last_saver.restore(sess, restore_from)

        # For tensorboard (takes care of writing summaries to files)
        train_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'train_summaries'), sess.graph)
        eval_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'eval_summaries'), sess.graph)

        # BUG FIX: the original initialized the best score to 0.0 and kept any
        # epoch whose metrics['loss'] was >= the best so far — i.e. it
        # MAXIMIZED the validation loss while logging "Found new best loss".
        # Track the minimum loss instead.
        best_eval_loss = float('inf')
        for epoch in range(begin_at_epoch, begin_at_epoch + params.num_epochs):
            # Run one epoch
            logging.info("Epoch {}/{}".format(
                epoch + 1, begin_at_epoch + params.num_epochs))
            # Compute number of batches in one epoch (one full pass over the training set)
            num_steps = (params.train_size + params.batch_size - 1) // params.batch_size
            train_sess(sess, train_model_spec, num_steps, train_writer, params)

            # Save weights
            last_save_path = os.path.join(model_dir, 'last_weights', 'after-epoch')
            last_saver.save(sess, last_save_path, global_step=epoch + 1)

            # Evaluate for one epoch on validation set
            num_steps = (params.eval_size + params.batch_size - 1) // params.batch_size
            metrics = evaluate_sess(sess, eval_model_spec, num_steps, eval_writer)

            # Keep this checkpoint if its validation loss is the lowest seen so far.
            eval_loss = metrics['loss']
            if eval_loss <= best_eval_loss:
                # Store new best loss
                best_eval_loss = eval_loss
                # Save weights
                best_save_path = os.path.join(model_dir, 'best_weights', 'after-epoch')
                best_save_path = best_saver.save(sess, best_save_path, global_step=epoch + 1)
                logging.info("- Found new best loss, saving in {}".format(
                    best_save_path))
                # Save best eval metrics in a json file in the model directory
                best_json_path = os.path.join(
                    model_dir, "metrics_eval_best_weights.json")
                save_dict_to_json(metrics, best_json_path)

            # Save latest eval metrics in a json file in the model directory
            last_json_path = os.path.join(model_dir, "metrics_eval_last_weights.json")
            save_dict_to_json(metrics, last_json_path)
def train_and_evaluate(train_model_spec, eval_model_spec, model_dir, params, learner_id=0, restore_from=None, global_epoch=1):
    """Train the model and evaluate every epoch.

    Args:
        train_model_spec: (dict) contains the graph operations or nodes needed for training
        eval_model_spec: (dict) contains the graph operations or nodes needed for evaluation
        model_dir: (string) directory containing config, weights and log
        params: (Params) contains hyperparameters of the model.
                Must define: num_epochs, train_size, batch_size, eval_size, save_summary_steps
        restore_from: (string) directory or file containing weights to restore the graph
    """
    # Initialize tf.Saver instances to save weights during training
    last_saver = tf.train.Saver()  # will keep last 5 epochs
    best_saver = tf.train.Saver(
        max_to_keep=1)  # only keep 1 best checkpoint (best on eval)
    begin_at_epoch = 0

    with tf.Session() as sess:
        # Initialize model variables
        # tf.reset_default_graph()
        sess.run(train_model_spec['variable_init_op'])

        # For tensorboard (takes care of writing summaries to files)
        train_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'train_summaries'), sess.graph)
        eval_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'vali_summaries'), sess.graph)
        best_json_path = os.path.join(model_dir, "metrics_eval_best_weights.json")
        # Best metrics tracked as [accuracy, -loss]; compared via isSavingWeights.
        best_eval_metrics = [0.0, -float('inf')]
        # global_epoch = 0

        # Reload weights from directory if specified
        # restor from the previous learner
        if restore_from is not None:
            save_path = os.path.join(model_dir, restore_from)
            if os.path.isdir(save_path):
                save_path = tf.train.latest_checkpoint(save_path)
                # Checkpoint name ends in "-<epoch>"; resume epoch counters from it.
                begin_at_epoch = int(save_path.split('-')[-1])
                global_epoch = begin_at_epoch
            logging.info("Restoring parameters from {}".format(save_path))
            # last_saver = tf.train.import_meta_graph(save_path+".meta")
            # Choose which variable scopes to restore based on the loss function.
            if params.loss_fn == 'retrain_regu_mine':
                pretrained_include = ['model/cnn']
            elif params.loss_fn == 'cnn' and params.finetune:
                pretrained_include = ['model/cnn']
            else:
                # NOTE(review): reconstructed from a collapsed paste — the
                # append is assumed to belong to this else branch (restore both
                # c_cnn and cnn scopes); confirm against the original file.
                pretrained_include = ['model/c_cnn']
                pretrained_include.append('model/cnn')
            # if params.loss_fn=='boost':
            #     pretrained_include = ['model/boost']
            #     for i in range(1, learner_id):
            #         pretrained_include.append('residual_mlp_{}'.format(learner_id))
            pretrained_vars = tf.contrib.framework.get_variables_to_restore(
                include=pretrained_include)
            pretrained_saver = tf.train.Saver(pretrained_vars, name="pretrained_saver")
            pretrained_saver.restore(sess, save_path)
            # if params.num_learners > 1:
            #     best_eval_metrics = load_best_metric(best_json_path)
            #     best_eval_metrics = [best_eval_metrics['accuracy'], -best_eval_metrics['loss']]

        model_summary()

        # for each learner
        early_stopping_count = 0
        for epoch in range(begin_at_epoch, begin_at_epoch + params.num_epochs):
            # Stop once no improvement has been seen for early_stoping_epochs epochs.
            if early_stopping_count == int(params.early_stoping_epochs):
                logging.info("Early stopping at learner {}, epoch {}/{}".format(learner_id, epoch + 1, \
                    begin_at_epoch + params.num_epochs))
                break
            # Run one epoch
            logging.info("Learner {}, Epoch {}/{}".format(learner_id, epoch + 1, \
                begin_at_epoch + params.num_epochs))
            # logging.info(global_epoch)
            # Compute number of batches in one epoch (one full pass over the training set)
            num_steps = (params.train_size + params.batch_size - 1) // params.batch_size
            train_sess(sess, train_model_spec, num_steps, train_writer, params)

            # Save weights
            last_save_path = os.path.join(model_dir, 'last_weights', 'after-epoch')
            # global_epoch = int(params.num_learners) * int(params.num_epochs) + epoch + 1
            last_saver.save(sess, last_save_path, global_step=global_epoch)

            # Evaluate for one epoch on validation set
            num_steps = (params.vali_size + params.batch_size - 1) // params.batch_size
            metrics = evaluate_sess(sess, eval_model_spec, num_steps, eval_writer, params)

            # If best_eval, best_save_path
            # Loss is negated so both entries are "higher is better".
            accuracy_metric = round(metrics['accuracy'], 6)
            loss_metric = -round(metrics['loss'], 6)
            # save_batch()
            eval_metrics = [accuracy_metric, loss_metric]
            # logging.info('global_epoch: {}, best_eval_metrics: {}, \
            #     eval_metric: {}', global_epoch, best_eval_metrics, eval_metric)
            if isSavingWeights(eval_metrics, best_eval_metrics):
                # rest early_stopping_count
                early_stopping_count = 0
                # and isSavingWeights
                best_eval_metrics = eval_metrics
                # Save weights
                # trainalbe_vars = {v.name: v for v in tf.trainable_variables() if 'model' in v.name}
                # print(trainalbe_vars.keys())
                # Copy the cnn weights into the c_cnn scope before checkpointing.
                if params.loss_fn == 'cnn' or params.loss_fn == 'retrain_regu':
                    cnn_vars = [
                        v for v in tf.trainable_variables() if 'model/cnn' in v.name
                    ]
                    # c_cnn_vars=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/c_cnn')
                    c_cnn_vars = [
                        v for v in tf.trainable_variables() if 'model/c_cnn' in v.name
                    ]
                    update_weights = [tf.assign(c, old) for (c, old) in \
                        zip(c_cnn_vars, cnn_vars)]
                    sess.run(update_weights)
                # Disabled 'boost' path kept for reference (string literal no-op).
                '''
                if params.loss_fn == 'boost':
                    cnn_vars=[v for v in tf.trainable_variables() if 'model/cnn' in v.name]
                    c_cnn_vars=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/c_cnn')
                    update_weights = [tf.assign(c, old) for (c, old) in \
                        zip(c_cnn_vars, cnn_vars)]
                    sess.run(update_weights)
                    features = train_model_spec['features']
                    labels = train_model_spec['labels']
                    predicted_scores, _ = retrain_lenet(features, params, var_scope='model/c_cnn')
                    residuals = get_residual(labels, predicted_scores)
                    train_model_spec['old_predicted_scores'] = predicted_scores
                    train_model_spec['residuals'] = residuals
                    features = eval_model_spec['features']
                    labels = eval_model_spec['labels']
                    predicted_scores, _ = retrain_lenet(features, params, var_scope='model/c_cnn')
                    residuals = get_residual(labels, predicted_scores)
                    eval_model_spec['old_predicted_scores'] = predicted_scores
                    eval_model_spec['residuals'] = residuals
                    sess.run(train_model_spec['old_predicted_scores'])
                    sess.run(train_model_spec['residuals'])
                    sess.run(eval_model_spec['old_predicted_scores'])
                    sess.run(eval_model_spec['residuals'])
                '''
                best_save_path = os.path.join(model_dir, 'best_weights', 'after-epoch')
                # global_epoch = int(params.num_learners) * int(params.num_epochs) + epoch + 1
                best_save_path = best_saver.save(sess, best_save_path, global_step=global_epoch)
                logging.info(
                    "- Found new best metric score, saving in {}".format(
                        best_save_path))
                # Save best eval metrics in a json file in the model directory
                save_dict_to_json(metrics, best_json_path)
                save_dict_to_json({'stopped_at_learner': learner_id}, \
                    os.path.join(model_dir, 'best_weights', 'learner.json'))
            else:
                early_stopping_count = early_stopping_count + 1

            # Save latest eval metrics in a json file in the model directory
            last_json_path = os.path.join(model_dir, "metrics_eval_last_weights.json")
            save_dict_to_json(metrics, last_json_path)
            global_epoch += 1
    return global_epoch
def train_and_evaluate(train_model_spec, eval_model_spec, model_dir, params, restore_from=None, global_epoch=1):
    """Train the model and evaluate every epoch.

    Args:
        train_model_spec: (dict) contains the graph operations or nodes needed for training
        eval_model_spec: (dict) contains the graph operations or nodes needed for evaluation
        model_dir: (string) directory containing config, weights and log
        params: (Params) contains hyperparameters of the model.
                Must define: num_epochs, train_size, batch_size, eval_size, save_summary_steps
        restore_from: (string) directory or file containing weights to restore the graph
    """
    # Initialize tf.Saver instances to save weights during training
    last_saver = tf.train.Saver()  # will keep last 5 epochs
    best_saver = tf.train.Saver(
        max_to_keep=1)  # only keep 1 best checkpoint (best on eval)
    begin_at_epoch = 0

    # MAB weight sampling
    # Multi-armed-bandit state, one entry per cluster arm.
    num_clusters = params.num_clusters  # 10
    rewards = [0] * num_clusters
    weight_numbers_of_selections = [0] * num_clusters
    weight_sums_of_reward = [0] * num_clusters
    weight_arm_weights = [1] * num_clusters
    weight_max_upper_bound = 0
    old_index = 0
    old_loss_val = 0

    with tf.Session() as sess:
        # Initialize model variables
        sess.run(train_model_spec['variable_init_op'])

        # For tensorboard (takes care of writing summaries to files)
        train_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'train_summaries'), sess.graph)
        eval_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'vali_summaries'), sess.graph)
        best_json_path = os.path.join(model_dir, "metrics_eval_best_weights.json")
        # Best metrics tracked as [accuracy, -loss]; compared via isSavingWeights.
        best_eval_metrics = [0.0, -float('inf')]
        global_epoch = 0

        # Reload weights from directory if specified
        # restor from the previous learner
        if restore_from is not None:
            save_path = os.path.join(model_dir, restore_from)
            if os.path.isdir(save_path):
                save_path = tf.train.latest_checkpoint(save_path)
                # Checkpoint name ends in "-<epoch>"; resume epoch counters from it.
                begin_at_epoch = int(save_path.split('-')[-1])
                global_epoch = begin_at_epoch
            logging.info("Restoring parameters from {}".format(save_path))
            pretrained_include = get_pretrained_include(params)
            pretrained_vars = tf.contrib.framework.get_variables_to_restore(
                include=pretrained_include)
            pretrained_saver = tf.train.Saver(pretrained_vars, name="pretrained_saver")
            pretrained_saver.restore(sess, save_path)
            # last_best_eval_metric = load_best_metric(best_json_path)
            # best_eval_metrics = [last_best_eval_metric['accuracy'], -last_best_eval_metric['loss']]
            logging.info(best_eval_metrics)

        model_summary()

        # for each learner
        num_train_steps = (params.train_size + params.batch_size - 1) // params.batch_size
        num_train_steps = int(num_train_steps)

        if params.finetune:
            # initial rewards for all arms
            # Pull each arm once so every cluster has an initial reward estimate.
            for i in range(num_clusters):
                old_index = i
                _, _, weight_sums_of_reward, weight_arm_weights, weight_max_upper_bound, old_loss_val = train_initial_sess(sess, train_model_spec, num_train_steps, \
                    train_writer, params, old_index, weight_numbers_of_selections, weight_sums_of_reward, weight_arm_weights, weight_max_upper_bound, old_loss_val)

        # now real rl
        # Phase 1: train with MAB-driven cluster sampling, early stopping on validation.
        early_stopping_count = 0
        # epoch_cut_off = int((begin_at_epoch + params.num_epochs) * params.epoch_cutoff)
        for epoch in range(begin_at_epoch, begin_at_epoch + params.num_epochs):
            if early_stopping_count == int(params.early_stoping_epochs):
                logging.info("Early stopping at epoch {}/{}".format(epoch + 1, \
                    begin_at_epoch + params.num_epochs))
                break
            # Run one epoch
            logging.info("Epoch {}/{}".format(epoch + 1, \
                begin_at_epoch + params.num_epochs))
            # Compute number of batches in one epoch (one full pass over the training set)
            # MAB data sampling
            batch_loss, old_index, weight_numbers_of_selections, weight_sums_of_reward, weight_arm_weights, weight_max_upper_bound, old_loss_val = train_sess(sess, train_model_spec, num_train_steps, \
                train_writer, params, old_index, weight_numbers_of_selections, weight_sums_of_reward, weight_arm_weights, weight_max_upper_bound, old_loss_val)
            # sum_loss = batch_loss
            # # sum_loss = [s+n for (s, n) in zip(batch_loss, sum_loss)]
            # sum_loss = [float(v) for v in sum_loss]
            # # logging.info('sum_loss :\n {}'.format(sum_loss))
            # consk = int(params.consk)
            # for i in range(num_train_steps):
            #     index, reward, numbers_of_selections, sums_of_reward, \
            #         max_upper_bound = rl(params, sum_loss, numbers_of_selections, \
            #         sums_of_reward, max_upper_bound, \
            #         (epoch - begin_at_epoch + 1) / consk, arm_weights)
            #     if params.rl == 'EXP3':
            #         arm_weights = sums_of_reward
            #     # logging.info('numbers_of_selections at i:\n {}'.format(numbers_of_selections))
            #     total_reward += reward
            # Save weights
            # if epoch >= epoch_cut_off:
            # #     cnn_vars=[v for v in tf.trainable_variables() if 'model/cnn/weights1_1' in v.name]
            # #     cnn_vars = tf.get_variable('model/cnn/weights1_1')
            #     save_var(sess, 'weights1_1', epoch)
            #     save_var(sess, 'weights1_2', epoch)
            # #     save_var(sess, 'weights3_1', epoch)
            #     save_var(sess, 'weights3_2', epoch)
            # Snapshot selected weight tensors every epoch for later inspection.
            save_var(sess, 'weights1_1', epoch)
            save_var(sess, 'weights1_2', epoch)
            save_var(sess, 'weights3_1', epoch)
            save_var(sess, 'weights3_2', epoch)
            last_save_path = os.path.join(model_dir, 'last_weights', 'after-epoch')
            last_saver.save(sess, last_save_path, global_step=global_epoch)

            # # Evaluate for one epoch on validation set
            num_vali_steps = (params.vali_size + params.batch_size - 1) // params.batch_size
            num_vali_steps = int(num_vali_steps)
            metrics = evaluate_sess(sess, eval_model_spec, num_vali_steps, eval_writer, params)

            # If best_eval, best_save_path
            # Loss is negated so both entries are "higher is better".
            accuracy_metric = round(metrics['accuracy'], 6)
            loss_metric = -round(metrics['loss'], 6)
            # save_batch()
            eval_metrics = [accuracy_metric, loss_metric]
            # logging.info('global_epoch: {}, best_eval_metrics: {}, \
            #     eval_metric: {}', global_epoch, best_eval_metrics, eval_metric)
            # logging.info('isSavingWeights(eval_metrics, best_eval_metrics) {}'.\
            #     format(isSavingWeights(eval_metrics, best_eval_metrics)))
            if isSavingWeights(eval_metrics, best_eval_metrics):
                # rest early_stopping_count
                early_stopping_count = 0
                # and isSavingWeights
                best_eval_metrics = eval_metrics
                # Save weights
                if params.loss_fn == 'cnn' and not params.use_kfac:
                    # Copy cnn weights into the c_cnn scope before checkpointing.
                    cnn_vars = [
                        v for v in tf.trainable_variables() if 'model/cnn' in v.name
                    ]
                    c_cnn_vars = [
                        v for v in tf.trainable_variables() if 'model/c_cnn' in v.name
                    ]
                    update_weights = [tf.assign(c, old) for (c, old) in \
                        zip(c_cnn_vars, cnn_vars)]
                    sess.run(update_weights)
                    best_save_path = os.path.join(model_dir, 'best_weights', 'after-epoch')
                    best_save_path = best_saver.save(sess, best_save_path, global_step=global_epoch)
                    logging.info(
                        "- Make a copy of cnn vars, saving in {}".format(
                            best_save_path))
                elif params.loss_fn == 'retrain_regu_mine3':
                    # Apply the mask variables multiplicatively onto the cnn weights.
                    # c_cnn_vars=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/cnn')
                    c_cnn_vars = [
                        v for v in tf.trainable_variables() if 'model/cnn' in v.name
                    ]
                    cnn_vars = [
                        v for v in tf.trainable_variables() if 'model/mask' in v.name
                    ]
                    update_weights = [tf.assign(c, tf.multiply(old, c)) for (c, old) in \
                        zip(c_cnn_vars, cnn_vars)]
                    sess.run(update_weights)
                    best_save_path = os.path.join(model_dir, 'best_weights', 'after-epoch')
                    best_save_path = best_saver.save(sess, best_save_path, global_step=global_epoch)
                    logging.info("- Updated cnn vars, saving in {}".format(
                        best_save_path))
                best_save_path = os.path.join(model_dir, 'best_weights', 'after-epoch')
                best_save_path = best_saver.save(sess, best_save_path, global_step=global_epoch)
                logging.info(
                    "- Found new best metric score, saving in {}".format(
                        best_save_path))
                # Save best eval metrics in a json file in the model directory
                save_dict_to_json(metrics, best_json_path)
            else:
                early_stopping_count = early_stopping_count + 1

            # Save latest eval metrics in a json file in the model directory
            last_json_path = os.path.join(model_dir, "metrics_eval_last_weights.json")
            save_dict_to_json(metrics, last_json_path)
            global_epoch += 1

        # update in the end is wrong as not the best weights are copied
        # Disabled end-of-training weight copy kept for reference (string literal no-op).
        '''
        if params.loss_fn == 'cnn' and not params.use_kfac:
            cnn_vars=[v for v in tf.trainable_variables() if 'model/cnn' in v.name]
            c_cnn_vars=[v for v in tf.trainable_variables() if 'model/c_cnn' in v.name]
            update_weights = [tf.assign(c, old) for (c, old) in \
                zip(c_cnn_vars, cnn_vars)]
            sess.run(update_weights)
            best_save_path = os.path.join(model_dir, 'best_weights', 'after-epoch')
            best_save_path = best_saver.save(sess, best_save_path, global_step=global_epoch)
            logging.info("- Make a copy of cnn vars, saving in {}".format(best_save_path))
        elif params.loss_fn == 'retrain_regu_mine3':
            # c_cnn_vars=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/cnn')
            c_cnn_vars=[v for v in tf.trainable_variables() if 'model/cnn' in v.name]
            cnn_vars=[v for v in tf.trainable_variables() if 'model/mask' in v.name]
            update_weights = [tf.assign(c, tf.multiply(old, c)) for (c, old) in \
                zip(c_cnn_vars, cnn_vars)]
            sess.run(update_weights)
            best_save_path = os.path.join(model_dir, 'best_weights', 'after-epoch')
            best_save_path = best_saver.save(sess, best_save_path, global_step=global_epoch)
            logging.info("- Updated cnn vars, saving in {}".format(best_save_path))
        '''

        # Phase 2: per-batch bandit (rl) over training batches, no early stopping.
        begin_at_epoch = global_epoch
        early_stopping_count = 0
        sum_loss = [0] * num_train_steps
        numbers_of_selections = [0] * num_train_steps
        # UCB specific
        sums_of_reward = [0] * num_train_steps
        arm_weights = [1] * num_train_steps
        # UCB specific
        max_upper_bound = 0
        total_reward = 0
        for epoch in range(begin_at_epoch, begin_at_epoch + params.num_epochs2):
            # if early_stopping_count == int(params.early_stoping_epochs):
            #     logging.info("Early stopping at epoch {}/{}".format(epoch + 1, \
            #         begin_at_epoch + params.num_epochs))
            #     break
            # Run one epoch
            logging.info("Epoch {}/{}".format(epoch + 1, \
                begin_at_epoch + params.num_epochs2))
            # MAB data sampling
            batch_loss, old_index, weight_numbers_of_selections, weight_sums_of_reward, weight_arm_weights, weight_max_upper_bound, old_loss_val = train_sess(sess, train_model_spec, num_train_steps, \
                train_writer, params, old_index, weight_numbers_of_selections, weight_sums_of_reward, weight_arm_weights, weight_max_upper_bound, old_loss_val)
            sum_loss = batch_loss
            consk = int(params.consk)
            # logging.info('sum_loss :\n {}, length: {}'.format(sum_loss, len(sum_loss)))
            # logging.info('sample numbers_of_selections at i:\n {}, length: {}'.format(numbers_of_selections, len(numbers_of_selections)))
            # One bandit update per training batch using the per-batch losses.
            for i in range(num_train_steps):
                index, reward, numbers_of_selections, sums_of_reward, \
                    max_upper_bound = rl(params, sum_loss, numbers_of_selections, \
                    sums_of_reward, max_upper_bound, \
                    (epoch - begin_at_epoch + 1) / consk, arm_weights)
                # logging.info('sample numbers_of_selections at i:\n {}'.format(numbers_of_selections))
                total_reward += reward
            # logging.info('len of sum_loss: {}'.format(len(sum_loss)))

            # Save weights
            last_save_path = os.path.join(model_dir, 'last_weights', 'after-epoch')
            # global_epoch = int(params.num_learners) * int(params.num_epochs) + epoch + 1
            last_saver.save(sess, last_save_path, global_step=global_epoch)

            # NOTE(review): num_vali_steps is defined in the phase-1 loop above;
            # if that loop did not run, this raises NameError — confirm intent.
            metrics = evaluate_sess(sess, eval_model_spec, num_vali_steps, eval_writer, params)

            # If best_eval, best_save_path
            accuracy_metric = round(metrics['accuracy'], 6)
            loss_metric = -round(metrics['loss'], 6)
            # save_batch()
            eval_metrics = [accuracy_metric, loss_metric]
            # logging.info('global_epoch: {}, best_eval_metrics: {}, \
            #     eval_metric: {}', global_epoch, best_eval_metrics, eval_metric)
            if isSavingWeights(eval_metrics, best_eval_metrics):
                # rest early_stopping_count
                early_stopping_count = 0
                # and isSavingWeights
                best_eval_metrics = eval_metrics
                # Save weights
                # trainalbe_vars = {v.name: v for v in tf.trainable_variables() if 'model' in v.name}
                # print(trainalbe_vars.keys())
                if params.loss_fn == 'cnn' and not params.use_kfac:
                    # Copy cnn weights into the c_cnn scope before checkpointing.
                    cnn_vars = [
                        v for v in tf.trainable_variables() if 'model/cnn' in v.name
                    ]
                    c_cnn_vars = [
                        v for v in tf.trainable_variables() if 'model/c_cnn' in v.name
                    ]
                    update_weights = [tf.assign(c, old) for (c, old) in \
                        zip(c_cnn_vars, cnn_vars)]
                    sess.run(update_weights)
                best_save_path = os.path.join(model_dir, 'best_weights', 'after-epoch')
                # global_epoch = int(params.num_learners) * int(params.num_epochs) + epoch + 1
                best_save_path = best_saver.save(sess, best_save_path, global_step=global_epoch)
                logging.info(
                    "- Found new best metric score, saving in {}".format(
                        best_save_path))
                # Save best eval metrics in a json file in the model directory
                save_dict_to_json(metrics, best_json_path)
            else:
                early_stopping_count = early_stopping_count + 1

            # Save latest eval metrics in a json file in the model directory
            last_json_path = os.path.join(model_dir, "metrics_eval_last_weights.json")
            save_dict_to_json(metrics, last_json_path)
            global_epoch += 1

        # logging.info('num_vali_steps: {}'.format(num_vali_steps))
        # logging.info('len of sum_loss: {}'.format(len(sum_loss)))
        # logging.info('numbers_of_selections:\n {}'.format(numbers_of_selections))
        logging.info(
            'numbers_of_selections:\n {}'.format(numbers_of_selections))
        logging.info('weight_numbers_of_selections:\n {}'.format(
            weight_numbers_of_selections))
        # Rank training batches by how often the bandit selected them and keep
        # the most-selected ones as the sample set.
        sorted_index = sorted(range(num_train_steps),
                              key=lambda k: numbers_of_selections[k],
                              reverse=True)
        # top_sorted_index = sorted_index[0: int(num_train_steps*params.top_ratio)+1]
        sample_batchs = (params.sample_size + params.batch_size - 1) // params.batch_size
        top_sorted_index = sorted_index[0:int(sample_batchs) + 1]
        logging.info('len(top_sorted_index) in training: {}'.format(
            len(top_sorted_index)))
        take_train_samples_sess(sess, eval_model_spec, num_train_steps, params, top_sorted_index)
    return global_epoch
def train_and_evaluate(train_model_spec, eval_model_spec, model_dir, params, learner_id=0, restore_from=None, global_epoch=1):
    """Train the model and evaluate every epoch.

    Args:
        train_model_spec: (dict) contains the graph operations or nodes needed for training
        eval_model_spec: (dict) contains the graph operations or nodes needed for evaluation
        model_dir: (string) directory containing config, weights and log
        params: (Params) contains hyperparameters of the model.
                Must define: num_epochs, train_size, batch_size, eval_size, save_summary_steps
        restore_from: (string) directory or file containing weights to restore the graph
    """
    # Initialize tf.Saver instances to save weights during training
    last_saver = tf.train.Saver()  # will keep last 5 epochs
    best_saver = tf.train.Saver(
        max_to_keep=1)  # only keep 1 best checkpoint (best on eval)
    begin_at_epoch = 0

    with tf.Session() as sess:
        # Initialize model variables
        sess.run(train_model_spec['variable_init_op'])

        # For tensorboard (takes care of writing summaries to files)
        train_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'train_summaries'), sess.graph)
        eval_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'vali_summaries'), sess.graph)
        best_json_path = os.path.join(model_dir, "metrics_eval_best_weights.json")

        # Best-so-far NDCG@k values on the validation set (higher is better).
        best_eval_metric = 0.0  # ndcg_1
        # best_loss_metric = float('inf')
        second_eval_metric = 0.0  # ndcg_3
        third_eval_metric = 0.0  # ndcg_5
        forth_eval_metric = 0.0  # ndcg_10
        # global_epoch = 0

        # Reload weights from directory if specified
        # restor from the previous learner
        if restore_from is not None:
            save_path = os.path.join(model_dir, restore_from)
            if os.path.isdir(save_path):
                save_path = tf.train.latest_checkpoint(save_path)
                # begin_at_epoch = int(restore_from.split('-')[-1])
            logging.info("Restoring parameters from {}".format(save_path))
            # last_saver = tf.train.import_meta_graph(save_path+".meta")
            # Restore the base scope plus each previous learner's residual MLP.
            pretrained_include = ['model/mlp']
            if params.loss_fn == 'urrank':
                pretrained_include = ['model/ur']
            for i in range(1, learner_id):
                # NOTE(review): appends 'residual_mlp_{learner_id}' on every
                # iteration (not 'residual_mlp_{i}') — looks like a latent
                # loop-variable bug; left untouched, confirm intent.
                pretrained_include.append('residual_mlp_{}'.format(learner_id))
            pretrained_vars = tf.contrib.framework.get_variables_to_restore(
                include=pretrained_include)
            pretrained_saver = tf.train.Saver(pretrained_vars, name="pretrained_saver")
            pretrained_saver.restore(sess, save_path)
            # Resume the best-metric tracking from the previous learner's record.
            best_eval_metric, second_eval_metric, third_eval_metric, forth_eval_metric = \
                load_best_ndcgs(best_json_path)

        # for each learner
        early_stopping_count = 0
        for epoch in range(begin_at_epoch, begin_at_epoch + params.num_epochs):
            # Stop once no improvement has been seen for early_stoping_epochs epochs.
            if early_stopping_count == int(params.early_stoping_epochs):
                logging.info("Early stopping at learner {}, epoch {}/{}".format(learner_id, epoch + 1, \
                    begin_at_epoch + params.num_epochs))
                break
            # Run one epoch
            logging.info("Learner {}, Epoch {}/{}".format(learner_id, epoch + 1, \
                begin_at_epoch + params.num_epochs))
            # Compute number of batches in one epoch (one full pass over the training set)
            num_steps = (params.train_size + params.batch_size - 1) // params.batch_size
            train_sess(sess, train_model_spec, num_steps, train_writer, params)

            # Save weights
            last_save_path = os.path.join(model_dir, 'last_weights', 'after-epoch')
            # global_epoch = int(params.num_learners) * int(params.num_epochs) + epoch + 1
            last_saver.save(sess, last_save_path, global_step=global_epoch)

            # Evaluate for one epoch on validation set
            num_steps = (params.eval_size + params.batch_size - 1) // params.batch_size
            metrics = evaluate_sess(sess, eval_model_spec, num_steps, eval_writer, params)

            # If best_eval, best_save_path
            # eval_metric = metrics['dcg']
            # loss_metric = metrics['loss']
            # Rounding avoids spurious "improvements" from float noise.
            eval_metric = round(metrics['ndcg_1'], 6)
            eval_metric_2 = round(metrics['ndcg_3'], 6)
            eval_metric_3 = round(metrics['ndcg_5'], 6)
            eval_metric_4 = round(metrics['ndcg_10'], 6)
            # eval_metric = metrics['ndcg_1']
            # eval_metric_2 = metrics['ndcg_3']
            # eval_metric_3 = metrics['ndcg_5']
            # eval_metric_4 = metrics['ndcg_10']
            eval_metrics = [
                eval_metric, eval_metric_2, eval_metric_3, eval_metric_4
            ]
            best_eval_metrics = [best_eval_metric, second_eval_metric, third_eval_metric, \
                forth_eval_metric]
            if isSavingWeights(eval_metrics, best_eval_metrics):
                # rest early_stopping_count
                early_stopping_count = 0
                # Store new best ndcg_1
                # this worsk better than eval_metric > best_eval_metric
                # and isSavingWeights
                best_eval_metric = eval_metric
                # loss_metric = best_loss_metric
                second_eval_metric = eval_metric_2
                third_eval_metric = eval_metric_3
                forth_eval_metric = eval_metric_4
                # Save weights
                best_save_path = os.path.join(model_dir, 'best_weights', 'after-epoch')
                # global_epoch = int(params.num_learners) * int(params.num_epochs) + epoch + 1
                best_save_path = best_saver.save(sess, best_save_path, global_step=global_epoch)
                logging.info(
                    "- Found new best metric score, saving in {}".format(
                        best_save_path))
                # Save best eval metrics in a json file in the model directory
                save_dict_to_json(metrics, best_json_path)
                save_dict_to_json({'stopped_at_learner': learner_id}, \
                    os.path.join(model_dir, 'best_weights', 'learner.json'))
            else:
                early_stopping_count = early_stopping_count + 1

            # Save latest eval metrics in a json file in the model directory
            last_json_path = os.path.join(model_dir, "metrics_eval_last_weights.json")
            save_dict_to_json(metrics, last_json_path)
            global_epoch += 1
    return global_epoch
def train_and_evaluate(train_model_spec, eval_model_spec, model_dir, params,
                       restore_from=None):
    """Train the model and evaluate on the validation set every epoch.

    Keeps the checkpoint with the best NDCG@{1,3,5,10} scores (as decided by
    ``isSavingWeights``) and stops early when no improvement is seen for
    ``params.early_stoping_epochs`` consecutive epochs (default 200).

    Args:
        train_model_spec: (dict) contains the graph operations or nodes needed for training
        eval_model_spec: (dict) contains the graph operations or nodes needed for evaluation
        model_dir: (string) directory containing config, weights and log
        params: (Params) contains hyperparameters of the model.
            Must define: num_epochs, train_size, batch_size, eval_size,
            save_summary_steps; may define early_stoping_epochs.
        restore_from: (string) directory or file containing weights to restore the graph
    """
    # Initialize tf.Saver instances to save weights during training
    last_saver = tf.train.Saver()  # will keep last 5 epochs
    best_saver = tf.train.Saver(
        max_to_keep=1)  # only keep 1 best checkpoint (best on eval)
    begin_at_epoch = 0

    with tf.Session() as sess:
        # Initialize model variables
        sess.run(train_model_spec['variable_init_op'])

        # Reload weights from directory if specified
        if restore_from is not None:
            logging.info("Restoring parameters from {}".format(restore_from))
            if os.path.isdir(restore_from):
                restore_from = tf.train.latest_checkpoint(restore_from)
                # Checkpoint files are named "after-epoch-<N>"; resume from N
                begin_at_epoch = int(restore_from.split('-')[-1])
            last_saver.restore(sess, restore_from)

        # For tensorboard (takes care of writing summaries to files)
        train_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'train_summaries'), sess.graph)
        eval_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'vali_summaries'), sess.graph)

        # Best validation metrics seen so far
        best_eval_metric = 0.0    # ndcg_1
        second_eval_metric = 0.0  # ndcg_3
        third_eval_metric = 0.0   # ndcg_5
        forth_eval_metric = 0.0   # ndcg_10

        # Fixed: the patience threshold was hard-coded to 200; sibling variants
        # of this function read it from params.early_stoping_epochs, so do the
        # same here, keeping 200 as the backward-compatible default.
        patience = int(getattr(params, 'early_stoping_epochs', 200))
        early_stopping_count = 0
        for epoch in range(begin_at_epoch, begin_at_epoch + params.num_epochs):
            if early_stopping_count >= patience:
                logging.info("Early stopping at epoch {}/{}".format(
                    epoch + 1, begin_at_epoch + params.num_epochs))
                break
            # Run one epoch
            logging.info("Epoch {}/{}".format(
                epoch + 1, begin_at_epoch + params.num_epochs))
            # Compute number of batches in one epoch (one full pass over the training set)
            num_steps = (params.train_size + params.batch_size - 1) // params.batch_size
            train_sess(sess, train_model_spec, num_steps, train_writer, params)

            # Save weights
            last_save_path = os.path.join(model_dir, 'last_weights', 'after-epoch')
            last_saver.save(sess, last_save_path, global_step=epoch + 1)

            # Evaluate for one epoch on validation set
            num_steps = (params.eval_size + params.batch_size - 1) // params.batch_size
            metrics = evaluate_sess(sess, eval_model_spec, num_steps, eval_writer)

            eval_metric = metrics['ndcg_1']
            eval_metric_2 = metrics['ndcg_3']
            eval_metric_3 = metrics['ndcg_5']
            eval_metric_4 = metrics['ndcg_10']
            eval_metrics = [
                eval_metric, eval_metric_2, eval_metric_3, eval_metric_4
            ]
            best_eval_metrics = [
                best_eval_metric, second_eval_metric, third_eval_metric,
                forth_eval_metric
            ]
            if isSavingWeights(eval_metrics, best_eval_metrics):
                # Reset the patience counter on improvement
                early_stopping_count = 0
                # Store new best metrics: comparing the full metric list via
                # isSavingWeights works better than eval_metric > best_eval_metric
                best_eval_metric = eval_metric
                second_eval_metric = eval_metric_2
                third_eval_metric = eval_metric_3
                forth_eval_metric = eval_metric_4
                # Save weights
                best_save_path = os.path.join(model_dir, 'best_weights',
                                              'after-epoch')
                best_save_path = best_saver.save(sess, best_save_path,
                                                 global_step=epoch + 1)
                logging.info(
                    "- Found new best metric score, saving in {}".format(
                        best_save_path))
                # Save best eval metrics in a json file in the model directory
                best_json_path = os.path.join(
                    model_dir, "metrics_eval_best_weights.json")
                save_dict_to_json(metrics, best_json_path)
            else:
                early_stopping_count = early_stopping_count + 1

            # Save latest eval metrics in a json file in the model directory
            last_json_path = os.path.join(model_dir,
                                          "metrics_eval_last_weights.json")
            save_dict_to_json(metrics, last_json_path)
def train_and_evaluate(train_model_spec, eval_model_spec, model_dir, params,
                       restore_from=None, global_epoch=1):
    """Train the model and evaluate every epoch, chaining the epoch counter.

    Restores only the pretrained subset of variables (via
    ``get_pretrained_include``) when ``restore_from`` is given, tracks the best
    [accuracy, -loss] pair, and for some loss functions synchronises a shadow
    copy of the cnn variables before saving the best checkpoint.

    Args:
        train_model_spec: (dict) contains the graph operations or nodes needed for training
        eval_model_spec: (dict) contains the graph operations or nodes needed for evaluation
        model_dir: (string) directory containing config, weights and log
        params: (Params) contains hyperparameters of the model.
            Must define: num_epochs, train_size, vali_size, batch_size,
            early_stoping_epochs, loss_fn, use_kfac.
        restore_from: (string) directory or file (relative to model_dir)
            containing weights to restore the graph
        global_epoch: (int) checkpoint-numbering offset used when training is
            chained across several calls

    Returns:
        (int) the global epoch counter after this round of training.
    """
    # Initialize tf.Saver instances to save weights during training
    last_saver = tf.train.Saver()  # will keep last 5 epochs
    best_saver = tf.train.Saver(
        max_to_keep=1)  # only keep 1 best checkpoint (best on eval)
    begin_at_epoch = 0

    with tf.Session() as sess:
        # Initialize model variables
        sess.run(train_model_spec['variable_init_op'])

        # For tensorboard (takes care of writing summaries to files)
        train_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'train_summaries'), sess.graph)
        eval_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'vali_summaries'), sess.graph)
        best_json_path = os.path.join(model_dir,
                                      "metrics_eval_best_weights.json")

        # [accuracy, -loss]: both entries are "higher is better"
        best_eval_metrics = [0.0, -float('inf')]

        # Fixed: the incoming `global_epoch` argument was unconditionally reset
        # to 0, silently discarding the caller's value and making the parameter
        # dead. Keep the argument; a checkpoint restore may override it below.

        # Reload weights from the previous learner if specified
        if restore_from is not None:
            save_path = os.path.join(model_dir, restore_from)
            if os.path.isdir(save_path):
                save_path = tf.train.latest_checkpoint(save_path)
                # Checkpoint files are named "after-epoch-<N>"; resume from N
                begin_at_epoch = int(save_path.split('-')[-1])
                global_epoch = begin_at_epoch
            logging.info("Restoring parameters from {}".format(save_path))
            pretrained_include = get_pretrained_include(params)
            pretrained_vars = tf.contrib.framework.get_variables_to_restore(
                include=pretrained_include)
            pretrained_saver = tf.train.Saver(pretrained_vars,
                                              name="pretrained_saver")
            pretrained_saver.restore(sess, save_path)

        logging.info(best_eval_metrics)
        model_summary()

        early_stopping_count = 0
        for epoch in range(begin_at_epoch, begin_at_epoch + params.num_epochs):
            if early_stopping_count == int(params.early_stoping_epochs):
                logging.info("Early stopping at epoch {}/{}".format(
                    epoch + 1, begin_at_epoch + params.num_epochs))
                break
            # Run one epoch
            logging.info("Epoch {}/{}".format(
                epoch + 1, begin_at_epoch + params.num_epochs))
            # Compute number of batches in one epoch (one full pass over the training set)
            num_steps = (params.train_size + params.batch_size - 1) // params.batch_size
            train_sess(sess, train_model_spec, num_steps, train_writer, params)

            # Save weights
            last_save_path = os.path.join(model_dir, 'last_weights',
                                          'after-epoch')
            last_saver.save(sess, last_save_path, global_step=global_epoch)

            # Evaluate for one epoch on validation set
            num_steps = (params.vali_size + params.batch_size - 1) // params.batch_size
            metrics = evaluate_sess(sess, eval_model_spec, num_steps,
                                    eval_writer, params)

            # Round so tiny float jitter does not count as an improvement;
            # negate the loss so both entries are "higher is better"
            accuracy_metric = round(metrics['accuracy'], 6)
            loss_metric = -round(metrics['loss'], 6)
            eval_metrics = [accuracy_metric, loss_metric]

            if isSavingWeights(eval_metrics, best_eval_metrics):
                # Reset the patience counter on improvement
                early_stopping_count = 0
                best_eval_metrics = eval_metrics

                # Some losses keep a shadow copy of the cnn weights that must
                # be synchronised before the best checkpoint is written.
                if params.loss_fn == 'cnn' and not params.use_kfac:
                    cnn_vars = [
                        v for v in tf.trainable_variables()
                        if 'model/cnn' in v.name
                    ]
                    c_cnn_vars = [
                        v for v in tf.trainable_variables()
                        if 'model/c_cnn' in v.name
                    ]
                    update_weights = [tf.assign(c, old) for (c, old) in
                                      zip(c_cnn_vars, cnn_vars)]
                    sess.run(update_weights)
                    best_save_path = os.path.join(model_dir, 'best_weights',
                                                  'after-epoch')
                    best_save_path = best_saver.save(
                        sess, best_save_path, global_step=global_epoch)
                    logging.info(
                        "- Make a copy of cnn vars, saving in {}".format(
                            best_save_path))
                elif params.loss_fn == 'retrain_regu_mine3':
                    c_cnn_vars = [
                        v for v in tf.trainable_variables()
                        if 'model/cnn' in v.name
                    ]
                    cnn_vars = [
                        v for v in tf.trainable_variables()
                        if 'model/mask' in v.name
                    ]
                    update_weights = [tf.assign(c, tf.multiply(old, c))
                                      for (c, old) in zip(c_cnn_vars, cnn_vars)]
                    sess.run(update_weights)
                    best_save_path = os.path.join(model_dir, 'best_weights',
                                                  'after-epoch')
                    best_save_path = best_saver.save(
                        sess, best_save_path, global_step=global_epoch)
                    logging.info("- Updated cnn vars, saving in {}".format(
                        best_save_path))

                # NOTE(review): for the two branches above this writes the same
                # checkpoint a second time (same global_step, so the file is
                # simply overwritten); the duplicate save could be hoisted.
                best_save_path = os.path.join(model_dir, 'best_weights',
                                              'after-epoch')
                best_save_path = best_saver.save(sess, best_save_path,
                                                 global_step=global_epoch)
                logging.info(
                    "- Found new best metric score, saving in {}".format(
                        best_save_path))
                # Save best eval metrics in a json file in the model directory
                save_dict_to_json(metrics, best_json_path)
            else:
                early_stopping_count = early_stopping_count + 1

            # Save latest eval metrics in a json file in the model directory
            last_json_path = os.path.join(model_dir,
                                          "metrics_eval_last_weights.json")
            save_dict_to_json(metrics, last_json_path)
            global_epoch += 1

    # Removed: a large triple-quoted block of disabled code that re-copied the
    # cnn vars after training; the author's own note said doing the update at
    # the end is wrong because the best weights are no longer loaded.
    return global_epoch
def train_and_evaluate(train_model_specs, eval_model_specs, model_dir, params,
                       restore_from=None):
    """Train the model and evaluate every epoch, keeping the lowest-loss checkpoint.

    Args:
        train_model_specs: (dict) contains the graph operations or nodes needed for training
        eval_model_specs: (dict) contains the graph operations or nodes needed for evaluation
        model_dir: (string) directory containing config, weights and log
        params: contains hyperparameters of the model.
            Must define: num_epochs, train_size, batch_size, eval_size,
            save_summary_steps.
        restore_from: (string) directory or file containing weights to restore
            the graph.
    """
    # Initialize tf.Saver() instances to save weights during training
    last_saver = tf.train.Saver()  # will keep last 5 epochs
    best_saver = tf.train.Saver(
        max_to_keep=1)  # only keep 1 best checkpoint (based on eval)
    begin_at_epoch = 0

    with tf.Session() as sess:
        # Initialize model variables
        sess.run(train_model_specs['variable_init_op'])
        # Load the mobilenet pretrained weights
        train_model_specs['mobilenet_init_op'](sess)

        # Reload weights from directory if specified
        if restore_from is not None:
            if os.path.isdir(restore_from):
                restore_from = tf.train.latest_checkpoint(restore_from)
                # Checkpoint files are named "after-epoch-<N>"; resume from N
                begin_at_epoch = int(restore_from.split('-')[-1])
            last_saver.restore(sess, restore_from)

        # Create summary writer for train and eval
        train_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'train_summaries'), sess.graph)
        eval_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'eval_summaries'), sess.graph)

        # Fixed: was initialised to the magic number 1000, so if the first
        # validation losses exceeded 1000 the best checkpoint was never saved.
        best_eval_loss = float('inf')
        for epoch in range(begin_at_epoch, begin_at_epoch + params.num_epochs):
            # Run one epoch
            logging.info("Epoch {}/{}".format(
                epoch + 1, begin_at_epoch + params.num_epochs))
            # One full pass over the training set, rounded up to whole batches
            num_steps = (params.train_size + params.batch_size - 1) // params.batch_size
            train_sess(sess, train_model_specs, num_steps, params, train_writer)

            # Save weights
            last_save_path = os.path.join(model_dir, 'last_weights',
                                          'after-epoch')
            last_saver.save(sess, last_save_path, global_step=epoch + 1)

            # Evaluate for one epoch on validation set
            num_steps = (params.eval_size + params.batch_size - 1) // params.batch_size
            metrics = evaluate_sess(sess, eval_model_specs, num_steps)

            # If this is the best (lowest) loss so far, keep the checkpoint
            eval_loss = metrics['loss']
            if eval_loss <= best_eval_loss:
                # Store new best loss
                best_eval_loss = eval_loss
                # Save weights
                best_save_path = os.path.join(model_dir, 'best_weights',
                                              'after-epoch')
                best_save_path = best_saver.save(sess, best_save_path,
                                                 global_step=epoch + 1)
                # Fixed: the message claimed "best accuracy" but the selection
                # criterion above is the validation loss
                logging.info("- Found new best loss, saving in {}".format(
                    best_save_path))
                best_json_path = os.path.join(
                    model_dir, "metrics_eval_best_weights.json")
                save_dict_to_json(metrics, best_json_path)

            # Save latest eval metrics in a json file in the model directory
            last_json_path = os.path.join(model_dir,
                                          "metrics_eval_last_weights.json")
            save_dict_to_json(metrics, last_json_path)