def set_debugger_session():
    """Route the Keras backend session through the tfdbg command-line debugger.

    The session currently registered with the backend is wrapped in a
    ``LocalCLIDebugWrapperSession``, the ``has_inf_or_nan`` filter is
    registered on it under the name ``'has_inf_or_nan'``, and the wrapper is
    installed as the new backend session.
    """
    debug_session = tf_debug.LocalCLIDebugWrapperSession(K.get_session())
    debug_session.add_tensor_filter('has_inf_or_nan', has_inf_or_nan)
    K.set_session(debug_session)
def train(is_debug=False):
    """Train the slot-filling model on the current fold and record the best epoch.

    Reads the module-level settings ``fold``, ``epoch_num``, ``batch_size``,
    ``input_steps`` and ``cell``. After every training epoch the test split
    is evaluated; the epoch with the highest slot F1 score is remembered,
    printed at the end, and appended to ``results.txt``.

    Args:
        is_debug: When true, the session is wrapped in the tfdbg CLI
            debugger with the ``has_inf_or_nan`` tensor filter registered.
    """
    print('Folder:' + str(fold))
    model = get_model()
    sess = tf.Session()
    if is_debug:
        sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)

    best_ep = 0
    best_sl_acc = 0
    best_f1_score = 0
    # The session is released even when data loading or a training step
    # raises; previously it was only closed on the success path.
    try:
        sess.run(tf.global_variables_initializer())

        train_data, test_data = get_data(fold)
        train_data_ed = data_pipeline(train_data)
        test_data_ed = data_pipeline(test_data)
        word2index, slot2index, intent2index = \
            get_info_from_training_data(train_data_ed)
        index_train = to_index(train_data_ed, word2index, slot2index,
                               intent2index)
        index_test = to_index(test_data_ed, word2index, slot2index,
                              intent2index)

        for epoch in range(epoch_num):
            tic = time.time()
            bar = progressbar.ProgressBar(
                maxval=(len(index_train) / batch_size),
                widgets=[("[Epoch {}] >>Training ".format(epoch)),
                         progressbar.Bar('#', '[', ']'), ' ',
                         progressbar.Percentage()])
            bar.start()
            for i, batch in enumerate(getBatch(batch_size, index_train)):
                # Perform a batch training. The step outputs were only
                # summed into accumulators that nothing read, so they are
                # no longer tracked.
                model.step(sess, "train", batch)
                bar.update(i + 1)
            bar.finish()
            sys.stdout.flush()
            print('Training completed in {:.2f} (sec)'.format(
                time.time() - tic))

            # One epoch per training, test once
            pred_slots = []
            slot_accs = []
            for batch in getBatch(batch_size, index_test):
                decoder_prediction, intent = model.step(sess, "test", batch)
                decoder_prediction = np.transpose(decoder_prediction, [1, 0])
                slot_pred_length = list(np.shape(decoder_prediction))[1]
                # Right-pad predictions with zeros up to input_steps so the
                # per-batch arrays can be stacked. np.pad is the public
                # name of the np.lib.pad alias used before.
                pred_padded = np.pad(
                    decoder_prediction,
                    ((0, 0), (0, input_steps - slot_pred_length)),
                    mode="constant", constant_values=0)
                pred_slots.append(pred_padded)

                batch_columns = list(zip(*batch))
                true_length = np.array(batch_columns[1])
                true_slot = np.array(batch_columns[2])
                true_slot = true_slot[:, :slot_pred_length]
                slot_accs.append(
                    accuracy_score(true_slot, decoder_prediction,
                                   true_length))

            pred_slots_a = np.vstack(pred_slots)
            # Only as many gold rows as there are predictions are compared.
            true_slots_a = np.array(
                list(zip(*index_test))[2])[:pred_slots_a.shape[0]]
            epoch_f1 = f1_for_sequence_batch(true_slots_a, pred_slots_a)
            print("Slot accuracy for epoch {}: {:.3f}".format(
                epoch, np.average(slot_accs) * 100))
            print("Slot F1 score for epoch {}: {:.3f}".format(
                epoch, epoch_f1 * 100))
            if epoch_f1 > best_f1_score:
                best_ep = epoch
                best_sl_acc = np.average(slot_accs)
                best_f1_score = epoch_f1

        print('\nBEST RESULT: epoch {}, valid accurasy {:.3f}, best test F1 score {:.3f}'.format(
            best_ep, best_sl_acc * 100, best_f1_score * 100))
    finally:
        sess.close()

    with open('results.txt', 'a') as outfile:
        outfile.write(
            'For Folder:' + str(fold) + ' using ' + str(cell)
            + ' cell, BEST RESULT: epoch ' + str(best_ep)
            + ', valid score {:.3f}, best test F1 score {:.3f}'.format(
                best_sl_acc * 100, best_f1_score * 100)
            + '\n')
def __init__(self, config, model_args=None):
    '''
    Initialize a TFModelABC.

    config is a mapping of settings; every name in self.global_args and in
    model_args must be present in it. A deep copy is kept as self.config.

    model_args is a list of strings specifying the names of required
    arguments specific to the model (defaults to no extra arguments).

    Side effects: creates the checkpoint and TF log directories under
    config['save_dir'], attaches file and console log handlers, loads the
    card-ID pickles, creates a private graph and session, calls
    self.build_graph(), runs the global variable initializer and finally
    calls self.init() if the model defines one.
    '''
    if model_args is None:
        model_args = []

    # Validate and load args
    for arg_name in self.global_args + model_args:
        assert (arg_name in config.keys()), \
            'missing required config entry: {!r}'.format(arg_name)
    self.config = deepcopy(config)

    # Set up logging
    self.checkpoint_dir = os.path.join(config['save_dir'], 'checkpoints/')
    self.tf_log_dir = os.path.join(config['save_dir'], 'tflogs/')
    os.makedirs(self.checkpoint_dir, exist_ok=True)
    os.makedirs(self.tf_log_dir, exist_ok=True)
    self.logger = logging.Logger(config['model_name'] + '_logger',
                                 level=logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s - %(levelname)s: %(message)s')

    # logs.log receives INFO and above, debug.log receives everything.
    log_fh = logging.FileHandler(
        os.path.join(config['save_dir'], 'logs.log'))
    log_fh.setLevel(logging.INFO)
    log_fh.setFormatter(formatter)
    debug_fh = logging.FileHandler(
        os.path.join(config['save_dir'], 'debug.log'))
    debug_fh.setLevel(logging.DEBUG)
    debug_fh.setFormatter(formatter)
    # Console verbosity follows the debug_mode setting.
    print_fh = logging.StreamHandler()
    print_fh.setFormatter(formatter)
    print_fh.setLevel(
        logging.DEBUG if self.config['debug_mode'] else logging.INFO)
    self.logger.addHandler(debug_fh)
    self.logger.addHandler(log_fh)
    self.logger.addHandler(print_fh)

    self.logger.debug('loading card ID mappings')
    map_base = 'datasets/code_mappings/{}_{}.pkl'.format(
        '{}', config['n_cards'])
    # NOTE(review): pickle.load executes arbitrary code from the file;
    # these mappings must only ever come from trusted project data.
    with open(map_base.format('encoding'), 'rb') as f:
        self.name_to_id = pickle.load(f)
    with open(map_base.format('decoding'), 'rb') as f:
        self.id_to_name = pickle.load(f)

    self.logger.debug('configuring session and base graph')
    self.graph = tf.Graph()
    session_config = tf.ConfigProto(gpu_options=tf.GPUOptions())
    with self.graph.as_default():
        # The graph-level seed applies to whichever graph is the default
        # when it is set, so it has to be set inside this context to
        # affect self.graph rather than the global default graph.
        tf.set_random_seed(config['random_seed'])
        self.sess = tf.Session(graph=self.graph, config=session_config)
        if self.config['debug_mode']:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)

        self.logger.debug('building base ops')
        self.global_step = tf.Variable(0, name='global_step',
                                       trainable=False)
        # TODO: remove when done
        with tf.name_scope('debug'):
            self.debug_feed = tf.placeholder(tf.int32, [], 'debug_feed')

        self.logger.debug('building model graph')
        self.build_graph()

        self.logger.debug('initializing all variables')
        self.sess.run(tf.global_variables_initializer())

        # init() is optional: NotImplementedError is treated as "the
        # model did not define one".
        try:
            self.init()
            self.logger.debug('running model-defined init()')
        except NotImplementedError:
            self.logger.warning('no model-defined init() found')
def main():
    """Parse the command line, build the input pipeline and model, then train and/or test.

    Training, validation and test datasets are read through one feedable
    iterator that is switched with a string handle. ``--run_training`` and
    ``--run_test`` select what is executed; testing without training needs
    ``--restore_model``. The total run time is logged at the end.
    """
    start_time = time.time()
    logging.basicConfig(level=logging.INFO)

    # One timestamp for both defaults so the model file and the log
    # directory of a run always carry the same name.
    run_stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M")

    parser = argparse.ArgumentParser()
    parser.register("type", "bool", lambda v: v.lower() == "true")
    parser.add_argument("--max_epochs", type=int, default=9,
                        help="Number of epochs training is run.")
    parser.add_argument("--batch_size", type=int, default=64,
                        help="Batch size used during training.")
    parser.add_argument("--learning_rate", type=float, default=0.05,
                        help="Initial learning rate.")
    parser.add_argument("--data_dir_tr", type=str,
                        default="cityscapesExtractedResized",
                        help="Directory of the training data.")
    parser.add_argument("--data_dir_val", type=str,
                        default="cityscapesExtractedValResized",
                        help="Directory of the validation data.")
    parser.add_argument("--data_dir_test", type=str,
                        default="cityscapesExtractedTestResized",
                        help="Directory of the test data.")
    parser.add_argument("--debug", type="bool", nargs='?', const=True,
                        default=False,
                        help="Use debugger to track down bad values during training.")
    # No nargs here: with nargs=1 the parsed value was a one-element list
    # when the flag was given but a plain int when it was defaulted.
    parser.add_argument("--validate_every", type=int, default=2,
                        help="Run validation every x epochs.")
    parser.add_argument("--run_training", type="bool", nargs='?',
                        const=True, default=False,
                        help="Training loop is run")
    parser.add_argument("--run_test", type="bool", nargs='?', const=True,
                        default=False,
                        help="Run testing (after training if training is demanded")
    parser.add_argument("--save_model_name", type=str,
                        default='models/' + run_stamp,
                        help="File the model is saved to")
    parser.add_argument("--restore_model", type=str, default=None,
                        help="File the model is restore from")
    parser.add_argument("--train_log_dir", type=str,
                        default='log/' + run_stamp,
                        help="Directory for training logs")
    # Unknown arguments are tolerated and ignored.
    FLAGS, _unparsed = parser.parse_known_args()
    validate_every = FLAGS.validate_every

    with tf.Session() as sess:
        # Get datasets for training, validation and test
        data_train, data_train_filtered_names = input_cs.get_dataset_cs(
            FLAGS.data_dir_tr, FLAGS.max_epochs, FLAGS.batch_size)
        data_val, _ = input_cs.get_dataset_cs(FLAGS.data_dir_val, 1, 128)
        data_test, _ = input_cs.get_dataset_cs(FLAGS.data_dir_test, 1, 128)

        # Get iterators for training validation, and test
        # Define handle for switching between dataset iterator
        handle = tf.placeholder(dtype=tf.string, shape=[])
        iterator = tf.data.Iterator.from_string_handle(
            handle, data_train.output_types, data_train.output_shapes)
        next_element = iterator.get_next()
        imgs, labels = next_element

        training_iterator = data_train.make_one_shot_iterator()
        validation_iterator = data_val.make_initializable_iterator()
        validation_initializer = validation_iterator.initializer
        test_iterator = data_test.make_initializable_iterator()
        test_initializer = test_iterator.initializer

        training_handle = sess.run(training_iterator.string_handle())
        validation_handle = sess.run(validation_iterator.string_handle())
        test_handle = sess.run(test_iterator.string_handle())

        logits = model.build_model(imgs, input_cs.NUM_CLASSES)
        tf.summary.histogram(logits.op.name + '/activations', logits)

        # Start tensorflow debug session
        if FLAGS.debug:
            logging.info("Start debug session")
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)
            sess.add_tensor_filter("has_inf_or_nan",
                                   tf_debug.has_inf_or_nan)

        if FLAGS.run_training:
            train.train(sess, FLAGS.max_epochs, FLAGS.batch_size,
                        validate_every, FLAGS.learning_rate, logits, labels,
                        handle, training_handle, validation_handle,
                        validation_initializer, model_file=None,
                        save_name=FLAGS.save_model_name,
                        train_log_dir=FLAGS.train_log_dir)

        if FLAGS.run_test:
            if FLAGS.run_training:
                # Test the weights that were just trained in this session.
                run_test(sess, logits, labels, handle, test_handle,
                         test_initializer)
            elif FLAGS.restore_model:
                test_model_file(sess, FLAGS.restore_model, logits, labels,
                                handle, test_handle, test_initializer)
            else:
                logging.error("Cannot test: No model trained or specified for restoring for testing")

    logging.info("Run time: %s", time.time() - start_time)
def set_debugger_session():
    """Install a tfdbg CLI wrapper as the Keras backend session.

    The backend's current session is wrapped in a
    ``LocalCLIDebugWrapperSession``; ``name_filter`` is registered on the
    wrapper under the name ``'name_filter'`` before the wrapper replaces the
    backend session.
    """
    wrapped = tf_debug.LocalCLIDebugWrapperSession(K.get_session())
    wrapped.add_tensor_filter('name_filter', name_filter)
    K.set_session(wrapped)
# Create a summary to monitor accuracy tensor tf.summary.scalar("accuracy", mean_acc) # Create a summary to monitor weight tf.summary.histogram("weight", w1) # Merge all summaries into a single op merged_summary_op = tf.summary.merge_all() # Declare a tf.train.Saver to save model saver = tf.train.Saver() # Start training with tf.Session() as sess: # Run the initializer sess.run(init) debug_sess = tf_debug.LocalCLIDebugWrapperSession(sess=sess) # One-hot encoding Labels y_train = sess.run(y_train1) y_test = sess.run(y_test1) # op to write logs to Tensorboard summary_writer = tf.summary.FileWriter(logs_path, graph=sess.graph) # Training cycle for epoch in range(training_epochs): avg_cost = 0. total_batch = int(mnist.train.num_examples / batch_size) # Loop over all batches for batch in range(total_batch): if batch == total_batch - 1: a, c, summary = sess.run(
def __run_train(self):
    """Run the full training loop inside a tfdbg-wrapped session.

    Initializes all variables, optionally restores weights from
    ``self.restore_file``, then iterates epochs from ``self.start_epoch`` to
    ``self.cfg.num_epochs``. Per epoch it trains on every batch offered by
    ``self.train_data_gen``, evaluates the validation loss, saves the best
    model so far, saves a periodic checkpoint every fifth epoch, and writes
    TensorBoard summaries. A final checkpoint is saved after the last epoch.

    Raises:
        ValueError: if no restore file is given while the config asks to
            train only the initializer.
    """
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    # NOTE(review): the session is always wrapped in the tfdbg CLI debugger
    # here, not only in a debug mode - confirm this is intended.
    with tf_debug.LocalCLIDebugWrapperSession(
            tf.Session(config=sess_config)) as self.tf_session:
        # Variables are initialized first; a restore (if any) then
        # overwrites the initial values.
        self.tf_session.run(tf.global_variables_initializer())
        if self.restore_file:
            tools.printf("Restoring model weights from %s..." %
                         self.restore_file)
            self.tf_saver_restore.restore(self.tf_session,
                                          self.restore_file)
        else:
            if self.cfg.use_init and self.cfg.only_train_init:
                raise ValueError(
                    "Set to only train initializer, but restore file was not provided!?!?!"
                )
            tools.printf("Initializing variables...")

        # initialize tensorboard writer
        self.tf_tb_writer = tf.summary.FileWriter(
            os.path.join(self.results_dir_path, 'graph_viz'))
        self.tf_tb_writer.add_graph(tf.get_default_graph())
        self.tf_tb_writer.flush()

        # initialize lstm and ekf states
        # (17 is presumably the EKF state dimension - TODO confirm)
        curr_lstm_states = np.zeros([
            2, self.cfg.lstm_layers, self.cfg.batch_size,
            self.cfg.lstm_size
        ], dtype=np.float32)
        curr_ekf_state = np.zeros([self.cfg.batch_size, 17],
                                  dtype=np.float32)
        # One 17x17 identity covariance per batch element.
        curr_ekf_cov_state = np.repeat(np.expand_dims(np.identity(
            17, dtype=np.float32), axis=0),
            repeats=self.cfg.batch_size, axis=0)

        # Per-sequence stores holding one state entry per batch of that
        # sequence.
        lstm_states_dic = {}
        ekf_states_dic = {}
        ekf_cov_states_dic = {}
        for seq in self.train_sequences:
            lstm_states_dic[seq] = np.zeros([
                self.train_data_gen.batch_counts[seq], 2,
                self.cfg.lstm_layers, self.cfg.batch_size,
                self.cfg.lstm_size
            ], dtype=np.float32)
            ekf_states_dic[seq] = np.zeros([
                self.train_data_gen.batch_counts[seq], self.cfg.batch_size,
                17
            ], dtype=np.float32)
            ekf_cov_states_dic[seq] = np.repeat(
                np.expand_dims(curr_ekf_cov_state, axis=0),
                self.train_data_gen.batch_counts[seq], axis=0)

        _train_image_summary = None
        total_batches = self.train_data_gen.total_batches()
        best_val_loss = 9999999999  # sentinel larger than any expected loss
        # Pre-set so the final save below has a step value even when the
        # epoch range is empty.
        i_epoch = 0

        for i_epoch in range(self.start_epoch, self.cfg.num_epochs):
            tools.printf("Training Epoch: %d ..." % i_epoch)
            start_time = time.time()

            # Loss weight and learning rate are looked up per epoch from
            # their schedules.
            alpha_set = Train.__set_from_schedule(self.cfg.alpha_schedule,
                                                  i_epoch)
            lr_set = Train.__set_from_schedule(self.cfg.lr_schedule,
                                               i_epoch)
            tools.printf("alpha set to %f" % alpha_set)
            tools.printf("learning rate set to %f" % lr_set)

            while self.train_data_gen.has_next_batch():
                j_batch = self.train_data_gen.curr_batch()

                # get inputs
                batch_id, curr_seq, batch_data, fc_ground_truth, se3_ground_truth, imu_measurements = self.train_data_gen.next_batch(
                )
                # Return values are ignored, so these helpers presumably
                # fill curr_lstm_states / curr_ekf_* in place from the
                # stores - verify against data_roller.
                data_roller.get_init_lstm_state(lstm_states_dic,
                                                curr_lstm_states, curr_seq,
                                                batch_id,
                                                self.cfg.bidir_aug)
                data_roller.get_init_ekf_states(ekf_states_dic,
                                                ekf_cov_states_dic,
                                                curr_ekf_state,
                                                curr_ekf_cov_state,
                                                curr_seq, batch_id,
                                                self.cfg.bidir_aug)

                # The first pose along axis 0 is fed as the initial pose;
                # the labels fed below are the remaining poses.
                init_poses = se3_ground_truth[0, :, :]

                # Use the initializer on the first batch of the epoch and
                # otherwise at random with probability cfg.init_prob.
                nrnd = np.random.rand(1)
                use_init_train = False
                if j_batch == 0 or nrnd < self.cfg.init_prob:
                    use_init_train = True

                # Run training session
                _, _curr_lstm_states, _curr_ekf_states, _curr_ekf_covar, _train_summary, _train_image_summary, _total_losses = \
                    self.tf_session.run(
                        [self.op_trainer, self.t_lstm_states, self.t_ekf_states, self.t_ekf_covar_states, self.op_train_merged_summary, self.op_train_image_summary, self.t_total_loss],
                        feed_dict={
                            self.t_inputs: batch_data,
                            self.t_se3_labels: se3_ground_truth[1:, :, :],
                            self.t_fc_labels: fc_ground_truth,
                            self.t_lstm_initial_state: curr_lstm_states,
                            self.t_initial_poses: init_poses,
                            self.t_lr: lr_set,
                            self.t_alpha: alpha_set,
                            self.t_is_training: True,
                            self.t_use_initializer: use_init_train,
                            self.t_sequence_id: int(curr_seq),
                            self.t_epoch: i_epoch,
                            self.t_ekf_initial_state: curr_ekf_state,
                            self.t_ekf_initial_covariance: curr_ekf_cov_state,
                            self.t_imu_data: imu_measurements
                        },
                        options=self.tf_run_options,
                        run_metadata=self.tf_run_metadata)

                # Store the resulting states back under this sequence and
                # batch id.
                data_roller.update_lstm_state(lstm_states_dic,
                                              _curr_lstm_states, curr_seq,
                                              batch_id)
                data_roller.update_ekf_state(ekf_states_dic,
                                             ekf_cov_states_dic,
                                             _curr_ekf_states,
                                             _curr_ekf_covar, curr_seq,
                                             batch_id)

                if self.tensorboard_meta:
                    self.tf_tb_writer.add_run_metadata(
                        self.tf_run_metadata,
                        'epochid=%d_batchid=%d' % (i_epoch, j_batch))
                # Global step for the scalar summary counts batches across
                # epochs.
                self.tf_tb_writer.add_summary(
                    _train_summary, i_epoch * total_batches + j_batch)

                # print stats
                tools.printf("batch %d/%d: Loss:%.7f" %
                             (j_batch + 1, total_batches, _total_losses))

            # Only the image summary of the last batch of the epoch is
            # written. NOTE(review): this is still None if the epoch had no
            # batches - confirm add_summary tolerates that.
            self.tf_tb_writer.add_summary(_train_image_summary,
                                          (i_epoch + 1) * total_batches)

            tools.printf("Evaluating validation loss...")
            curr_val_loss = self.__run_val_loss(i_epoch, alpha_set)

            # check for best results
            if curr_val_loss < best_val_loss:
                tools.printf("Saving best result...")
                best_val_loss = curr_val_loss
                self.tf_saver_best.save(
                    self.tf_session,
                    os.path.join(self.results_dir_path, "best_val",
                                 "model_best_val_checkpoint"),
                    global_step=i_epoch)
                tools.printf("Best val loss, model saved.")

            # Periodic checkpoint every fifth epoch (including epoch 0).
            if i_epoch % 5 == 0:
                tools.printf("Saving checkpoint...")
                self.tf_saver_checkpoint.save(
                    self.tf_session,
                    os.path.join(self.results_dir_path,
                                 "model_epoch_checkpoint"),
                    global_step=i_epoch)
                tools.printf("Checkpoint saved")

            self.tf_tb_writer.flush()
            tools.printf("ave_val_loss(se3): %f, time: %f\n" %
                         (curr_val_loss, time.time() - start_time))

            self.train_data_gen.next_epoch()

        tools.printf("Final save...")
        self.tf_saver_checkpoint.save(self.tf_session, os.path.join(
            self.results_dir_path, "model_epoch_checkpoint"),
            global_step=i_epoch)
        tools.printf("Saved results to %s" % self.results_dir_path)

        # Explicit close in addition to the context manager's exit.
        self.tf_session.close()
def main():
    """Set up the SC2 environments, session and A2C agent, then train or evaluate.

    Reads the module-level ``args``, ``summary_base`` and ``ckpt_path``.
    The loop runs until ``args.num_timesteps`` frames have been played,
    bad numerics are reported in debug mode, or the user interrupts with
    Ctrl-C. Afterwards the checkpoint is saved (when training), the
    environments and summary writer are closed and the mean score is
    printed.

    Raises:
        KeyError: if the ``SLACK_API_TOKEN`` environment variable is unset.
    """
    # Get token for Slacker
    token = os.environ['SLACK_API_TOKEN']
    slack = Slacker(token=token)

    # Create subdirs for each run in experiment
    run_dirs = []
    if os.path.exists(summary_base):
        run_dirs = [
            os.path.join(summary_base, d) for d in os.listdir(summary_base)
        ]
    if run_dirs:
        # The most recently modified entry is expected to be named
        # "...-<number>"; the new run gets the next number.
        latest_run = int(max(run_dirs, key=os.path.getmtime).split('-')[-1])
        summary_path = os.path.join(summary_base,
                                    'run-%d' % (latest_run + 1))
    else:
        # Missing *or empty* base directory: start at run-1. An existing
        # but empty directory used to crash in max().
        summary_path = os.path.join(summary_base, 'run-1')

    # Overwrite experiment summaries
    if args.train and args.ow:
        shutil.rmtree(ckpt_path, ignore_errors=True)
        shutil.rmtree(summary_path, ignore_errors=True)

    # Create envs
    size_px = (args.res, args.res)
    env_args = dict(map_name=args.map,
                    step_mul=args.step_mul,
                    game_steps_per_episode=0,
                    screen_size_px=size_px,
                    minimap_size_px=size_px)
    vis_env_args = env_args.copy()
    vis_env_args['visualize'] = args.vis
    # At most args.max_windows environments get the visualize setting.
    num_vis = min(args.envs, args.max_windows)
    env_fns = [partial(make_sc2env, **vis_env_args)] * num_vis
    num_no_vis = args.envs - num_vis
    if num_no_vis > 0:
        env_fns.extend([partial(make_sc2env, **env_args)] * num_no_vis)
    envs = SubprocVecEnv(env_fns)

    # Start tensorflow session
    if not args.lstm:
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.475)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    else:
        sess = tf.Session()
    if args.tfdbg:
        sess = tfdbg.LocalCLIDebugWrapperSession(sess)

    summary_writer = tf.summary.FileWriter(summary_path)
    network_data_format = 'NCHW' if args.nchw else 'NHWC'  # XXX NHWC -> NCHW

    # Create A2CAgent instance
    agent = A2CAgent(sess=sess,
                     debug=args.debug,
                     network_data_format=network_data_format,
                     value_loss_weight=args.value_loss_weight,
                     entropy_weight=args.entropy_weight,
                     learning_rate=args.lr,
                     max_to_keep=args.max_to_keep,
                     lstm=args.lstm)

    # Setup A2CAgent runner
    runner = A2CRunner(envs=envs,
                       agent=agent,
                       slack=slack,
                       train=args.train,
                       summary_writer=summary_writer,
                       discount=args.discount,
                       n_steps=args.steps_per_batch)

    # Build A2CAgent graphs
    static_shape_channels = runner.preproc.get_input_channels()
    agent.build(static_shape_channels, resolution=args.res)

    # Load the latest ckpt
    if os.path.exists(ckpt_path):
        agent.load(ckpt_path)
        load = True
    else:
        agent.init()
        load = False

    runner.reset(nenvs=args.envs, res=args.res)

    # Start Train/Eval
    i = agent.train_step if load else 0
    total_frames = 0
    try:
        while True:
            write_summary = args.train and i % args.summary_iters == 0

            if i > 0 and i % args.save_iters == 0:
                _save_if_training(agent, summary_writer)

            result, total_frames = runner.run_batch(
                total_frames, train_summary=write_summary, lstm=args.lstm)

            # Debug return: in debug mode a missing result is treated as
            # bad numerics and stops the run.
            if args.debug and result is None:
                warning = 'Bad numerics detected by Tensorflow API!' + \
                          'Stopping the environment...'
                send_notification(slack, message=warning, channel='#sc2')
                break

            if write_summary:
                agent_step, loss, summary = result
                summary_writer.add_summary(summary, global_step=agent_step)
                print('iter %d: loss = %f' % (agent_step, loss))

            i += 1

            # A negative num_timesteps disables the frame limit.
            if 0 <= args.num_timesteps <= total_frames:
                log = 'Agent has finished training! Saving and closing the SC2 environment..'
                print(log)
                send_notification(slack, message=log, channel='#sc2')
                break

    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop; fall through to save and clean
        # up.
        pass

    # Save the model ckpt
    _save_if_training(agent, summary_writer)

    envs.close()
    summary_writer.close()

    print('mean score: %f' % runner.get_mean_score())
# Create the EM log file, make it accessible to everyone, then reopen it for
# the run and record the hyperparameters at the top.
# NOTE(review): mode 0o777 makes the logs world-writable - confirm this is
# really required on the target machine.
logEM_file_path = os.path.join(log_dir, "logEM.txt")
open(logEM_file_path, "w").close()
os.chmod(logEM_file_path, 0o777)
fileEM = open(logEM_file_path, "w")
fileEM.write("Hyperparameters:" + str(ARGS))

# Dump every parsed argument, one "name: value" per line, sorted by name.
param_file_path = os.path.join(log_dir, "params_used.txt")
with open(param_file_path, "w") as param_file:
    for key, value in sorted(vars(ARGS).items()):
        param_file.write(str(key) + ": " + str(value) + "\n")
os.chmod(param_file_path, 0o777)

# Optionally run under the tfdbg command-line debugger.
if ARGS.tfdbg:
    chosen_session = tf_debug.LocalCLIDebugWrapperSession(tf.Session())
else:
    chosen_session = tf.Session()

with chosen_session as sess:
    if ARGS.restore is None:
        sess.run(tf.global_variables_initializer())
    else:
        # Saver.restore returns nothing, so the checkpoint path itself is
        # reported (the message used to print "None").
        saver.restore(sess, ARGS.restore)
        restore_path = ARGS.restore
        print("Restoring from checkpoint at", restore_path)

    train_start_time = time.time()
    print("Graph-build time: ",
          utils.time_format(train_start_time - start_time))

    dataset_length = len(input_d_vecs)
    # Any incomplete trailing batch is not counted.
    num_batches = dataset_length // ARGS.batch_size
def test_model():
    """Rebuild the graph, restore a saved model and run one validation pass.

    The checkpoint ``LOGDIR/model_epoch_<cfg.load_model_epoch>`` is
    restored; any restore error propagates to the caller. The session and
    both summary writers are closed when the pass finishes or fails, so
    buffered summaries are flushed to disk.
    """
    with tf.Graph().as_default():
        validation_dataset = np.load('path/to/dataset')
        validation_data_gen = DataGenerator(validation_dataset,
                                            batch_size=cfg.batch_size)

        inputs, labels = placeholder_inputs(cfg.batch_size, cfg.num_frames)
        is_training_pl = tf.placeholder(tf.bool, shape=())
        keep_prob_pl = tf.placeholder(tf.float32)
        global_step = tf.Variable(0, dtype=tf.int64)
        bn_decay = get_bn_decay(global_step)
        tf.summary.scalar('bn_decay', bn_decay)

        # Get model and loss
        pred = build_graph(inputs, is_training_pl,
                           weight_decay=cfg.weight_decay,
                           keep_prob=keep_prob_pl, bn_decay=bn_decay)
        loss = get_loss(pred, labels)
        tf.summary.scalar('total_loss', loss)

        # Accuracy is the number of correct predictions divided by the
        # configured batch size.
        correct = tf.equal(tf.argmax(pred, 1), tf.to_int64(labels))
        correct = tf.reduce_sum(tf.cast(correct, tf.float32))
        accuracy = correct / float(cfg.batch_size)
        tf.summary.scalar('accuracy', accuracy)

        # Get training operator. It is built even though this function
        # only validates: the Saver below is created afterwards, so its
        # variable list includes whatever the optimizer adds.
        learning_rate = get_learning_rate(global_step)
        tf.summary.scalar('learning_rate', learning_rate)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step=global_step)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        sess = tf.Session(config=config)

        train_writer = None
        test_writer = None
        try:
            # restore model
            load_model_path = LOGDIR + '/model_epoch_{}'.format(
                cfg.load_model_epoch)
            saver = tf.train.Saver()
            saver.restore(sess, load_model_path)
            print("\nLoaded previous model... ", load_model_path)

            if cfg.debug:
                sess = tf_debug.LocalCLIDebugWrapperSession(sess)

            # Plot Variable Histogram
            t_vars = tf.trainable_variables()
            merged = tf.summary.merge_all()
            train_writer = tf.summary.FileWriter(LOGDIR + '/train')
            train_writer.add_graph(tf.get_default_graph())
            test_writer = tf.summary.FileWriter(LOGDIR + '/test')
            test_writer.add_graph(tf.get_default_graph())

            # Count number of trainable parameters
            num_params = np.sum(
                [np.prod(v.get_shape().as_list()) for v in t_vars])
            print(
                '************ The Number of Trainable Parameters: {} ************'.
                format(num_params))
            num_g_params = np.sum([
                np.prod(v.get_shape().as_list())
                for v in tf.global_variables()
            ])
            print('************ The Number of Global Parameters: {} ************'.
                  format(num_g_params))

            ops = {
                'inputs_pl': inputs,
                'labels_pl': labels,
                'keep_prob_pl': keep_prob_pl,
                'is_training_pl': is_training_pl,
                'pred': pred,
                'loss': loss,
                'train_op': train_op,
                'merged': merged,
                'step': global_step
            }

            print('Validating ...')
            val_one_epoch(sess, validation_data_gen, ops, test_writer,
                          logging=False)
        finally:
            # Closing the writers flushes pending events to disk.
            for writer in (train_writer, test_writer):
                if writer is not None:
                    writer.close()
            sess.close()
def train(training_path='/media/tjosh/vault/MSRAction3D/new_pc_npy_5_training.npy',
          validation_path='/media/tjosh/vault/MSRAction3D/new_pc_npy_5_validation.npy'):
    """Build the model graph and train it for ``cfg.epoch`` epochs.

    An existing checkpoint ``LOGDIR/model_epoch_<cfg.load_model_epoch>`` is
    restored when possible; otherwise all variables are freshly
    initialized. Each epoch runs one training pass and one validation pass,
    and the model is saved every ``cfg.save_model_freq`` epochs. The
    session and summary writers are closed when training ends or fails.

    Args:
        training_path: ``.npy`` file with the training set. Defaults to the
            path that was previously hard-coded.
        validation_path: ``.npy`` file with the validation set. Defaults to
            the path that was previously hard-coded.
    """
    log_string('***** Config *****')
    log_string('***** Building Point {}...'.format(MODEL_NAME))
    log_string('** num_frames: {}'.format(cfg.num_frames))
    log_string('** num_classes: {}'.format(cfg.num_classes))
    log_string('** batch_size: {}'.format(cfg.batch_size))
    log_string('** epoch: {}'.format(cfg.epoch))
    log_string('** init_learning_rate: {}'.format(cfg.init_learning_rate))
    log_string('** decay_step: {}'.format(cfg.decay_step))
    log_string('** decay_rate: {}'.format(cfg.decay_rate))
    log_string('** weight_decay: {}'.format(cfg.weight_decay))

    with tf.Graph().as_default():
        inputs, labels = placeholder_inputs(cfg.batch_size, cfg.num_frames)
        is_training_pl = tf.placeholder(tf.bool, shape=())
        keep_prob_pl = tf.placeholder(tf.float32)
        global_step = tf.Variable(0, dtype=tf.int64)
        bn_decay = get_bn_decay(global_step)
        tf.summary.scalar('bn_decay', bn_decay)

        pred = build_graph(inputs, is_training_pl,
                           weight_decay=cfg.weight_decay,
                           keep_prob=keep_prob_pl, bn_decay=bn_decay)
        loss = get_loss(pred, labels)
        tf.summary.scalar('total_loss', loss)

        # Accuracy is the number of correct predictions divided by the
        # configured batch size.
        correct = tf.equal(tf.argmax(pred, 1), tf.to_int64(labels))
        correct = tf.reduce_sum(tf.cast(correct, tf.float32))
        accuracy = correct / float(cfg.batch_size)
        tf.summary.scalar('accuracy', accuracy)

        # Get training operator
        learning_rate = get_learning_rate(global_step)
        tf.summary.scalar('learning_rate', learning_rate)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        # Ops in the UPDATE_OPS collection must run with every train step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step=global_step)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        sess = tf.Session(config=config)

        train_writer = None
        test_writer = None
        try:
            # restore model
            load_model_path = LOGDIR + '/model_epoch_{}'.format(
                cfg.load_model_epoch)
            try:
                saver = tf.train.Saver()
                saver.restore(sess, load_model_path)
                print("\nPrevious model restored... ", load_model_path)
            except Exception as e:
                # Deliberate best-effort: any restore failure falls back to
                # training a new model from scratch.
                print("\nCannot find the requested model... {}".format(e))
                sess.run(tf.global_variables_initializer())
                # create a saver object
                saver = tf.train.Saver()
                print("\nCreating new model...", load_model_path)

            if cfg.debug:
                sess = tf_debug.LocalCLIDebugWrapperSession(sess)

            # Plot Variable Histogram
            t_vars = tf.trainable_variables()
            merged = tf.summary.merge_all()
            train_writer = tf.summary.FileWriter(LOGDIR + '/train')
            train_writer.add_graph(tf.get_default_graph())
            test_writer = tf.summary.FileWriter(LOGDIR + '/test')
            test_writer.add_graph(tf.get_default_graph())

            # Count number of trainable parameters
            num_params = np.sum(
                [np.prod(v.get_shape().as_list()) for v in t_vars])
            log_string(
                '************ The Number of Trainable Parameters: {} ************'.
                format(num_params))
            num_g_params = np.sum([
                np.prod(v.get_shape().as_list())
                for v in tf.global_variables()
            ])
            log_string(
                '************ The Number of Global Parameters: {} ************'.
                format(num_g_params))

            ops = {
                'inputs_pl': inputs,
                'labels_pl': labels,
                'is_training_pl': is_training_pl,
                'keep_prob_pl': keep_prob_pl,
                'pred': pred,
                'loss': loss,
                'train_op': train_op,
                'merged': merged,
                'step': global_step
            }

            training_dataset = np.load(training_path)
            validation_dataset = np.load(validation_path)

            train_data_gen = DataGenerator(training_dataset,
                                           batch_size=cfg.batch_size)
            validation_data_gen = DataGenerator(validation_dataset,
                                                batch_size=cfg.batch_size,
                                                augment=False)

            for epoch in range(1, cfg.epoch + 1):
                log_string(
                    '\n******** Training:---Epoch_{}/{} *********'.format(
                        epoch, cfg.epoch))

                log_string('Training ...')
                train_one_epoch(sess, train_data_gen, ops, train_writer)

                log_string('Validating ...')
                val_one_epoch(sess, validation_data_gen, ops, test_writer)

                if epoch % cfg.save_model_freq == 0:
                    saver.save(sess,
                               LOGDIR + '/model_epoch_{}'.format(epoch))
                    log_string('Model saved at epoch {}'.format(epoch))
        finally:
            # Closing the writers flushes pending events to disk.
            for writer in (train_writer, test_writer):
                if writer is not None:
                    writer.close()
            sess.close()
def regression_model(train_input, test_input, model_name, load=False,
                     nb_epoch=0):
    """Build, optionally train, save and evaluate the regression network.

    Reads the module-level settings ``chunk_size``, ``n_steps``,
    ``output_shape``, ``learning_rate``, ``summary_dir``, ``debug_mode``,
    ``batch_size`` and ``output_step``.

    Args:
        train_input: ``(seq, fix, can, label)`` arrays for training.
        test_input: ``(seq, fix, can, label)`` arrays for testing.
        model_name: Checkpoint path the model is saved to, and restored
            from when ``load`` is true.
        load: Restore weights from ``model_name`` before training. When
            false, old summary files under ``summary_dir`` are removed.
        nb_epoch: Number of training epochs; ``0`` only saves and
            evaluates.

    Returns:
        ``(result_pred, result_true)``: flat lists of predictions and
        labels over the whole-batch part of the test set.
    """
    import glob  # only needed here, for clearing old summary files

    print('Setting up...May take one minute...')
    seq_train, fix_train, can_train, label_train = train_input
    seq_test, fix_test, can_test, label_test = test_input
    fix_train = np.reshape(fix_train, (-1, chunk_size, 1))
    fix_test = np.reshape(fix_test, (-1, chunk_size, 1))
    can_train = np.reshape(can_train, (-1, chunk_size, 1))
    can_test = np.reshape(can_test, (-1, chunk_size, 1))

    input_seq = tf.placeholder(tf.float32, shape=[None, n_steps, 4])
    input_seq_3 = tf.placeholder(tf.float32, shape=[None, n_steps - 2, 64])
    input_seq_5 = tf.placeholder(tf.float32,
                                 shape=[None, n_steps - 4, 1024])
    kr = tf.placeholder(tf.float32)
    y = tf.placeholder(tf.float32, shape=output_shape)
    phase = tf.placeholder(tf.bool, name='phase')

    pred = model_graph(input_seq, input_seq_3, input_seq_5, kr, phase)

    global_step = tf.Variable(0, trainable=False)
    loss = tf.reduce_mean(
        tf.losses.mean_squared_error(predictions=pred,
                                     labels=tf.reshape(y, [-1])))
    # Ops in the UPDATE_OPS collection must run with every train step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(loss,
                                                  global_step=global_step)

    # for monitoring
    tf.summary.scalar('loss', loss)
    merged = tf.summary.merge_all()

    def build_feed(seq_1, seq_3, seq_5, targets, keep_prob, is_training):
        """Assemble the feed dict shared by every session run below."""
        return {
            input_seq: seq_1,
            input_seq_3: seq_3,
            input_seq_5: seq_5,
            y: targets,
            kr: keep_prob,
            phase: is_training
        }

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

    # A fresh (non-restored) run starts with empty summary directories.
    # Files are removed with the standard library instead of shelling out
    # to `rm`; sub-directories are left alone, as plain `rm` did.
    if not load:
        for split in ('train', 'test'):
            for stale in glob.glob(os.path.join(summary_dir, split, '*')):
                if os.path.islink(stale) or not os.path.isdir(stale):
                    os.remove(stale)

    # define the tensorboard writer
    train_writer = tf.summary.FileWriter(summary_dir + '/train', sess.graph)
    test_writer = tf.summary.FileWriter(summary_dir + '/test')

    try:
        if debug_mode:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)

        init = tf.global_variables_initializer()
        sess.run(init)
        print('Graph initialized!')
        saver = tf.train.Saver()
        if load:
            saver.restore(sess, model_name)

        def model_whole_set_check(seq, fix, can, y_input, batch_size=64):
            """Predict over a whole set in batches and print MSE figures."""
            result_pred = list()
            result_fix = list()
            result_true = list()
            seq_obj = batch_object(seq, batch_size)
            fix_obj = batch_object(fix, batch_size)
            can_obj = batch_object(can, batch_size)
            y_obj = batch_object(y_input, batch_size)
            # Only whole batches are evaluated; a trailing partial batch
            # is not visited.
            for step in range(int(len(seq) / batch_size)):
                seq_batch = seq_obj.next_batch()
                fix_batch = fix_obj.next_batch()
                can_batch = can_obj.next_batch()
                y_batch = y_obj.next_batch()
                seq_batch_3 = seq_3_encode_list(seq_batch)
                seq_batch_5 = seq_5_encode_list(seq_batch)
                temp = sess.run(pred,
                                feed_dict=build_feed(seq_batch, seq_batch_3,
                                                     seq_batch_5, y_batch,
                                                     1, 0))
                result_fix += list(np.reshape(fix_batch, [-1]))
                result_true += list(np.reshape(y_batch, [-1]))
                result_pred += list(temp)
                if step % 10 == 0:
                    print('We are in step %d.' % step)
            print('The result of fix to true is {}'.format(
                mean_squared_error(result_true, result_fix)))
            print('The result of pred to true is {}'.format(
                mean_squared_error(result_pred, result_true)))
            return result_pred, result_true

        # define the training and test part
        acc_step = 0  # x-axis index shared by train and test summaries
        print('Total epoch: {}'.format(nb_epoch))
        for epoch in range(nb_epoch):
            for step in range(int(len(seq_train) / batch_size) + 1):
                x_train_list, y_train_batch = generate_random_batch(
                    [seq_train, fix_train, can_train], label_train,
                    batch_size)
                seq_train_1 = x_train_list[0]
                seq_train_3 = seq_3_encode_list(x_train_list[0])
                seq_train_5 = seq_5_encode_list(x_train_list[0])
                # Training step: keep probability 0.7, phase on.
                sess.run(optimizer,
                         feed_dict=build_feed(seq_train_1, seq_train_3,
                                              seq_train_5, y_train_batch,
                                              0.7, 1))

                if step % output_step == 0:
                    # Re-evaluate the same batch with keep probability 1
                    # and phase off for the reported training loss.
                    summary, loss_out = sess.run(
                        [merged, loss],
                        feed_dict=build_feed(seq_train_1, seq_train_3,
                                             seq_train_5, y_train_batch,
                                             1, 0))
                    train_writer.add_summary(summary, acc_step)
                    print('Epoch: %d, train step %d, loss %f' %
                          (epoch, step, loss_out))

                    x_test_list, y_test_batch = generate_random_batch(
                        [seq_test, fix_test, can_test], label_test,
                        batch_size)
                    seq_test_1 = x_test_list[0]
                    seq_test_3 = seq_3_encode_list(x_test_list[0])
                    seq_test_5 = seq_5_encode_list(x_test_list[0])
                    summary = sess.run(
                        merged,
                        feed_dict=build_feed(seq_test_1, seq_test_3,
                                             seq_test_5, y_test_batch,
                                             1, 0))
                    test_writer.add_summary(summary, acc_step)
                    acc_step = acc_step + 1

        saver.save(sess, model_name)
        result_pred, result_true = model_whole_set_check(
            seq_test, fix_test, can_test, label_test, 256)
        return result_pred, result_true
    finally:
        # Closing the writers flushes pending events; the session is
        # private to this call and is released here.
        train_writer.close()
        test_writer.close()
        sess.close()
def train(is_debug=False):
    """Train the intent model, dump vocabulary/intent lists, then run the auto-encoder.

    Steps, in order:
      1. Build the model and a TF session (optionally wrapped in the tfdbg CLI).
      2. Read the ATIS train/dev files and convert them to index form.
      3. Write every distinct word to ``vocab_list.in`` and every distinct
         intent to ``intent_list.in``.
      4. Train the intent model for ``epoch_num`` epochs, printing the mean
         training loss and the mean intent accuracy on the dev set.
      5. Build an ``AutoEncoder`` on top of the model and either train it for
         ``epoch_num_ae`` epochs (when ``train_ae`` is set) or run a single
         batch test.

    Args:
        is_debug: when true, wrap the session in ``LocalCLIDebugWrapperSession``
            and register the ``has_inf_or_nan`` tensor filter.

    Reads the module-level settings ``use_neg_data``, ``epoch_num``,
    ``epoch_num_ae``, ``batch_size`` and ``train_ae``.
    """
    model = get_model()
    sess = tf.Session()
    if is_debug:
        sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
    sess.run(tf.global_variables_initializer())

    def read_lines(path):
        # Context manager so the dataset file handle is closed right away.
        with open(path, "r") as data_file:
            return data_file.readlines()

    if use_neg_data:
        train_data = read_lines("dataset/atis-2.train.w-intent_with_neg.iob")
    else:
        train_data = read_lines("dataset/atis-2.train.w-intent.iob")
    test_data = read_lines("dataset/atis-2.dev.w-intent.iob")

    train_data_ed = data_pipeline(train_data)
    test_data_ed = data_pipeline(test_data)
    word2index, index2word, intent2index, index2intent = get_info_from_training_data(
        train_data_ed)
    index_train = to_index(train_data_ed, word2index, intent2index)
    index_test = to_index(test_data_ed, word2index, intent2index)

    print("%20s%20s%20s" % ("Epoch#", "Train Loss", "Intent Accuracy"))

    def add_to_vocab_file(fh, data):
        # Write each distinct word of `data` to `fh`, one per line.
        # De-duplication is per call, so a word present in two data sets
        # passed in separate calls is written twice (as before).
        seen = set()
        bsize = 16
        for batch in getBatch(bsize, data):
            for sample in batch:
                sen_len = sample[1]
                current_vocabs = index_seq2word(sample[0], index2word)[:sen_len]
                for w in current_vocabs:
                    if w in seen:
                        continue
                    fh.write(w + "\n")
                    seen.add(w)

    def add_to_intent_file(fh, data):
        # Write each distinct intent label of `data` to `fh`, one per line.
        seen = set()
        bsize = 16
        for batch in getBatch(bsize, data):
            for sample in batch:
                w = index2intent[sample[2]]
                if w in seen:
                    continue
                fh.write(w + "\n")
                seen.add(w)

    with open("vocab_list.in", "w") as f_vocab_list:
        add_to_vocab_file(f_vocab_list, index_train)
        add_to_vocab_file(f_vocab_list, index_test)

    with open("intent_list.in", "w") as f_intent_list:
        add_to_intent_file(f_intent_list, index_train)
        add_to_intent_file(f_intent_list, index_test)

    # saver = tf.train.Saver()
    for epoch in range(epoch_num):
        train_loss = 0.0
        num_batches = 0
        for num_batches, batch in enumerate(getBatch(batch_size, index_train), 1):
            _, loss, intent, _ = model.step(sess, "train", batch)
            train_loss += loss
        # Mean loss over the epoch; max() avoids dividing by zero when the
        # batch generator yields nothing.
        train_loss /= max(num_batches, 1)

        intent_accs = []
        for batch in getBatch(batch_size, index_test):
            intent, _ = model.step(sess, "test", batch)
            intent_acc = accuracy_score(list(zip(*batch))[2], intent)
            intent_accs.append(intent_acc)
        print("%20d%20f%20f" % (epoch, train_loss, np.average(intent_accs)))

    print("Training auto-encoder...")
    print("%20s%20s%20s%20s%20s" %
          ("Epoch#", "Train Loss", "Neg Data Loss", "Good Data Loss", "Ratio"))
    ae_model = AutoEncoder(model)
    ae_model.tf_init(sess)
    if train_ae:
        for epoch in range(epoch_num_ae):
            train_loss = 0.0
            num_batches = 0
            for num_batches, batch in enumerate(getBatch(batch_size, index_train), 1):
                # Only the loss (6th of the 7 returned values) is used here.
                _, _, _, _, _, loss, _ = ae_model.step(sess, "train", batch)
                train_loss += loss
            train_loss /= max(num_batches, 1)
            result1, result2 = run_batch_test(ae_model, sess, word2index,
                                              index2intent, index_test, epoch)
            # Relative gap between the two reported losses, in percent.
            r = (result1 - result2) / result1 * 100
            print("%20d%20f%20f%20f%20f" % (epoch, train_loss, result1, result2, r))
    else:
        run_batch_test(ae_model, sess, word2index, index2intent, index_test, 0)
def train(Model, BatchedInput, hparams):
    """Run the training loop with periodic dev/test evaluation and checkpointing.

    Builds a ``Trainer`` in a fresh graph, restores weights through
    ``load_model``, then loops until ``trainer.epoch`` exceeds
    ``hparams.max_epoch_num``. Inside the loop it optionally evaluates,
    performs one training step, writes TensorBoard summaries, saves
    checkpoints and refreshes the tqdm progress bar.

    Args:
        Model: model class, forwarded to ``Trainer`` and ``load_model``.
        BatchedInput: input-pipeline class, forwarded to ``Trainer``.
        hparams: hyper-parameter object; ``beam_width`` is overwritten with 0.

    NOTE(review): the original indentation was lost. All statements are kept
    inside ``graph.as_default()`` because the initializer and assign ops must
    be created in that graph — confirm against the upstream file.
    """
    hparams.beam_width = 0
    graph = tf.Graph()
    mode = tf.estimator.ModeKeys.TRAIN
    with graph.as_default():
        # new_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='decoder/correction')
        # init_new_vars = tf.initialize_variables(new_vars)
        trainer = Trainer(hparams, Model, BatchedInput, mode)
        # The eval sub-graph is requested whenever the "eval" argument is non-zero.
        trainer.build_model(eval=argval("eval") != 0)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        sess = tf.Session(graph=graph, config=config)
        if FLAGS.debug:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)

        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        load_model(sess, Model, hparams)

        if argval("simulated"):
            # not real training, only to export values
            utils.prepare_output_path(hparams)
            # Reset both counters so the simulated run starts from zero.
            sess.run(tf.assign(trainer._global_step, 0))
            sess.run(tf.assign(trainer._processed_inputs_count, 0))

        trainer.init(sess)

        # tensorboard log
        if FLAGS.reset:
            if os.path.exists(hparams.summaries_dir):
                shutil.rmtree(hparams.summaries_dir)
        writer = tf.summary.FileWriter(os.path.join(hparams.summaries_dir), sess.graph)

        last_save_step = trainer.global_step
        # Offset by one eval interval so the first loop iteration already
        # satisfies the "steps since last eval" condition below.
        last_eval_pos = trainer.global_step - FLAGS.eval

        def reset_pbar():
            # Create a new progress bar for the current epoch, starting at the
            # trainer's current position within that epoch.
            epoch_batch_count = trainer.data_size // trainer.step_size
            pbar = tqdm(total=epoch_batch_count, ncols=150, unit="step", initial=trainer.epoch_progress)
            pbar.set_description('Epoch %i' % trainer.epoch)
            return pbar

        pbar = reset_pbar()
        last_epoch = trainer.epoch

        # Latest and best ("min") error rates, keyed by the ids returned from eval_all.
        dev_lers = {}
        min_dev_lers = {}
        test_lers = {}
        min_test_lers = {}
        # Test error recorded at the moment the dev error was best.
        min_dev_test_lers = {}

        trainer.reset_train_iterator(sess)

        while True:
            # utils.update_hparams(FLAGS, hparams) # renew hparams so paramters can be changed during training

            # eval if needed
            if argval("eval") > 0 and argval(
                    "eval_from") <= trainer.epoch_exact:
                if trainer.global_step - last_eval_pos >= FLAGS.eval:
                    pbar.set_postfix_str("Evaluating (dev)...")
                    dev_lers = trainer.eval_all(sess, dev=True)
                    pbar.set_postfix_str("Evaluating (test)...")
                    test_lers = trainer.eval_all(sess, dev=False)

                    for acc_id in test_lers:
                        if dev_lers is None:
                            # No dev result: the "best" checkpoint follows the test error.
                            if acc_id not in min_test_lers or min_test_lers[
                                    acc_id] > test_lers[acc_id]:
                                min_test_lers[acc_id] = test_lers[acc_id]
                                save(hparams, sess, "best_%d" % acc_id)
                        else:
                            if acc_id not in min_test_lers or min_test_lers[
                                    acc_id] > test_lers[acc_id]:
                                min_test_lers[acc_id] = test_lers[acc_id]

                            # With a dev result: the "best" checkpoint follows the dev error.
                            if acc_id not in min_dev_lers or (
                                    min_dev_lers[acc_id] > dev_lers[acc_id]):
                                min_dev_lers[acc_id] = dev_lers[acc_id]
                                min_dev_test_lers[acc_id] = test_lers[acc_id]
                                save(hparams, sess, "best_%d" % acc_id)

                            # NOTE(review): the value labelled "acc" is the best test error so far.
                            tqdm.write(
                                "dev: %.2f, test: %.2f, acc: %.2f" %
                                (dev_lers[acc_id] * 100, test_lers[acc_id] * 100,
                                 min_test_lers[acc_id] * 100))

                        # One scalar summary per (error kind, acc_id) pair.
                        for (err_id, lers) in [("dev", dev_lers),
                                               ("test", test_lers),
                                               ("min_test", min_dev_test_lers)
                                               ]:
                            if lers is not None and len(lers) > 0:
                                writer.add_summary(
                                    tf.Summary(value=[
                                        tf.Summary.Value(
                                            simple_value=lers[acc_id],
                                            tag="%s_error_rate_%d" %
                                            (err_id, acc_id))
                                    ]), trainer.processed_inputs_count)

                    last_eval_pos = trainer.global_step

            loss, summary = trainer.train(sess)
            # return

            if trainer.epoch > last_epoch:  # reset epoch
                pbar = reset_pbar()
                last_epoch = trainer.epoch

            writer.add_summary(summary, trainer.processed_inputs_count)
            pbar.update(1)

            # Periodic checkpoint; skipped entirely in simulated mode.
            if not argval(
                    "simulated"
            ) and trainer.global_step - last_save_step >= FLAGS.save_steps:
                save(hparams, sess, "epoch%d" % trainer.epoch)
                last_save_step = trainer.global_step

            if trainer.epoch > hparams.max_epoch_num:
                break

            # reduce batch size with long input
            if hparams.batch_size_decay:
                if trainer.decay_batch_size(
                        trainer.epoch_exact - trainer.epoch, sess):
                    pbar = reset_pbar()

            # update postfix
            pbar_pf = {}
            for acc_id in test_lers:
                if dev_lers is not None:
                    pbar_pf["min_dev" + str(acc_id)] = "%2.2f" % (
                        min_dev_test_lers[acc_id] * 100)
                pbar_pf["min_test" +
                        str(acc_id)] = "%2.2f" % (min_test_lers[acc_id] * 100)
                pbar_pf["test" +
                        str(acc_id)] = "%2.2f" % (test_lers[acc_id] * 100)
                if dev_lers is not None:
                    pbar_pf["dev" + str(acc_id)] = "%2.2f" % (dev_lers[acc_id] * 100)
            pbar_pf['cost'] = "%.3f" % (loss)
            pbar.set_postfix(pbar_pf)
def main(_):
    """Build a two-layer MNIST classifier and train it, optionally under tfdbg.

    The loss is deliberately written in a numerically unstable way so that the
    debugger has inf/nan values to find; do not "fix" it here.

    Flags read: ``data_dir``, ``fake_data``, ``train_batch_size``,
    ``learning_rate``, ``max_steps``, ``debug``, ``ui_type`` and
    ``tensorboard_debug_address``.

    Raises:
        ValueError: if both ``--debug`` and ``--tensorboard_debug_address``
            are supplied.
    """
    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir,
                                      one_hot=True,
                                      fake_data=FLAGS.fake_data)

    def feed_dict(train):
        # Full test set for evaluation, otherwise one training mini-batch
        # (fake data always comes from the training split).
        if not (train or FLAGS.fake_data):
            return {x: mnist.test.images, y_: mnist.test.labels}
        images, labels = mnist.train.next_batch(FLAGS.train_batch_size,
                                                fake_data=FLAGS.fake_data)
        return {x: images, y_: labels}

    sess = tf.InteractiveSession()

    # Create the MNIST neural network graph.

    # Input placeholders.
    with tf.name_scope("input"):
        x = tf.placeholder(tf.float32, [None, IMAGE_SIZE * IMAGE_SIZE],
                           name="x-input")
        y_ = tf.placeholder(tf.float32, [None, NUM_LABELS], name="y-input")

    def weight_variable(shape):
        """Return a weight variable drawn from a seeded truncated normal."""
        return tf.Variable(
            tf.truncated_normal(shape, stddev=0.1, seed=RAND_SEED))

    def bias_variable(shape):
        """Return a bias variable filled with the constant 0.1."""
        return tf.Variable(tf.constant(0.1, shape=shape))

    def nn_layer(input_tensor, input_dim, output_dim, layer_name,
                 act=tf.nn.relu):
        """Build one fully connected layer under the `layer_name` scope."""
        # The name scope groups the layer's ops together in the graph.
        with tf.name_scope(layer_name):
            # Weights and biases each live in their own sub-scope.
            with tf.name_scope("weights"):
                layer_weights = weight_variable([input_dim, output_dim])
            with tf.name_scope("biases"):
                layer_biases = bias_variable([output_dim])
            with tf.name_scope("Wx_plus_b"):
                pre_activation = (tf.matmul(input_tensor, layer_weights) +
                                  layer_biases)
            return act(pre_activation)

    hidden = nn_layer(x, IMAGE_SIZE**2, HIDDEN_SIZE, "hidden")
    logits = nn_layer(hidden, HIDDEN_SIZE, NUM_LABELS, "output", tf.identity)
    y = tf.nn.softmax(logits)

    with tf.name_scope("cross_entropy"):
        # The following line is the culprit of the bad numerical values that
        # appear during training of this graph. Log of zero gives inf, which
        # is first seen in the intermediate tensor "cross_entropy/Log:0"
        # during the 4th run() call. A multiplication of the inf values with
        # zeros leads to nans, which is first in "cross_entropy/mul:0".
        #
        # You can use the built-in, numerically-stable implementation to fix
        # this issue:
        #   diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits)
        diff = -(y_ * tf.log(y))
        with tf.name_scope("total"):
            cross_entropy = tf.reduce_mean(diff)

    with tf.name_scope("train"):
        adam = tf.train.AdamOptimizer(FLAGS.learning_rate)
        train_step = adam.minimize(cross_entropy)

    with tf.name_scope("accuracy"):
        with tf.name_scope("correct_prediction"):
            correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        with tf.name_scope("accuracy"):
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    sess.run(tf.global_variables_initializer())

    if FLAGS.debug and FLAGS.tensorboard_debug_address:
        raise ValueError(
            "The --debug and --tensorboard_debug_address flags are mutually "
            "exclusive.")
    if FLAGS.debug:
        sess = tf_debug.LocalCLIDebugWrapperSession(sess,
                                                    ui_type=FLAGS.ui_type)
    elif FLAGS.tensorboard_debug_address:
        sess = tf_debug.TensorBoardDebugWrapperSession(
            sess, FLAGS.tensorboard_debug_address)

    # At this point, sess is a debug wrapper around the actual Session if
    # FLAGS.debug is true. In that case, calling run() will launch the CLI.
    for step in range(FLAGS.max_steps):
        # Evaluate on the test data first, then take one training step.
        test_accuracy = sess.run(accuracy, feed_dict=feed_dict(False))
        print("Accuracy at step %d: %s" % (step, test_accuracy))
        sess.run(train_step, feed_dict=feed_dict(True))
def _create_network(self, reuse=False):
    """Build the DDPG graph: normalizers, main/target networks, losses, optimizers.

    Besides the actor-critic pair this also builds an "E" network in both the
    main and target scopes, with its own loss, gradients, optimizer and
    target-update ops.

    Args:
        reuse: when true, variables in the 'o_stats', 'g_stats', 'main' and
            'target' scopes are reused instead of created.

    Side effects: sets ``self.sess`` and many graph handles on ``self``,
    initializes all global variables, and calls ``_sync_optimizers`` and
    ``_init_target_net``.

    NOTE(review): the indentation was reconstructed from a whitespace-collapsed
    source — confirm the extent of each ``with`` block against upstream.
    """
    logger.info("Creating a DDPG agent with action space %d x %s..." %
                (self.dimu, self.max_u))

    # Use the current default session if there is one, otherwise create one.
    self.sess = tf.get_default_session()
    if self.sess is None:
        self.sess = tf.InteractiveSession()
    if self.debug:
        self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
        self.sess.add_tensor_filter('filter_E_values', filter_E_values)

    # running averages
    with tf.variable_scope('o_stats') as vs:
        if reuse:
            vs.reuse_variables()
        self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess)
    with tf.variable_scope('g_stats') as vs:
        if reuse:
            vs.reuse_variables()
        self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess)

    # mini-batch sampling.
    batch = self.staging_tf.get()
    # Pair each staged tensor with its key, in stage_shapes order.
    batch_tf = OrderedDict([
        (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())
    ])
    batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

    # networks
    with tf.variable_scope('main') as vs:
        if reuse:
            vs.reuse_variables()
        self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
        self.main_e = self.create_e_network(batch_tf, net_type='main', **self.__dict__)
        vs.reuse_variables()
    with tf.variable_scope('target') as vs:
        if reuse:
            vs.reuse_variables()
        # The target networks are fed the next-step observation and goal.
        target_batch_tf = batch_tf.copy()
        target_batch_tf['o'] = batch_tf['o_2']
        target_batch_tf['g'] = batch_tf['g_2']
        self.target = self.create_actor_critic(target_batch_tf, net_type='target', **self.__dict__)
        self.target_e = self.create_e_network(target_batch_tf, net_type='target', **self.__dict__)
        vs.reuse_variables()
    # Main and target must have the same number of variables for the
    # pairwise assign ops built further down.
    assert len(self._vars("main")) == len(self._vars("target"))

    # loss functions
    # self.XX.pi_tf is the action policy we will use for exploration (TO CONFIRM)
    # self.XX.Q_pi_tf is the Q network used to train this policy
    # self.XX.Q_tf

    # loss function for the E values
    target_e_tf = self.gamma_e * self.target_e.E_tf
    self.E_loss_tf = tf.reduce_mean(
        tf.square(tf.stop_gradient(target_e_tf) - self.main_e.E_tf))

    # loss function for Q
    target_Q_pi_tf = self.target.Q_pi_tf
    # Upper bound is 0 when positive returns are clipped, else unbounded.
    clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
    target_tf = tf.clip_by_value(
        batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range)
    # loss function for Q_tf where we exclude target_tf from the gradient computation:
    self.Q_loss_tf = tf.reduce_mean(
        tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))
    # loss function for the action policy is that of the main Q_pi network:
    self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
    # add L2 regularization term from the policy itself:
    self.pi_loss_tf += self.action_l2 * tf.reduce_mean(
        tf.square(self.main.pi_tf / self.max_u))

    # define the gradients of the Q_loss and pi_loss wrt to their variables respectively
    Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))
    pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))
    E_grads_tf = tf.gradients(self.E_loss_tf, self._vars('main/E'))
    assert len(self._vars('main/Q')) == len(Q_grads_tf)
    assert len(self._vars('main/pi')) == len(pi_grads_tf)
    assert len(self._vars('main/E')) == len(E_grads_tf)

    # zip the gradients together with their respective variables
    # NOTE(review): on Python 3 these are one-shot iterators — confirm that
    # no consumer iterates them more than once.
    self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q'))
    self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi'))
    self.E_grads_vars_tf = zip(E_grads_tf, self._vars('main/E'))

    # flattened gradients and variables
    self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q'))
    self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi'))
    self.E_grad_tf = flatten_grads(grads=E_grads_tf, var_list=self._vars('main/E'))

    # optimizers (using MPI for parallel updates of the network (TO CONFIRM))
    self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False)
    self.E_adam = MpiAdam(self._vars('main/E'), scale_grad_by_procs=False)
    self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False)

    # polyak averaging used for the update of the target networks in both pi and Q nets
    self.main_vars = self._vars('main/Q') + self._vars('main/pi')
    self.target_vars = self._vars('target/Q') + self._vars('target/pi')
    self.stats_vars = self._global_vars('o_stats') + self._global_vars(
        'g_stats')
    # The E-network variables are tracked separately from the Q/pi lists.
    self.e_main_vars = self._vars('main/E')
    self.e_target_vars = self._vars('target/E')

    # operation to initialize the target nets at the main nets' values
    self.init_target_net_op = list(
        map(lambda v: v[0].assign(v[1]),
            zip(self.target_vars, self.main_vars)))
    self.init_e_target_net_op = list(
        map(lambda v: v[0].assign(v[1]),
            zip(self.e_target_vars, self.e_main_vars)))

    # operation to update the target nets from the main nets using polyak averaging
    self.update_target_net_op = list(
        map(
            lambda v: v[0].assign(self.polyak * v[0] +
                                  (1. - self.polyak) * v[1]),
            zip(self.target_vars, self.main_vars)))
    self.update_e_target_net_op = list(
        map(
            lambda v: v[0].assign(self.polyak * v[0] +
                                  (1. - self.polyak) * v[1]),
            zip(self.e_target_vars, self.e_main_vars)))

    # initialize all variables
    tf.variables_initializer(self._global_vars('')).run()
    # TODO(review): confirm what _sync_optimizers does; it is not visible here.
    self._sync_optimizers()
    self._init_target_net()
"""Minimal Keras script that runs a one-layer regression model under tfdbg.

Every batch is random data; the point is to exercise the tfdbg CLI wrapper
that is installed as the Keras backend session.
"""
import numpy as np
import tensorflow as tf
from keras.callbacks import TensorBoard
from keras.layers import Input, Dense
from keras.models import Model

# Just disables the warning, doesn't enable AVX/FMA
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import keras.backend as K
from tensorflow.python import debug as tf_debug

# Replace the backend session with the tfdbg CLI wrapper so that every
# session run issued by Keras goes through the debugger.
K.set_session(tf_debug.LocalCLIDebugWrapperSession(K.get_session()))

net_in = Input(shape=(3, ))
net_out = Dense(1)(net_in)
model = Model(net_in, net_out)
model.compile(loss='mse', optimizer='sgd')

for batch_no in range(3):
    X_train, Y_train = np.random.rand(32, 3), np.random.rand(32, 1)
    logs = model.train_on_batch(X_train, Y_train)
    print(logs)

    # `% 1` is always true, so validation runs after every training batch.
    if batch_no % 1 == 0:
        X_val, Y_val = np.random.rand(32, 3), np.random.rand(32, 1)
        # Evaluate only: the validation batch must not update the weights
        # (the previous code called train_on_batch here).
        logs = model.test_on_batch(X_val, Y_val)
def train():
    """Train the policy and value networks with batched policy-gradient updates.

    For each of ``CONFIG.n_iter`` iterations this collects
    ``CONFIG.batch_size`` episodes from the environment, computes discounted
    returns bootstrapped with the value network's estimate for the final
    observation, runs one update of both networks, writes summaries and saves
    a checkpoint.

    Restores the latest checkpoint from ``CONFIG.dpath_checkpoint`` when one
    exists. All episodes in a batch are stacked into dense arrays, so they are
    expected to have equal length.

    NOTE(review): the indentation was reconstructed from a whitespace-collapsed
    source; the per-iteration placement of ``saver.save`` is an assumption.
    """
    env, inc_gs, policy_net, value_net = _setup()
    writer = tf.summary.FileWriter(os.path.join(CONFIG.dpath_model, 'train'))
    saver = tf.train.Saver(keep_checkpoint_every_n_hours=2.0, max_to_keep=10)
    if CONFIG.histogram_parameters:
        _add_histograms()

    with tf.Session() as sess:
        writer.add_graph(sess.graph)
        if CONFIG.debug:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        tf.global_variables_initializer().run()
        if CONFIG.load_params_torch:
            sess.run(policy_net.assign_ops)

        latest_checkpoint = tf.train.latest_checkpoint(CONFIG.dpath_checkpoint)
        if latest_checkpoint:
            print("Loading model checkpoint: {}".format(latest_checkpoint))
            saver.restore(sess, latest_checkpoint)
        env.set_session(sess)

        # Build the merged-summary op once. Calling merge_all inside the loop
        # added a new op to the graph on every iteration.
        summaries = tf.summary.merge_all(tf.GraphKeys.SUMMARIES)

        for batch_idx in range(CONFIG.n_iter):
            paths = []
            for ep_idx in range(CONFIG.batch_size):
                obs = env.reset()
                observations, actions, rewards = [], [], []
                # Render only the first episode of every 10th batch.
                render_this_episode = (not paths and (batch_idx % 10 == 0)
                                       and CONFIG.render)
                steps = 0
                recurrent_state = get_recurrent_zero_state(obs, policy_net)
                for _ in itertools.count():
                    if render_this_episode:
                        env.render()
                        time.sleep(0.05)
                    observations.append(obs)
                    feed = {
                        policy_net.observations: obs[None, None],
                        policy_net.state_placeholders[0]: recurrent_state[0],
                        policy_net.state_placeholders[1]: recurrent_state[1]
                    }
                    sampled_action, recurrent_state, _ = sess.run(
                        [policy_net.sampled, policy_net.new_state, inc_gs],
                        feed_dict=feed)
                    actions.append(sampled_action[0][0])
                    obs, rew, done, _ = env.step(sampled_action)
                    rewards.append(rew)
                    steps += 1
                    if done:
                        break
                paths.append({
                    "observation": np.array(observations),
                    "reward": np.array(rewards),
                    "action": np.array(actions),
                    "last_observation": obs,
                })

            # Build arrays for observation, action for the policy gradient
            # update by concatenating across paths
            observations = np.array([path["observation"] for path in paths])
            actions = np.array([path["action"] for path in paths])

            qs, final_obs = [], []
            # Multiply each step in path by appropriate discount
            for path in paths:
                n = path["reward"].shape[0]
                discounts = CONFIG.discount**np.arange(n)
                discounted_rew_seq = discounts * path["reward"]
                # Reverse cumulative sum gives the discounted reward-to-go;
                # dividing by the discount re-bases it at each time step.
                q_path = np.cumsum(discounted_rew_seq[::-1])[::-1] / discounts
                qs.append(q_path)
                final_obs.append(path['last_observation'])
            qs = np.array(qs)
            final_obs = np.array(final_obs)

            # Value estimates for every visited observation plus the final one.
            observations_w_final = np.concatenate(
                [observations, final_obs[:, None]], axis=1)
            feed = {value_net.observations: observations_w_final}
            vals = sess.run(value_net.predicted_values, feed)
            vals, vals_final = np.split(vals, [-1], axis=1)

            # Bootstrap with the final-state value, discounted by the number
            # of steps remaining: gamma ** (T - t). The previous code had base
            # and exponent swapped ((T - t) ** gamma).
            n_timesteps = qs.shape[1]
            steps_to_end = np.arange(n_timesteps, 0, -1)
            vals_final_disc = vals_final * (CONFIG.discount**steps_to_end)
            qs = qs + vals_final_disc

            vals_norm = _normalize(vals, qs.mean(), qs.std())
            advs = qs - vals_norm
            if not CONFIG.dont_normalize_advantages:
                advs = _normalize(advs)

            recurrent_state = get_recurrent_zero_state(observations, policy_net)
            feed = {
                policy_net.observations: observations,
                policy_net.actions: actions,
                policy_net.targets: advs,
                value_net.observations: observations,
                value_net.targets: _normalize(qs),
                policy_net.state_placeholders[0]: recurrent_state[0],
                policy_net.state_placeholders[1]: recurrent_state[1]
            }
            _, _, summaries_all = sess.run([
                policy_net.update_op,
                value_net.update_op,
                summaries,
            ], feed_dict=feed)

            writer.add_summary(summaries_all, global_step=batch_idx + 1)
            add_path_summaries(batch_idx, paths, writer)
            writer.flush()
            saver.save(sess,
                       os.path.join(CONFIG.dpath_model, 'checkpoints', 'model'))
def main(argv=None):
    """Build the transcription model, then train it with periodic validation.

    Args:
        argv: command-line style argument list. ``"--local"`` selects local
            data and a local TensorBoard directory; ``"--debug"`` wraps the
            session in the tfdbg CLI. ``None`` is treated as an empty list.

    NOTE(review): the indentation was reconstructed from a whitespace-collapsed
    source. Which statements sit inside each ``tf.name_scope`` affects only op
    and summary names — confirm against upstream if the tags matter.
    """
    if argv is None:
        # `"--local" in None` would raise TypeError.
        argv = []
    local = "--local" in argv

    example_batch, label_batch = input_pipeline(False, local)
    # 5% validation data. Figure out a better way to halt (or re-use?). Try num_epochs=None.
    v_example, v_label = input_pipeline(True, local, batch_size=BATCH_SIZE * 20)

    with tf.variable_scope("model") as scope:
        readout, keep_prob = model.get_transcription_model(example_batch)
        # Use the same weights and biases for the validation model.
        scope.reuse_variables()
        v_readout, v_keep_prob = model.get_transcription_model(v_example)

    # Don't use softmax since the outputs aren't mutually exclusive.
    with tf.name_scope('train'):
        loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=label_batch,
                                                    logits=readout))
        v_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=v_label,
                                                    logits=v_readout))
        tf.summary.scalar('loss', loss)
        tf.summary.scalar('v_loss', v_loss)
        training_step = tf.train.AdamOptimizer(1e-4).minimize(loss)

    def interpretation(logit, cutoff):
        # Returns a tensor that represents how we are interpreting the output
        # of our model.
        # |logit| is a [1, 88] tensor representing the likelihood that each
        # note is present in a given sample.
        # |cutoff| is the probability at which we will consider a note to be
        # present.
        with tf.name_scope('interpretation'):
            return tf.cast(tf.greater(logit, cutoff), tf.float32)

    # For now, analyze successfulness as number of predictions it gets
    # exactly right. We can revisit this later.
    with tf.name_scope('accuracy'):
        with tf.name_scope('predictions'):
            sigmoid = tf.sigmoid(v_readout)
            # For each note, did it get the right prediction?
            # Should be >90% if it just predicts all 0s.
            correct_predictions7 = tf.cast(
                tf.equal(interpretation(sigmoid, 0.7), v_label), tf.float32)
            correct_predictions5 = tf.cast(
                tf.equal(interpretation(sigmoid, 0.5), v_label), tf.float32)
            correct_predictions9 = tf.cast(
                tf.equal(interpretation(sigmoid, 0.9), v_label), tf.float32)
        # Did it get the right prediction for every note? Look if the
        # min value is 1.
        correct_prediction7 = tf.equal(
            tf.reduce_min(correct_predictions7, 1), 1)
        correct_prediction5 = tf.equal(
            tf.reduce_min(correct_predictions5, 1), 1)
        correct_prediction9 = tf.equal(
            tf.reduce_min(correct_predictions9, 1), 1)
        accuracy7 = tf.reduce_mean(tf.cast(correct_prediction7, tf.float32))
        accuracy5 = tf.reduce_mean(tf.cast(correct_prediction5, tf.float32))
        accuracy9 = tf.reduce_mean(tf.cast(correct_prediction9, tf.float32))
    tf.summary.scalar('accuracy7', accuracy7)
    tf.summary.scalar('accuracy5', accuracy5)
    tf.summary.scalar('accuracy9', accuracy9)

    summary = tf.summary.merge_all()

    with tf.Session() as sess:
        tb_path = '/tmp/tensorboard/' if local else 'gs://audionn-data/tensorboard/'
        summary_writer = tf.summary.FileWriter(tb_path, sess.graph)
        if "--debug" in argv:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)
            sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        coord = tf.train.Coordinator()
        # Start imperative steps.
        threads = tf.train.start_queue_runners(coord=coord)

        num_steps = _get_training_length(
            _training_data_path(local)) / BATCH_SIZE
        # print() calls instead of Python 2 print statements: the latter are
        # syntax errors on Python 3.
        print("BEGINNING TRANING...")
        step = 0
        while step < num_steps and not coord.should_stop():
            step += 1
            print(step)
            sess.run([training_step], feed_dict={keep_prob: 0.5})
            if step % 1000 == 0:
                l, s, a5, a7, a9 = sess.run(
                    [loss, summary, accuracy5, accuracy7, accuracy9],
                    feed_dict={
                        keep_prob: 0.5,
                        v_keep_prob: 1.0
                    })
                # Accuracies are fractions in [0, 1]; "%d" truncated them to 0.
                print('Step: %d Loss: %f\n Accuracies: %f, %f, %f' %
                      (step, l, a5, a7, a9))
                summary_writer.add_summary(s, step)
        print("DONE TRAINING")
        coord.request_stop()
        coord.join(threads)
def __init__(self, policy, args):
    """Build the A2C graph, restore or initialize weights, and expose closures.

    Args:
        policy: callable that builds a policy network; called once for
            single-step acting and once (with shared weights) for training.
        args: settings object. Attributes read: ``nhwc``,
            ``value_loss_weight``, ``entropy_weight``, ``lr``, ``max_to_keep``,
            ``envs``, ``steps_per_batch``, ``res``, ``ckpt_path``,
            ``summary_writer``, ``debug``, ``tensorboard_debug_address``.

    Raises:
        ValueError: if both ``debug`` and ``tensorboard_debug_address`` are set.

    Sets ``self.train_step`` and installs ``self.train``, ``self.step``,
    ``self.get_value``, ``self.save``, ``self.initial_state`` and
    ``self.get_global_step``.
    """
    network_data_format = 'NHWC' if args.nhwc else 'NCHW'
    value_loss_weight = args.value_loss_weight
    entropy_weight = args.entropy_weight
    learning_rate = args.lr
    max_to_keep = args.max_to_keep
    nenvs = args.envs
    nsteps = args.steps_per_batch
    res = args.res
    checkpoint_path = args.ckpt_path
    summary_writer = args.summary_writer
    debug = args.debug
    debug_tb_adress = args.tensorboard_debug_address

    print('\n### A2C Agent #######')
    print(f'# policy = {policy}')
    print(f'# network_data_format = {network_data_format}')
    print(f'# value_loss_weight = {value_loss_weight}')
    print(f'# entropy_weight = {entropy_weight}')
    print(f'# learning_rate = {learning_rate}')
    print(f'# max_to_keep = {max_to_keep}')
    print(f'# nenvs = {nenvs}')
    print(f'# nsteps = {nsteps}')
    print(f'# res = {res}')
    print(f'# checkpoint_path = {checkpoint_path}')
    print(f'# debug = {debug}')
    print(f'# debug_tb_adress = {debug_tb_adress}')
    print('######################\n')

    max_gradient_norm = 1.0

    tf.reset_default_graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    if debug and debug_tb_adress:
        raise ValueError(
            "The --debug and --tensorboard_debug_address flags are mutually "
            "exclusive.")
    if debug:
        sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
    elif debug_tb_adress:
        sess = tf_debug.TensorBoardDebugWrapperSession(
            sess, debug_tb_adress)

    nbatch = nenvs * nsteps
    ch = get_input_channels()
    ob_space = {
        'screen': [None, res, res, ch['screen']],
        'minimap': [None, res, res, ch['minimap']],
        'flat': [None, ch['flat']],
        'available_actions': [None, ch['available_actions']]
    }

    # The second call passes reuse=True so both models share weights.
    step_model = policy(sess, ob_space=ob_space, nbatch=nenvs, nsteps=1,
                        reuse=None, data_format=network_data_format)
    train_model = policy(sess, ob_space=ob_space, nbatch=nbatch, nsteps=nsteps,
                         reuse=True, data_format=network_data_format)

    # Define placeholders
    fn_id = tf.placeholder(tf.int32, [None], name='fn_id')
    arg_ids = {
        k: tf.placeholder(tf.int32, [None], name='arg_{}_id'.format(k.id))
        for k in train_model.policy[1].keys()
    }
    ACTIONS = (fn_id, arg_ids)
    ADVS = tf.placeholder(tf.float32, [None], name='adv')
    RETURNS = tf.placeholder(tf.float32, [None], name='returns')

    # Define Loss
    log_probs = compute_policy_log_probs(train_model.AV_ACTS,
                                         train_model.policy, ACTIONS)
    policy_loss = -tf.reduce_mean(ADVS * log_probs)
    value_loss = tf.reduce_mean(
        tf.square(RETURNS - train_model.value) / 2.)
    entropy = compute_policy_entropy(train_model.AV_ACTS,
                                     train_model.policy, ACTIONS)
    loss = policy_loss + value_loss * value_loss_weight - entropy * entropy_weight

    # Define Optimizer
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(learning_rate, global_step,
                                               10000, 0.94)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                          decay=0.99, epsilon=1e-5)
    train_op = layers.optimize_loss(loss=loss,
                                    global_step=global_step,
                                    optimizer=optimizer,
                                    clip_gradients=max_gradient_norm,
                                    learning_rate=None,
                                    name="train_op")

    tf.summary.scalar('entropy', entropy)
    tf.summary.scalar('loss', loss)
    tf.summary.scalar('loss/policy', policy_loss)
    tf.summary.scalar('loss/value', value_loss)
    tf.summary.scalar('rl/value', tf.reduce_mean(train_model.value))
    tf.summary.scalar('rl/returns', tf.reduce_mean(RETURNS))
    tf.summary.scalar('rl/advs', tf.reduce_mean(ADVS))
    summary_writer.add_graph(sess.graph)

    variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    saver = tf.train.Saver(variables, max_to_keep=max_to_keep)
    train_summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
    train_summary_op = tf.summary.merge(train_summaries)

    # Load checkpoints if exist. get_checkpoint_state returns None when the
    # directory exists but holds no checkpoint; fall back to a fresh
    # initialization instead of failing on `None.model_checkpoint_path`.
    ckpt = None
    if os.path.exists(checkpoint_path):
        ckpt = tf.train.get_checkpoint_state(checkpoint_path)
    if ckpt is not None and ckpt.model_checkpoint_path:
        # The step counter is the numeric suffix of the checkpoint file name.
        self.train_step = int(ckpt.model_checkpoint_path.split('-')[-1])
        saver.restore(sess, ckpt.model_checkpoint_path)
        print("Loaded agent at episode {} (step {})".format(
            self.train_step // nsteps, self.train_step))
    else:
        self.train_step = 0
        sess.run(tf.variables_initializer(variables))

    def train(obs, states, actions, returns, advs, summary=False):
        """Run one optimization step.

        Args:
          obs: dict of preprocessed observation arrays, with num_batch
            elements in the first dimensions.
          states: recurrent state to feed, or None for non-recurrent policies.
          actions: see `compute_total_log_probs`.
          returns: array of shape [num_batch].
          advs: array of shape [num_batch].
          summary: Whether to return a summary.

        Returns:
          (global_step, loss, Summary) when `summary` is true, else None.
        """
        feed_dict = {
            train_model.SCREEN: obs['screen'],
            train_model.MINIMAP: obs['minimap'],
            train_model.FLAT: obs['flat'],
            train_model.AV_ACTS: obs['available_actions'],
            RETURNS: returns,
            ADVS: advs,
            ACTIONS[0]: actions[0]
        }
        feed_dict.update({v: actions[1][k] for k, v in ACTIONS[1].items()})
        if states is not None:  # For recurrent polices
            feed_dict.update({train_model.STATES: states})

        self.train_step += 1

        if summary:
            _, _step, _loss, _summary = sess.run(
                [train_op, global_step, loss, train_summary_op],
                feed_dict=feed_dict)
            return _step, _loss, _summary
        sess.run([train_op, loss], feed_dict=feed_dict)
        return None

    def save(path, step=None):
        # `step` is accepted for interface compatibility but unused; the
        # checkpoint is tagged with the graph's global_step.
        os.makedirs(path, exist_ok=True)
        print("Saving agent to %s, step %d" % (path, sess.run(global_step)))
        ckpt_path = os.path.join(path, 'model.ckpt')
        saver.save(sess, ckpt_path, global_step=global_step)

    def get_global_step():
        return sess.run(global_step)

    self.train = train
    self.step = step_model.step
    self.get_value = step_model.get_value
    self.save = save
    self.initial_state = step_model.initial_state
    self.get_global_step = get_global_step
def train(args):
    """Train the NTM one-shot learning model on Omniglot.

    Every 100 iterations a test batch is evaluated, its loss summary is
    written to TensorBoard, and per-position accuracies are printed. Every
    5000 iterations (except iteration 0) a checkpoint is saved. Each iteration
    ends with one training step.

    Args:
        args: settings object. Attributes read: ``image_width``,
            ``image_height``, ``n_train_classes``, ``n_test_classes``,
            ``debug``, ``restore_training``, ``save_dir``, ``model``,
            ``tensorboard_dir``, ``num_epoches``, ``n_classes``,
            ``batch_size``, ``seq_length``, ``augment``, ``label_type``.

    Raises:
        FileNotFoundError: if ``args.restore_training`` is set but no
            checkpoint is found under ``<save_dir>/<model>``.
    """
    model = NTMOneShotLearningModel(args)
    # NOTE(review): the dataset path is hard-coded to one developer's machine;
    # consider sourcing it from `args`.
    data_loader = OmniglotDataLoader(
        data_dir=
        "/Users/xavier.qiu/Documents/ricecourse/comp590Research/data/omniglot/images_background/",
        image_size=(args.image_width, args.image_height),
        n_train_classses=args.n_train_classes,
        n_test_classes=args.n_test_classes)
    checkpoint_dir = args.save_dir + '/' + args.model

    with tf.Session() as sess:
        if args.debug:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        if args.restore_training:
            saver = tf.train.Saver()
            ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
            # get_checkpoint_state returns None when nothing is found; fail
            # with a clear message instead of an AttributeError on None.
            if ckpt is None or not ckpt.model_checkpoint_path:
                raise FileNotFoundError(
                    "No checkpoint found in %s" % checkpoint_dir)
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            saver = tf.train.Saver(tf.global_variables())
            tf.global_variables_initializer().run()
        train_writer = tf.summary.FileWriter(
            args.tensorboard_dir + '/' + args.model, sess.graph)
        print(args)
        print("1st\t2nd\t3rd\t4th\t5th\t6th\t7th\t8th\t9th\t10th\tbatch\tloss")
        for b in range(args.num_epoches):
            # Test
            if b % 100 == 0:
                x_image, x_label, y = data_loader.fetch_batch(
                    args.n_classes, args.batch_size, args.seq_length,
                    type='test', augment=args.augment,
                    label_type=args.label_type)
                feed_dict = {
                    model.x_image: x_image,
                    model.x_label: x_label,
                    model.y: y
                }
                # Fetch output, loss and summary in a single run so the test
                # batch goes through the graph once.
                output, learning_loss, merged_summary = sess.run(
                    [model.o, model.learning_loss,
                     model.learning_loss_summary],
                    feed_dict=feed_dict)
                train_writer.add_summary(merged_summary, b)
                # state_list = sess.run(model.state_list, feed_dict=feed_dict)  # For debugging
                # with open('state_long.txt', 'w') as f:
                #     print(state_list, file=f)
                accuracy = test_f(args, y, output)
                for accu in accuracy:
                    print('%.4f' % accu, end='\t')
                print('%d\t%.4f' % (b, learning_loss))

            # Save model
            if b % 5000 == 0 and b > 0:
                saver.save(sess, checkpoint_dir + '/model.tfmodel',
                           global_step=b)

            # Train
            x_image, x_label, y = data_loader.fetch_batch(
                args.n_classes, args.batch_size, args.seq_length,
                type='train', augment=args.augment,
                label_type=args.label_type)
            feed_dict = {
                model.x_image: x_image,
                model.x_label: x_label,
                model.y: y
            }
            sess.run(model.train_op, feed_dict=feed_dict)
def wrapper_debug(sess):
    """Return `sess` wrapped in a tfdbg CLI session with an inf/nan filter.

    The wrapper is created with ``thread_name_filter="MainThread$"`` and the
    ``has_inf_or_nan`` tensor filter is registered on it.

    Args:
        sess: the session to wrap.

    Returns:
        The ``LocalCLIDebugWrapperSession`` wrapping `sess`.
    """
    debug_sess = tf_debug.LocalCLIDebugWrapperSession(
        sess, thread_name_filter="MainThread$")
    debug_sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
    return debug_sess
def train(self):
    """Run one PPO training iteration and return its ``TrainingResult``.

    Steps: broadcast the current model weights to the agents, collect
    rollouts, standardize advantages (or returns when ``use_gae`` is off),
    shuffle and load the data into the model, run ``num_sgd_iter`` passes of
    minibatch SGD, then adapt ``self.kl_coeff`` from the KL of the last pass.

    Returns:
        TrainingResult built from the experiment id, the iteration index,
        the total reward, the mean trajectory length and an ``info`` dict
        with KL and timing statistics.
    """
    agents = self.agents
    config = self.config
    model = self.model
    j = self.j
    self.j += 1

    print("===> iteration", self.j)

    iter_start = time.time()
    weights = ray.put(model.get_weights())
    # Plain loop: the remote calls are issued for their side effect only.
    for a in agents:
        a.load_weights.remote(weights)
    trajectory, total_reward, traj_len_mean = collect_samples(
        agents, config)
    print("total reward is ", total_reward)
    print("trajectory length mean is ", traj_len_mean)
    print("timesteps:", trajectory["dones"].shape[0])
    if self.file_writer:
        traj_stats = tf.Summary(value=[
            tf.Summary.Value(
                tag="ppo/rollouts/mean_reward",
                simple_value=total_reward),
            tf.Summary.Value(
                tag="ppo/rollouts/traj_len_mean",
                simple_value=traj_len_mean)])
        self.file_writer.add_summary(traj_stats, self.global_step)
    # NOTE(review): indentation was lost in this file; the increment is
    # taken to be unconditional -- confirm.
    self.global_step += 1

    def standardized(value):
        # Divide by the maximum of value.std() and 1e-4
        # to guard against the case where all values are equal
        return (value - value.mean()) / max(1e-4, value.std())

    if config["use_gae"]:
        trajectory["advantages"] = standardized(trajectory["advantages"])
    else:
        trajectory["returns"] = standardized(trajectory["returns"])

    rollouts_end = time.time()
    print("Computing policy (iterations=" + str(config["num_sgd_iter"]) +
          ", stepsize=" + str(config["sgd_stepsize"]) + "):")
    names = [
        "iter", "total loss", "policy loss", "vf loss", "kl", "entropy"]
    print(("{:>15}" * len(names)).format(*names))
    trajectory = shuffle(trajectory)
    shuffle_end = time.time()
    tuples_per_device = model.load_data(
        trajectory, j == 0 and config["full_trace_data_load"])
    load_end = time.time()
    rollouts_time = rollouts_end - iter_start
    shuffle_time = shuffle_end - rollouts_end
    load_time = load_end - shuffle_end

    # Prepare to drop into the debugger. This is done once, before the SGD
    # loop: wrapping inside the loop would stack a new debug wrapper around
    # the previous one on every SGD pass.
    if j == config["tf_debug_iteration"]:
        model.sess = tf_debug.LocalCLIDebugWrapperSession(model.sess)

    sgd_time = 0
    for i in range(config["num_sgd_iter"]):
        sgd_start = time.time()
        num_batches = (
            int(tuples_per_device) // int(model.per_device_batch_size))
        loss, policy_loss, vf_loss, kl, entropy = [], [], [], [], []
        permutation = np.random.permutation(num_batches)
        for batch_index in range(num_batches):
            full_trace = (
                i == 0 and j == 0 and
                batch_index == config["full_trace_nth_sgd_batch"])
            batch_loss, batch_policy_loss, batch_vf_loss, batch_kl, \
                batch_entropy = model.run_sgd_minibatch(
                    permutation[batch_index] * model.per_device_batch_size,
                    self.kl_coeff, full_trace, self.file_writer)
            loss.append(batch_loss)
            policy_loss.append(batch_policy_loss)
            vf_loss.append(batch_vf_loss)
            kl.append(batch_kl)
            entropy.append(batch_entropy)
        loss = np.mean(loss)
        policy_loss = np.mean(policy_loss)
        vf_loss = np.mean(vf_loss)
        kl = np.mean(kl)
        entropy = np.mean(entropy)
        sgd_end = time.time()
        print(
            "{:>15}{:15.5e}{:15.5e}{:15.5e}{:15.5e}{:15.5e}".format(
                i, loss, policy_loss, vf_loss, kl, entropy))

        # Only the final SGD pass is reported to TensorBoard.
        values = []
        if i == config["num_sgd_iter"] - 1:
            metric_prefix = "ppo/sgd/final_iter/"
            values.append(tf.Summary.Value(
                tag=metric_prefix + "kl_coeff",
                simple_value=self.kl_coeff))
            values.extend([
                tf.Summary.Value(
                    tag=metric_prefix + "mean_entropy",
                    simple_value=entropy),
                tf.Summary.Value(
                    tag=metric_prefix + "mean_loss",
                    simple_value=loss),
                tf.Summary.Value(
                    tag=metric_prefix + "mean_kl",
                    simple_value=kl)])
            if self.file_writer:
                sgd_stats = tf.Summary(value=values)
                self.file_writer.add_summary(sgd_stats, self.global_step)
        self.global_step += 1
        sgd_time += sgd_end - sgd_start

    # Adapt the KL penalty coefficient from the KL of the last SGD pass.
    if kl > 2.0 * config["kl_target"]:
        self.kl_coeff *= 1.5
    elif kl < 0.5 * config["kl_target"]:
        self.kl_coeff *= 0.5

    sample_throughput = len(trajectory["observations"]) / sgd_time
    info = {
        "kl_divergence": kl,
        "kl_coefficient": self.kl_coeff,
        "rollouts_time": rollouts_time,
        "shuffle_time": shuffle_time,
        "load_time": load_time,
        "sgd_time": sgd_time,
        "sample_throughput": sample_throughput
    }

    print("kl div:", kl)
    print("kl coeff:", self.kl_coeff)
    print("rollouts time:", rollouts_time)
    print("shuffle time:", shuffle_time)
    print("load time:", load_time)
    print("sgd time:", sgd_time)
    print("sgd examples/s:", sample_throughput)
    print("total time so far:", time.time() - self.start_time)

    result = TrainingResult(
        self.experiment_id.hex, j, total_reward, traj_len_mean, info)

    return result
def main():
    """Build the dataset and WaveNet training graph from the CLI options and train.

    Reads the architecture and hyper-parameter JSON files named by the
    arguments, applies command-line overrides, creates the session (graph
    mode) or enables eager execution, builds the dataset and network, then
    runs the training loop until ``args.max_steps``, saving checkpoints every
    ``args.save_interval`` steps.

    Exits with status 1 on inconsistent arguments.

    NOTE(review): indentation was lost in this file; the extent of the
    ``ExitStack`` block and of the eager-mode ``else`` branch below are
    reconstructed -- confirm against the original.
    """
    args = get_args()

    import json
    import sys
    from sys import stderr

    with open(args.arch_file, 'r') as fp:
        arch = json.load(fp)

    with open(args.par_file, 'r') as fp:
        par = json.load(fp)

    # args consistency checks
    # sys.exit is used rather than the bare `exit` builtin, which is only
    # installed by the `site` module.
    if args.num_global_cond is None and 'n_gc_category' not in arch:
        print(
            'Error: must provide n_gc_category in ARCH_FILE, or --num-global-cond',
            file=stderr)
        sys.exit(1)

    if args.tf_eager and args.tf_debug:
        print('Error: --tf-debug and --tf-eager cannot both be set',
              file=stderr)
        sys.exit(1)

    # Heavy imports are deferred until the arguments have been validated.
    import tmodel
    import data
    import tensorflow as tf
    import contextlib
    from tensorflow.python import debug as tf_debug
    from tensorflow.python.client import timeline
    import tests
    from os.path import join as path_join

    config = tf.ConfigProto()
    #config.gpu_options.per_process_gpu_memory_fraction = 0.2
    # config.log_device_placement = True
    #config.gpu_options.allow_growth = True
    #config.allow_soft_placement = True

    if args.tf_eager:
        tf.enable_eager_execution(config=config)

    # Overrides
    if args.batch_size is not None:
        par['batch_sz'] = args.batch_size

    if args.slice_size is not None:
        par['slice_sz'] = args.slice_size

    if args.l2_factor is not None:
        par['l2_factor'] = args.l2_factor

    if args.learning_rate is not None:
        par['learning_rate'] = args.learning_rate

    if args.tf_eager:
        sess = None
    else:
        sess = tf.Session(config=config)
        print('Created tf.Session.', file=stderr)

    from functools import reduce
    mel_hop_sz = reduce(lambda x, y: x * y, arch['lc_upsample'])

    dset_ckpt = '{}.dset'.format(args.ckpt_path)
    dset = data.MaskedSliceWav(sess, args.sam_file, par['sample_rate'],
                               par['slice_sz'], par['prefetch_sz'],
                               arch['n_lc_in'], mel_hop_sz, par['batch_sz'],
                               par['n_keep_checkpoints'], dset_ckpt,
                               args.resume_step or 0)

    dset.init_sample_catalog()

    if args.num_global_cond is not None:
        if args.num_global_cond < dset.get_max_id():
            print(
                'Error: --num-global-cond must be >= {}, the highest ID in the dataset.'
                .format(dset.get_max_id()),
                file=stderr)
            sys.exit(1)
        else:
            arch['n_gc_category'] = args.num_global_cond

    # tfdbg can't run if this is before dset.wav_dataset call
    if args.tf_debug:
        sess = tf_debug.LocalCLIDebugWrapperSession(sess)

    net_ckpt = '{}.net'.format(args.ckpt_path)
    net = tmodel.WaveNetTrain(**arch,
                              batch_sz=par['batch_sz'],
                              l2_factor=par['l2_factor'],
                              add_summary=par['add_summary'],
                              n_keep_checkpoints=par['n_keep_checkpoints'],
                              ckpt_path=net_ckpt,
                              resume_step=args.resume_step or 0,
                              n_valid_total=par['n_valid_total'],
                              sess=sess,
                              print_interval=args.progress_interval)

    # this is where dset is annoyingly dependent on net
    dset.set_receptive_field_size(net.get_recep_field_sz())
    dset.build()
    dset.init_vars()

    dev_string = '/cpu:0' if args.cpu_only else '/gpu:0'

    with contextlib.ExitStack() as stack:
        if args.prof_dir is not None:
            ctx = tf.contrib.tfprof.ProfileContext(args.prof_dir)
            stack.enter_context(ctx)
        #stack.enter_context(tf.device(dev_string))

        optimizer = tf.train.AdamOptimizer(learning_rate=par['learning_rate'])

        # create the ops just once if not in eager mode
        if not args.tf_eager:
            file_read_count, *data_ops = dset.get_op()
            grads_and_vars_op, loss_op = net.build(*data_ops)
            print('Built graph.', file=stderr)
            apply_grads_op = optimizer.apply_gradients(grads_and_vars_op)
            sess.run(tf.global_variables_initializer())
            print('Created gradients.', file=stderr)
        else:
            # must call this to create the variables
            itr = dset.get_itr()
            _, *data_ops = next(itr)
            _ = net.build(*data_ops)
            assert len(net.vars) > 0
            net.init_vars()

        if args.resume_step:
            net.restore()
            dset.restore()
            print('Restored net and dset from checkpoint', file=stderr)

        summary_op = tf.summary.merge_all() if args.add_summary else None
        if summary_op is not None and args.tb_dir is None:
            print('Error: must provide --tb-dir argument if ' +
                  'there are summaries in the graph',
                  file=stderr)
            sys.exit(1)

        if args.tb_dir:
            # Fixed: this used the undefined name `tb_dir`.
            fw = tf.summary.FileWriter(args.tb_dir, graph=sess.graph)
            make_flusher(fw)

        print('Starting training...', file=stderr)
        step = args.resume_step or 1
        wav_itr = dset.get_itr()
        while step < args.max_steps:
            if args.tf_eager:
                file_read_count, wav_input, mel_input, id_mask = next(wav_itr)
                grads_and_vars, loss = net.build(wav_input, mel_input,
                                                 id_mask)
                optimizer.apply_gradients(grads_and_vars)
            else:
                # Step 5 is additionally run once with full tracing so the
                # run metadata (and optionally a Chrome timeline) can be
                # dumped; the regular step below still runs afterwards.
                if step == 5:
                    run_meta = tf.RunMetadata()
                    run_opts = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE,
                        output_partition_graphs=True)
                    _, _ = sess.run([apply_grads_op, loss_op],
                                    options=run_opts,
                                    run_metadata=run_meta)
                    with open('/tmp/run.txt', 'w') as out:
                        out.write(str(run_meta))

                    if args.timeline_file is not None:
                        fetched_timeline = timeline.Timeline(
                            run_meta.step_stats)
                        chrome_trace = fetched_timeline.generate_chrome_trace_format(
                        )
                        with open(args.timeline_file, 'w') as f:
                            f.write(chrome_trace)

                _, loss = sess.run([apply_grads_op, loss_op])

            if step % 1 == 0:
                # Fixed: the condition was `args.tf_eager and ...`, i.e. it
                # only fired when `sess` is None. Summaries are fetched
                # through the session, which exists in graph mode only.
                # NOTE(review): this is a separate session run from the
                # training step above -- confirm that is acceptable for the
                # input pipeline.
                if not args.tf_eager and summary_op is not None:
                    fw.add_summary(sess.run(summary_op), step)

            if step % args.save_interval == 0 and step != args.resume_step:
                net_save_path = net.save(step)
                dset_save_path = dset.save(step, file_read_count)
                print('Saved checkpoints to {} and {}'.format(
                    net_save_path, dset_save_path),
                      file=stderr)

            step += 1
def train(n_epochs, config, predict):
    """Train the question/answer matching model, or evaluate a saved one.

    The Amazon QA Electronics data is loaded and split 80/20 into train and
    test portions. With ``predict`` false the model is trained for
    ``n_epochs`` epochs, checkpointed every 50 batches, evaluated on the test
    split, and finally exported with ``tf.saved_model.simple_save``. With
    ``predict`` true the saved model is loaded and its top-5 accuracy on the
    test split is printed.

    Args:
        n_epochs: Number of passes over the training split.
        config: Session configuration passed to ``tf.Session``.
        predict: When true, skip training and evaluate the saved model.

    NOTE(review): indentation was lost in this file. The test-set evaluation
    is placed at the end of each epoch here -- confirm against the original.
    """
    encoder = "DAN"  #"DAN" or "transformer"
    questions, answers, word2index = datas.load_dataset(
        dir="../datasets/amazonQA",
        file_filter="qa_Electronics.json.gz",
        num_words=VOCABULARY_SIZE)
    size = len(questions)

    # Row 0 of the embedding matrix is a fixed all-zero vector; the remaining
    # VOCABULARY_SIZE rows are trainable.
    with tf.variable_scope("word2vec"):
        pretrained_embs = tf.get_variable(
            name="zeros",
            initializer=tf.constant_initializer(
                np.zeros([1, EMBEDDING_SIZE]), dtype=tf.float32),
            shape=[1, EMBEDDING_SIZE],
            trainable=False)
        train_embeddings = tf.get_variable(
            name="trainable",
            shape=[VOCABULARY_SIZE, EMBEDDING_SIZE],
            initializer=tf.random_uniform_initializer(-0.25, 0.25),
            dtype=tf.float32,
            trainable=True)
        embeddings = tf.concat([pretrained_embs, train_embeddings], axis=0)

    # Train / test / validation fractions (validation is currently empty).
    fractions = [0.8, 0.2, 0.0]
    # fractions = [0.1, 0.02, 0.1]
    l1 = int(fractions[0] * size)
    l2 = int((fractions[0] + fractions[1]) * size)
    train_questions, test_questions, validate_questions = questions[:l1], questions[
        l1:l2], questions[l2:]
    train_answers, test_answers, validate_answers = answers[:l1], answers[
        l1:l2], answers[l2:]

    pred = model_predication_op(embeddings)
    loss, training_op = model_train_op(pred)
    testing_op = pair_accuracy_op(pred)
    testing_op_top5 = pair_top_k_accuracy_op(pred, 5)
    testing_op_top10 = pair_top_k_accuracy_op(pred, 10)

    chkpoint_saver = tf.train.Saver(max_to_keep=50, )

    def next_feed(batch_questions, batch_answers):
        """Draw one padded batch and map it onto the model placeholders."""
        questions_batch, answers_batch = datas.get_sentence_batch(
            K, batch_questions, batch_answers)
        questions_batch, q_lens = datas.pad_batch(questions_batch)
        answers_batch, a_lens = datas.pad_batch(answers_batch)
        return {
            inputs: questions_batch,
            responses: answers_batch,
            inputs_lens: q_lens,
            responses_lens: a_lens
        }

    if not predict:
        with tf.Session(config=config) as sess:
            if DEBUG_MODE:
                from tensorflow.python import debug as tf_debug
                sess = tf_debug.LocalCLIDebugWrapperSession(sess)

                def my_filter_callable(datum, tensor):
                    # A filter that detects zero-valued scalars.
                    return len(tensor.shape) == 0 and tensor == 0.0

                sess.add_tensor_filter('my_filter', my_filter_callable)

            tf.global_variables_initializer().run()
            train_writer = tf.summary.FileWriter('./logs/train', sess.graph)
            for epoch in xrange(n_epochs):
                for i in xrange(len(train_questions) // K):
                    sess.run(training_op,
                             feed_dict=next_feed(train_questions,
                                                 train_answers))
                    if (i + 1) % 50 == 0:
                        # Fixed: the path was '.logs/train/qa_model', a
                        # different directory from the './logs/train' used
                        # by the summary writer above.
                        chkpoint_saver.save(
                            sess, './logs/train/qa_model',
                            tf.train.get_global_step().eval())

                accus_test, accus_test1, accus_test2 = [], [], []
                for i in xrange(len(test_questions) // K):
                    acc_test, acc_test1, acc_test2 = sess.run(
                        [testing_op, testing_op_top5, testing_op_top10],
                        feed_dict=next_feed(test_questions, test_answers))
                    accus_test.append(acc_test)
                    accus_test1.append(acc_test1)
                    accus_test2.append(acc_test2)
                    # print(i, "Test accuracy:", acc_test)

                # The means are plain Python-side numbers, so they are
                # written as simple-value summaries. Creating
                # tf.summary.scalar ops plus merge_all() here would add new
                # graph nodes on every evaluation and re-emit the summaries
                # of all previous evaluations.
                # NOTE(review): graph summaries defined elsewhere were
                # previously also picked up by merge_all(); confirm none are
                # relied upon.
                accuracy_summary = tf.Summary(value=[
                    tf.Summary.Value(
                        tag="top-1-accuracy",
                        simple_value=float(np.mean(accus_test))),
                    tf.Summary.Value(
                        tag="top-5-accuracy",
                        simple_value=float(np.mean(accus_test1))),
                    tf.Summary.Value(
                        tag="top-10-accuracy",
                        simple_value=float(np.mean(accus_test2)))])
                print("[Train] accuracy:", sum(accus_test) / len(accus_test))
                train_writer.add_summary(accuracy_summary,
                                         tf.train.get_global_step().eval())
            train_writer.close()

            with tf.variable_scope("word2vec", reuse=True):
                # embeddings = tf.get_variable("embeddings")
                print(sess.run(embeddings[:5, :]))
                # exit(0)
            Inputs = {
                "inputs": inputs,
                "responses": responses,
                "inputs_lens": inputs_lens,
                "responses_lens": responses_lens
            }
            Outputs = {"testing_op": testing_op}
            tf.saved_model.simple_save(sess, 'saved_model', Inputs, Outputs)

    # Load Model
    else:
        with tf.Session(config=config) as sess:
            if DEBUG_MODE:
                from tensorflow.python import debug as tf_debug
                sess = tf_debug.LocalCLIDebugWrapperSession(sess)
            tf.global_variables_initializer().run()
            tf.saved_model.loader.load(
                sess,
                [tag_constants.SERVING],
                'saved_model',
            )
            # graph = sess.graph
            # embeddings = graph.get_tensor_by_name('word2vec/embeddings:0')
            accus_test = []
            for i in xrange(len(test_questions) // K):
                acc_test = sess.run(testing_op_top5,
                                    feed_dict=next_feed(
                                        test_questions, test_answers))
                accus_test.append(acc_test)
            print("[Evaluate] Top-5 accuracy:",
                  sum(accus_test) / len(accus_test))
def _train(self):
    """Run one PPO training iteration and return the collected metrics.

    Steps: broadcast the local evaluator's weights to the remote evaluators,
    collect samples, standardize the advantages, shuffle and load the data,
    run ``num_sgd_iter`` passes of minibatch SGD, adapt ``self.kl_coeff``
    from the KL of the last pass, then synchronize filters.

    Returns:
        The result of ``self._fetch_metrics_from_remote_evaluators()`` with
        its ``info`` field replaced by a dict of KL and timing statistics.
    """
    agents = self.remote_evaluators
    config = self.config
    model = self.local_evaluator

    if (config["num_workers"] * config["min_steps_per_task"] >
            config["timesteps_per_batch"]):
        print(
            "WARNING: num_workers * min_steps_per_task > "
            "timesteps_per_batch. This means that the output of some "
            "tasks will be wasted. Consider decreasing "
            "min_steps_per_task or increasing timesteps_per_batch.")

    print("===> iteration", self.iteration)

    iter_start = time.time()
    weights = ray.put(model.get_weights())
    # Plain loop: the remote calls are issued for their side effect only.
    for a in agents:
        a.set_weights.remote(weights)
    samples = collect_samples(agents, config, self.local_evaluator)

    def standardized(value):
        # Divide by the maximum of value.std() and 1e-4
        # to guard against the case where all values are equal
        return (value - value.mean()) / max(1e-4, value.std())

    samples.data["advantages"] = standardized(samples["advantages"])

    rollouts_end = time.time()
    print("Computing policy (iterations=" + str(config["num_sgd_iter"]) +
          ", stepsize=" + str(config["sgd_stepsize"]) + "):")
    names = [
        "iter", "total loss", "policy loss", "vf loss", "kl", "entropy"]
    print(("{:>15}" * len(names)).format(*names))
    samples.shuffle()
    shuffle_end = time.time()
    tuples_per_device = model.load_data(
        samples, self.iteration == 0 and config["full_trace_data_load"])
    load_end = time.time()
    rollouts_time = rollouts_end - iter_start
    shuffle_time = shuffle_end - rollouts_end
    load_time = load_end - shuffle_end

    # Prepare to drop into the debugger. This is done once, before the SGD
    # loop: wrapping inside the loop would stack a new debug wrapper around
    # the previous one on every SGD pass.
    if self.iteration == config["tf_debug_iteration"]:
        model.sess = tf_debug.LocalCLIDebugWrapperSession(model.sess)

    sgd_time = 0
    for i in range(config["num_sgd_iter"]):
        sgd_start = time.time()
        num_batches = (
            int(tuples_per_device) // int(model.per_device_batch_size))
        loss, policy_loss, vf_loss, kl, entropy = [], [], [], [], []
        permutation = np.random.permutation(num_batches)
        for batch_index in range(num_batches):
            full_trace = (
                i == 0 and self.iteration == 0 and
                batch_index == config["full_trace_nth_sgd_batch"])
            batch_loss, batch_policy_loss, batch_vf_loss, batch_kl, \
                batch_entropy = model.run_sgd_minibatch(
                    permutation[batch_index] * model.per_device_batch_size,
                    self.kl_coeff, full_trace, self.file_writer)
            loss.append(batch_loss)
            policy_loss.append(batch_policy_loss)
            vf_loss.append(batch_vf_loss)
            kl.append(batch_kl)
            entropy.append(batch_entropy)
        loss = np.mean(loss)
        policy_loss = np.mean(policy_loss)
        vf_loss = np.mean(vf_loss)
        kl = np.mean(kl)
        entropy = np.mean(entropy)
        sgd_end = time.time()
        print(
            "{:>15}{:15.5e}{:15.5e}{:15.5e}{:15.5e}{:15.5e}".format(
                i, loss, policy_loss, vf_loss, kl, entropy))

        # Only the final SGD pass is reported to TensorBoard.
        # NOTE(review): indentation was lost in this file; the nesting of the
        # summary write and the global_step increment is reconstructed --
        # confirm.
        values = []
        if i == config["num_sgd_iter"] - 1:
            metric_prefix = "ppo/sgd/final_iter/"
            values.append(tf.Summary.Value(
                tag=metric_prefix + "kl_coeff",
                simple_value=self.kl_coeff))
            values.extend([
                tf.Summary.Value(
                    tag=metric_prefix + "mean_entropy",
                    simple_value=entropy),
                tf.Summary.Value(
                    tag=metric_prefix + "mean_loss",
                    simple_value=loss),
                tf.Summary.Value(
                    tag=metric_prefix + "mean_kl",
                    simple_value=kl)])
            if self.file_writer:
                sgd_stats = tf.Summary(value=values)
                self.file_writer.add_summary(sgd_stats, self.global_step)
        self.global_step += 1
        sgd_time += sgd_end - sgd_start

    # Adapt the KL penalty coefficient from the KL of the last SGD pass.
    if kl > 2.0 * config["kl_target"]:
        self.kl_coeff *= 1.5
    elif kl < 0.5 * config["kl_target"]:
        self.kl_coeff *= 0.5

    info = {
        "kl_divergence": kl,
        "kl_coefficient": self.kl_coeff,
        "rollouts_time": rollouts_time,
        "shuffle_time": shuffle_time,
        "load_time": load_time,
        "sgd_time": sgd_time,
        "sample_throughput": len(samples["observations"]) / sgd_time
    }

    FilterManager.synchronize(
        self.local_evaluator.filters, self.remote_evaluators)
    res = self._fetch_metrics_from_remote_evaluators()
    res = res._replace(info=info)
    return res
def main(self, args=None):
    """
    Program entry point: set everything up, then train or run a test mode
    """
    print('Welcome to DeepQA v0.1 !')
    print()
    print('TensorFlow detected: v{}'.format(tf.__version__))

    # Options and working directory (defaults to the current directory)
    self.args = self.parseArgs(args)
    self.args.rootDir = self.args.rootDir or os.getcwd()

    #tf.logging.set_verbosity(tf.logging.INFO) # DEBUG, INFO, WARN (default), ERROR, or FATAL

    # Refresh self.modelDir and self.globStep; must happen before
    # _getSummaryName is used below
    self.loadModelParams()

    self.textData = TextData(self.args)
    # TODO: Mode forcing the decoder input / visualising the predictions for
    # every vocabulary word or decoder input
    # TODO: Models are tied to the dataset they were trained on (maxLength
    # defines the vocabulary); a compatibility mode remapping
    # word2id/id2word would lift that restriction

    # Dataset-only run: nothing else to do
    if self.args.createDataset:
        print('Dataset created! Thanks for using this program')
        return

    # Model construction on the selected device
    with tf.device(self.getDevice()):
        self.model = Model(self.args, self.textData)

    # Summaries and checkpoints
    self.writer = tf.summary.FileWriter(self._getSummaryName())
    self.saver = tf.train.Saver(max_to_keep=200)

    # TODO: Fixed seed (careful with dataset shuffling: it has to happen
    # after the dataset is saved or runs are not reproducible once the
    # dataset is reloaded). Same question for random.shuffle.

    # Session: soft placement gives non-GPU ops a fallback device when the
    # GPU is forced; device placement logging stays off (too verbose)
    session_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False)
    self.sess = tf.Session(config=session_config)
    # TODO: Replace all sess by self.sess (not necessarily a good idea) ?

    if self.args.debug:
        debug_sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
        self.sess = debug_sess
        self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)

    print('Initialize variables...')
    self.sess.run(tf.global_variables_initializer())

    modes = Chatbot.TestMode

    # Restore an existing model if there is one; in ALL test mode the models
    # are loaded later, from predictTestset
    if self.args.test != modes.ALL:
        self.managePreviousModel(self.sess)

    # Optional initialisation from pre-trained word2vec vectors
    if self.args.initEmbeddings:
        self.loadEmbedding(self.sess)

    requested = self.args.test
    if not requested:
        self.mainTrain(self.sess)
    elif requested == modes.INTERACTIVE:
        self.mainTestInteractive(self.sess)
    elif requested == modes.ALL:
        print('Start predicting...')
        self.predictTestset(self.sess)
        print('All predictions done')
    elif requested == modes.DAEMON:
        print('Daemon mode, running in background...')
    else:
        # Should never happen
        raise RuntimeError('Unknown test mode: {}'.format(self.args.test))

    # The daemon keeps its session open; every other mode is done here
    if self.args.test != modes.DAEMON:
        self.sess.close()
        print("The End! Thanks for using this program")
def build(count_dim, sf, params, mod_args, debug):
    """Construct the autoencoder models described by ``params``.

    Builds an encoder, a decoder and the combined ``vae`` model, attaches the
    loss selected by ``params['model']`` ('gaussian' or 'zinb') and compiles
    it. When ``params['architecture'] == 'aae'`` a discriminator and a
    generator (encoder followed by the frozen discriminator) are built and
    compiled as well.

    Args:
        count_dim: Width of the count input / reconstructed output.
        sf: Size-factor helper; this function calls ``getInputs()``,
            ``usingSizeFactors()`` and ``usingDynamicSizeFactors()`` on it.
        params: Dict of model hyper-parameters (see the keys read below).
        mod_args: Dict; only ``mod_args['kl_beta_weight']`` is read here.
        debug: When truthy, tensors are wrapped in ``K.print_tensor`` and
            extra information is printed.

    Returns:
        Tuple ``(vae, encoder, decoder, discriminator, generator)``;
        ``discriminator`` and ``generator`` are ``None`` unless the
        architecture is 'aae'.

    NOTE(review): indentation was lost in this file; the nesting below is
    reconstructed from the syntax -- confirm against the original.
    """

    # Model loss calculations
    def gaussian_reconstruction_loss(inputs, outputs, output_dim, params):
        """Return the MSE or binary cross-entropy loss scaled by ``output_dim``."""
        # NOTE(review): any other value of params['reconstruction_loss']
        # leaves reconstruction_loss unassigned.
        if params['reconstruction_loss'] == 'mse':
            reconstruction_loss = mse(inputs, outputs)
        elif params['reconstruction_loss'] == 'binary_crossentropy':
            reconstruction_loss = binary_crossentropy(inputs, outputs)
        if debug:
            reconstruction_loss = K.print_tensor(
                reconstruction_loss, "\ngaussian reconstruction_loss")
        reconstruction_loss *= output_dim
        if debug:
            reconstruction_loss = K.print_tensor(
                reconstruction_loss, "\ngaussian reconstruction_loss scaled up")
        return reconstruction_loss

    def zinb_reconstruction_loss(y, mu, theta, pi, output_dim, params, debug):
        """Return the per-sample zero-inflated negative binomial loss.

        ``nb_case`` is used where ``y >= 1e-8`` and ``zero_case`` elsewhere;
        a ridge penalty ``params['ridge'] * pi**2`` is added and the result
        is summed over the last axis. ``output_dim`` is not used.
        """
        # Small constant keeping the log/lgamma/division arguments positive.
        eps = 1e-10
        if debug:
            y = K.print_tensor(y, "\ny:")
            mu = K.print_tensor(mu, "\nmu:")
            theta = K.print_tensor(theta, "\ntheta:")
            pi = K.print_tensor(pi, "\npi:")
        t1 = tf.lgamma(theta + eps) + tf.lgamma(y + 1.0) - tf.lgamma(y + theta + eps)
        t2 = (theta + y) * tf.log(1.0 + (mu / (theta + eps))) + (
            y * (tf.log(theta + eps) - tf.log(mu + eps)))
        if debug:
            t1 = K.print_tensor(t1, "\nt1:")
            t2 = K.print_tensor(t2, "\nt2:")
        nb_case = t1 + t2 - tf.log(1.0 - pi + eps)
        if debug:
            nb_case = K.print_tensor(nb_case, "\nnb_case:")
        zero_nb = tf.pow(theta / (theta + mu + eps), theta)
        if debug:
            zero_nb = K.print_tensor(zero_nb, "\nzero_nb:")
        zero_case = -tf.log(pi + ((1.0 - pi) * zero_nb) + eps)
        if debug:
            zero_case = K.print_tensor(zero_case, "\nzero_case:")
        result = tf.where(tf.less(y, 1e-8), zero_case, nb_case)
        ridge = params['ridge'] * tf.square(pi)
        end_result = result + ridge
        if debug:
            # result and ridge are re-bound after end_result was computed, so
            # only the end_result wrapper feeds the returned tensor.
            result = K.print_tensor(result, "\nresult:")
            ridge = K.print_tensor(ridge, "\nridge:")
            end_result = K.print_tensor(end_result, "\nend_result:")
        # https://github.com/keras-team/keras/blob/master/keras/losses.py
        #reconstruction_loss = K.mean(end_result, axis=-1)
        reconstruction_loss = K.sum(end_result, axis=-1)
        if debug:
            reconstruction_loss = K.print_tensor(
                reconstruction_loss, "\nzinb reconstruction_loss:")
        return reconstruction_loss

    def gaussian_kl_loss(z_mean, z_log_var, params, debug):
        """Return ``-0.5 * sum(1 + z_log_var - z_mean**2 - exp(z_log_var))`` over the last axis."""
        kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
        kl_loss = K.sum(kl_loss, axis=-1)
        kl_loss *= -0.5
        if debug:
            # z_mean and z_log_var are re-bound after use, so only the
            # kl_loss wrapper feeds the returned tensor.
            z_mean = K.print_tensor(z_mean, "\nz_mean")
            z_log_var = K.print_tensor(z_log_var, "\nz_log_var")
            kl_loss = K.print_tensor(kl_loss, "\nkl_loss")
        return kl_loss

    # https://stackoverflow.com/questions/41863814/kl-divergence-in-tensorflow
    # https://github.com/tensorflow/probability/blob/master/tensorflow_probability/python/distributions/kullback_leibler.py
    def gaussian_kl_divergence(m, v, params, debug):
        """Return KL(Normal(m[0], v[0]) || Normal(m[1], v[1]))."""
        ds = tf.contrib.distributions
        p = ds.Normal(loc=m[0], scale=v[0])
        q = ds.Normal(loc=m[1], scale=v[1])
        kl = ds.kl_divergence(p, q)
        if debug:
            kl = K.print_tensor(kl, "\nkl_divergence")
        return kl

    def gaussian_loss(inputs, outputs, z_mean, z_log_var, sf, z_mean_sf,
                      z_log_var_sf, sf_means, sf_log_vars, mod_args,
                      output_dim, params, debug):
        """Return ``(vae_loss, recon_loss, kl_sf)`` for the gaussian model.

        ``kl_sf`` is ``None`` unless dynamic size factors are in use.
        """
        if debug:
            kl_beta_weight = K.print_tensor(mod_args['kl_beta_weight'],
                                            "\nkl_beta_weight")
        else:
            kl_beta_weight = mod_args['kl_beta_weight']
        reconstruction_loss = gaussian_reconstruction_loss(
            inputs, outputs, output_dim, params)
        # The KL term is only added for the variational variant.
        if params['variational']:
            kl_loss = gaussian_kl_loss(z_mean, z_log_var, params, debug)
            total_loss = reconstruction_loss + kl_beta_weight * kl_loss
        else:
            total_loss = reconstruction_loss
        if debug:
            total_loss = K.print_tensor(total_loss, "\ngaussian total vae_loss")
        vae_loss = K.mean(total_loss)
        recon_loss = K.mean(reconstruction_loss)
        if debug:
            vae_loss = K.print_tensor(vae_loss, "\ngaussian vae_loss")
            recon_loss = K.print_tensor(recon_loss, "\ngaussian recon_loss")
        kl_sf = None
        if sf.usingDynamicSizeFactors():
            # Log-variances are converted to standard deviations.
            m = [sf_means, z_mean_sf]
            v = [K.sqrt(K.exp(sf_log_vars)), K.sqrt(K.exp(z_log_var_sf))]
            kl_sf = gaussian_kl_divergence(m, v, params, debug)
            if debug:
                kl_sf = K.print_tensor(kl_sf, "\ngaussian kl_sf")
            kl_sf = K.mean(kl_sf)
            if debug:
                kl_sf = K.print_tensor(kl_sf, "\ngaussian mean kl_sf")
        return vae_loss, recon_loss, kl_sf

    def zinb_loss(inputs, mu, dispersion, pi, z_mean, z_log_var, sf,
                  z_mean_sf, z_log_var_sf, sf_means, sf_log_vars, mod_args,
                  output_dim, params, debug):
        """Return ``(vae_loss, recon_loss, kl_sf)`` for the ZINB model.

        ``kl_sf`` is ``None`` unless dynamic size factors are in use.
        """
        if debug:
            kl_beta_weight = K.print_tensor(mod_args['kl_beta_weight'],
                                            "\nkl_beta_weight")
        else:
            kl_beta_weight = mod_args['kl_beta_weight']
        reconstruction_loss = zinb_reconstruction_loss(
            inputs, mu, dispersion, pi, output_dim, params, debug)
        # The KL term is only added for the variational variant.
        if params['variational']:
            kl_loss = gaussian_kl_loss(z_mean, z_log_var, params, debug)
            total_loss = reconstruction_loss + kl_beta_weight * kl_loss
        else:
            total_loss = reconstruction_loss
        if debug:
            total_loss = K.print_tensor(total_loss, "\nzinb total loss")
        vae_loss = K.mean(total_loss)
        recon_loss = K.mean(reconstruction_loss)
        if debug:
            vae_loss = K.print_tensor(vae_loss, "\nzinb vae_loss")
            recon_loss = K.print_tensor(recon_loss, "\nzinb recon_loss")
        kl_sf = None
        if sf.usingDynamicSizeFactors():
            # Log-variances are converted to standard deviations.
            m = [sf_means, z_mean_sf]
            v = [K.sqrt(K.exp(sf_log_vars)), K.sqrt(K.exp(z_log_var_sf))]
            kl_sf = gaussian_kl_divergence(m, v, params, debug)
            if debug:
                kl_sf = K.print_tensor(kl_sf, "\nzinb kl_sf")
            kl_sf = K.mean(kl_sf)
            if debug:
                kl_sf = K.print_tensor(kl_sf, "\nzinb mean kl_sf")
        return vae_loss, recon_loss, kl_sf

    # Model construction
    def layers(x, params, dims, nm, encoding=True):
        """Stack one hidden layer per entry of ``dims`` on ``x`` and return the result.

        'simple' layers are a single Dense with activation; 'augmented'
        layers are regularized Dense, BatchNormalization, then Activation.
        Layer names are prefixed with ``nm``.
        """
        assert params['layer_structure'] in ['simple', 'augmented']
        # Input dropout is applied on the encoding path only.
        if encoding and params['input_dropout'] > 1e-10:
            x = Dropout(params['input_dropout'], name=nm + '_input_dropout')(x)
        for i, d in enumerate(dims):
            if params['layer_structure'] == 'simple':
                x = Dense(d,
                          activation=params['act'],
                          kernel_initializer=params['init'],
                          name=nm + '_' + str(i))(x)
            elif params['layer_structure'] == 'augmented':
                # kernel_regularizer ??
                # Encoder and decoder use separate l1/l2 strengths.
                if encoding:
                    r = l1_l2(params['l1_enc'], params['l2_enc'])
                else:
                    r = l1_l2(params['l1'], params['l2'])
                x = Dense(d,
                          activation=None,
                          kernel_initializer=params['init'],
                          kernel_regularizer=r,
                          name=nm + '_' + str(i))(x)
                x = BatchNormalization(
                    center=True,
                    scale=False,
                    momentum=params['batchnorm_momentum'],
                    epsilon=params['batchnorm_epsilon'],
                    name=nm + '_norm_' + str(i))(x)
                x = Activation(params['act'], name=nm + '_act_' + str(i))(x)
            # Dropout after the last layer is skipped unless
            # params['last_decode_dropout'] is set.
            avoid_last_drop = False
            if i == len(
                    dims) - 1 and not params['last_decode_dropout']:
                avoid_last_drop = True
            #if encoding or (params ['decode_dropout'] and not avoid_last_drop):
            if not avoid_last_drop:
                x = Dropout(params['dropout'], name=nm + '_drop_' + str(i))(x)
        return x

    # This implements the reparameterization trick required in VAEs
    def sampling(args, operation):
        """Return ``z_mean + exp(0.5 * z_log_var) * epsilon`` with standard-normal ``epsilon``.

        ``operation`` is only used as a label suffix in the debug prints.
        """
        z_mean, z_log_var = args
        batch = K.shape(z_mean)[0]
        dim = K.int_shape(z_mean)[1]
        epsilon = K.random_normal(shape=(batch, dim), mean=0.0, stddev=1.0)
        if debug:
            z_mean = K.print_tensor(z_mean, "sampling" + operation + " z_mean")
            z_log_var = K.print_tensor(
                z_log_var, "sampling" + operation + " z_log_var")
            epsilon = K.print_tensor(epsilon,
                                     "sampling" + operation + " epsilon")
        latent_space = z_mean + K.exp(0.5 * z_log_var) * epsilon
        if debug:
            latent_space = K.print_tensor(
                latent_space, "sampling" + operation + " latent_space")
        return latent_space

    def build_encoder(original_dim, sf, params):
        """Build the encoder model.

        Returns ``(encoder, inputs, z_mean, z_log_var, z_mean_sf,
        z_log_var_sf, z_sf)``; entries that do not apply to the selected
        configuration stay ``None``.
        """
        z_mean, z_log_var, z_mean_sf, z_log_var_sf, z_sf = [
            None, None, None, None, None
        ]
        inputs = Input(shape=(original_dim, ), name='count_input')
        x = layers(inputs,
                   params,
                   params['hidden_structure'],
                   'encoder',
                   encoding=True)
        # Dynamic size factors get their own hidden stack and a 1-d latent.
        if sf.usingDynamicSizeFactors():
            x_sf = layers(inputs,
                          params,
                          params['hidden_structure'],
                          'encoder_sf',
                          encoding=False)
            z_mean_sf = Dense(1, name='z_mean_sf')(x_sf)
            z_log_var_sf = Dense(1, name='z_log_var_sf')(x_sf)
            z_sf = Lambda(sampling,
                          output_shape=(1, ),
                          arguments={'operation': '_sf'},
                          name='z_sf')([z_mean_sf, z_log_var_sf])
        if params['variational']:
            z_mean = Dense(params['latent_dim'], name='z_mean')(x)
            z_log_var = Dense(params['latent_dim'], name='z_log_var')(x)
            z = Lambda(sampling,
                       output_shape=(params['latent_dim'], ),
                       arguments={'operation': ''},
                       name='z')([z_mean, z_log_var])
            if sf.usingDynamicSizeFactors():
                encoder = Model(
                    inputs,
                    [z, z_mean, z_log_var, z_sf, z_mean_sf, z_log_var_sf],
                    name='encoder')
            else:
                encoder = Model(inputs, [z, z_mean, z_log_var], name='encoder')
        else:
            z = Dense(params['latent_dim'], name='z')(x)
            if sf.usingDynamicSizeFactors():
                encoder = Model(inputs, [z, z_sf, z_mean_sf, z_log_var_sf],
                                name='encoder')
            else:
                # 2 outputs to make the rest of the code cleaner for indexing
                encoder = Model(inputs, [z, z], name='encoder')
        return encoder, inputs, z_mean, z_log_var, z_mean_sf, z_log_var_sf, z_sf

    def build_decoder(original_dim, sf, z_sf, count_input, params, debug):
        """Build the decoder model.

        Returns ``(decoder, sf_l[0], sf_l[1])`` when dynamic size factors are
        in use, otherwise ``(decoder, None, None)``.
        """
        latent_inputs = Input(shape=(params['latent_dim'], ),
                              name='z_decoder_input')
        # The decoder mirrors the encoder's hidden structure.
        x = layers(latent_inputs,
                   params,
                   params['hidden_structure'][::-1],
                   'decoder',
                   encoding=False)
        sf_l = []
        if params['model'] == 'gaussian':
            # multiply each data point by size factors ?
            outputs = Dense(original_dim,
                            activation='sigmoid',
                            kernel_initializer=params['init'],
                            name='decoder_output')(x)
            sf_l = sf.getInputs()
        elif params['model'] == 'zinb':
            # Clipped activations for the mean and dispersion heads.
            MeanAct = lambda a: tf.clip_by_value(K.exp(a), 1e-5, 1e6)
            DispAct = lambda a: tf.clip_by_value(tf.nn.softplus(a), 1e-4, 1e4)
            # Multiplies l[0] by l[1] reshaped to a column.
            ColwiseMultLayer = Lambda(
                lambda l: l[0] * tf.reshape(l[1], (-1, 1)),
                name='colwisemult')
            ExpZsf = Lambda(lambda l: K.exp(l), name='expzsf')
            # Pass-through layers used only for debug printing.
            PrintMuLayer = Lambda(lambda l: K.print_tensor(l, 'mu'))
            PrintMuOutLayer = Lambda(lambda l: K.print_tensor(l, 'mu_out'))
            PrintSfLayer = Lambda(lambda l: K.print_tensor(l, 'sf'))
            PrintZsfLayer = Lambda(lambda l: K.print_tensor(l, 'zsf'))
            #kernel_regularizer?
            mu = Dense(original_dim,
                       activation=MeanAct,
                       kernel_initializer=params['init'],
                       name='mu')(x)
            dispersion = Dense(original_dim,
                               activation=DispAct,
                               kernel_initializer=params['init'],
                               name='dispersion')(x)
            pi = Dense(original_dim,
                       activation='sigmoid',
                       kernel_initializer=params['init'],
                       name='pi')(x)
            if debug:
                mu = PrintMuLayer(mu)
            # Scale the mean by exp(z_sf) (dynamic) or by the first
            # size-factor input (static).
            if sf.usingSizeFactors():
                if sf.usingDynamicSizeFactors():
                    sf_l = sf.getInputs()
                    if debug:
                        z_sf = PrintZsfLayer(z_sf)
                    z_sf = ExpZsf(z_sf)
                    mu_out = ColwiseMultLayer([mu, z_sf])
                else:
                    sf_l = sf.getInputs()
                    sf_lmult = [None] * len(sf_l)
                    if debug:
                        sf_lmult[0] = PrintSfLayer(sf_l[0])
                    else:
                        sf_lmult[0] = sf_l[0]
                    mu_out = ColwiseMultLayer([mu, sf_lmult[0]])
            else:
                mu_out = mu
            if debug:
                mu_out = PrintMuOutLayer(mu_out)
            outputs = [mu_out, dispersion, pi]
        decoder = Model([latent_inputs] + sf_l + [count_input],
                        outputs,
                        name='decoder')
        if sf.usingDynamicSizeFactors():
            return decoder, sf_l[0], sf_l[1]
        else:
            return decoder, None, None

    def build_discriminator(params):
        """Build the (uncompiled) discriminator as a Sequential model on the latent space."""
        model = Sequential()
        for i, siz in enumerate(params['discriminator_hidden_structure']):
            model.add(
                Dense(siz, input_dim=params['latent_dim'] if i == 0 else None))
            model.add(LeakyReLU(alpha=0.2))
            # No dropout after the last hidden layer.
            if i != len(params['discriminator_hidden_structure']) - 1:
                model.add(Dropout(0.2))
        # The output has no activation when 'discriminator_wasserstein' is set.
        model.add(
            Dense(1,
                  activation="sigmoid"
                  if not params['discriminator_wasserstein'] else None))
        return model

    def settrainable(model, onoff):
        """Set ``trainable`` on ``model`` and on each of its layers."""
        for layer in model.layers:
            layer.trainable = onoff
        model.trainable = onoff

    if debug:
        print('count_dim:', count_dim)
    # `tf_debug` is a name defined outside this function.
    if debug and tf_debug:
        from tensorflow.python import debug as tf_dbg
        sess = K.get_session()
        sess = tf_dbg.LocalCLIDebugWrapperSession(sess)
        sess.add_tensor_filter("has_inf_or_nan", tf_dbg.has_inf_or_nan)
        K.set_session(sess)

    sf_inputs = sf.getInputs()
    raw_count_input = Input(shape=(count_dim, ), name='raw_count_input')
    encoder, count_input, z_mean, z_log_var, z_mean_sf, z_log_var_sf, z_sf = build_encoder(
        count_dim, sf, params)
    decoder, sf_means, sf_log_vars = build_decoder(count_dim, sf, z_sf,
                                                   count_input, params, debug)
    # Only the first encoder output (z) is fed to the decoder.
    encoded_repr = encoder(count_input)[0]
    outputs = decoder([encoded_repr] + sf_inputs + [count_input])
    vae = Model([count_input, raw_count_input] + sf_inputs,
                outputs,
                name='vae')

    if params['model'] == 'gaussian':
        vae_loss, reconstruction_loss, kl_sf = gaussian_loss(
            raw_count_input, outputs, z_mean, z_log_var, sf, z_mean_sf,
            z_log_var_sf, sf_means, sf_log_vars, mod_args, count_dim, params,
            debug)
    elif params['model'] == 'zinb':
        vae_loss, reconstruction_loss, kl_sf = zinb_loss(
            raw_count_input,
            #mu, dispersion, pi,
            outputs[0],
            outputs[1],
            outputs[2],
            z_mean,
            z_log_var,
            sf,
            z_mean_sf,
            z_log_var_sf,
            sf_means,
            sf_log_vars,
            mod_args,
            count_dim,
            params,
            debug)

    # Architectures other than 'vae' train on the reconstruction loss only.
    l = vae_loss if params['architecture'] == 'vae' else reconstruction_loss
    if sf.usingDynamicSizeFactors() and kl_sf is not None:
        l += kl_sf
    # The loss is attached through add_loss, hence loss=None in compile.
    vae.add_loss(l)
    vae.compile(optimizer=params['optimizer'], loss=None)

    discriminator = None
    generator = None
    if params['architecture'] == 'aae':
        discriminator = build_discriminator(params)

        def wasserstein_loss(apply_lambda=False, debug=False):
            """Return a sigmoid cross-entropy-on-logits loss, optionally scaled by ``params['wasserstein_lambda']``."""
            def loss(y_true, y_pred):
                if debug:
                    print('WASSERSTEIN LOSS:', apply_lambda)
                loss_tensor = tf.nn.sigmoid_cross_entropy_with_logits(
                    labels=y_true, logits=y_pred)
                if apply_lambda:
                    loss_tensor *= params['wasserstein_lambda']
                return loss_tensor

            return loss

        discriminator.compile(loss='binary_crossentropy' \
            if not params['discriminator_wasserstein'] else wasserstein_loss(apply_lambda=False, debug=debug),
            optimizer=params['optimizer_discriminator'],
            metrics=['accuracy'])
        if debug:
            print('Trainable Discriminator model')
            discriminator.summary()

        # Freeze the discriminator before it is embedded in the generator.
        settrainable(discriminator, False)

        #this doesn't work?
        if sf.usingDynamicSizeFactors():

            def list_layers(model):
                """Print the name of every layer of ``model``."""
                print('list layers')
                for layer in model.layers:
                    print(layer.name)

            list_layers(encoder)
            z_sf.trainable = False
            z_mean_sf.trainable = False
            z_log_var_sf.trainable = False
            # Freeze every encoder layer whose name starts with "encoder_sf".
            for layer in encoder.layers:
                rex = "encoder_sf"
                if layer.name[0:len(rex)] == rex:
                    print('untrainable:', layer.name)
                    layer.trainable = False

        generator = Model(count_input, discriminator(encoder(count_input)[0]))
        if sf.usingDynamicSizeFactors():
            # Zero-valued loss term that references z_sf.
            generator.add_loss(K.mean(0 * z_sf))
        generator.compile(loss='binary_crossentropy' \
            if not params['discriminator_wasserstein'] else wasserstein_loss(apply_lambda=True, debug=debug),
            optimizer=params['optimizer_generator'])
        if debug:
            print('Non-Trainable Discriminator model')
            discriminator.summary()

    return vae, encoder, decoder, discriminator, generator
def __init__(self, ftrain_TFR, fvalid_TFR, ftest_TFR,
             xlen, ylen, olog,
             x_mean, x_std, y_mean, y_std,
             x_min, x_max, y_min, y_max, scalelims,
             ncores, buffer_size, batch_size, nbatches,
             layers, lay_params, activations, act_params, nodes,
             lengthscale=1e-3, max_lr=1e-1,
             clr_mode='triangular', clr_steps=2000,
             weight_file='weights.h5', stop_file='./STOP',
             train_flag=True,
             epsilon=1e-6,
             debug=False, shuffle=False, resume=False):
    """
    Load the train/validation/test datasets, store the training
    configuration on the instance, then build and compile the Keras model.

    ftrain_TFR : list, strings. TFRecords for the training   data.
    fvalid_TFR : list, strings. TFRecords for the validation data.
    ftest_TFR  : list, strings. TFRecords for the test       data.
    xlen       : int.   Dimensionality of the inputs.
    ylen       : int.   Dimensionality of the outputs.
    olog       : bool.  Determines if the target values are log10-scaled.
    x_mean     : array. Mean  values of the input  data.
    x_std      : array. Stdev values of the input  data.
    y_mean     : array. Mean  values of the output data.
    y_std      : array. Stdev values of the output data.
    x_min      : array. Minima of the input  data.
    x_max      : array. Maxima of the input  data.
    y_min      : array. Minima of the output data.
    y_max      : array. Maxima of the output data.
    scalelims  : list, floats. [min, max] of the scaled data range.
    ncores     : int. Number of cores to use for parallel data loading.
    buffer_size: int. Number of cases to pre-load in memory.
    batch_size : int. Size of batches for training/validation/testing.
    nbatches   : list, ints. Number of batches in the
                             [training, validation, test] (in that order)
                             sets.
    layers     : list, str.  Types of hidden layers.  Recognized here:
                             'conv1d', 'dense', 'maxpool1d', 'avgpool1d',
                             'dropout', 'flatten'.  Other values are
                             silently ignored.
    lay_params : list, ints. Parameters for the layer type
                             E.g., kernel size.  Indexed per layer.
    activations: list, str.  Activation functions for each hidden layer.
                             A non-string entry is called as a layer on
                             the output of the conv1d/dense layer.
                             Indexed per layer *with nodes*.
    act_params : list, floats. Parameters for the activation functions.
                             Not referenced inside this constructor.
    nodes      : list, ints. For the layers with nodes,
                             number of nodes per layer.
    lengthscale: float. Minimum learning rate.  Used as the initial
                        learning rate of the optimizer.
    max_lr     : float. Maximum learning rate.
    clr_mode   : string. Cyclical learning rate function.
    clr_steps  : int. Number of steps per cycle of learning rate.
    weight_file: string. Path/to/file to save the NN weights.
    stop_file  : string. Path/to/file to check for manual stopping of
                         training.
    train_flag : bool.   Determines whether to train a model or not.
                         Dropout layers are only added when True.
    epsilon    : float.  Added to log() arguments to prevent log(0)
    debug      : bool.   If True, turns on Tensorflow's debugger.
    shuffle    : bool.   Determines whether to shuffle the data.
    resume     : bool.   Determines whether to resume training a model.

    Raises
    ------
    Exception: if a 'maxpool1d'/'avgpool1d' layer is the first layer or
               directly follows a 'dense' or 'flatten' layer.
    """
    # Make sure everything is on the same graph.
    # Session handling only makes sense for the TensorFlow backend; the
    # debugger wrapper is installed only when explicitly requested.
    if K.backend() == 'tensorflow':
        if debug:
            sess = K.get_session()
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)
            sess.add_tensor_filter("has_inf_or_nan",
                                   tf_debug.has_inf_or_nan)
            K.set_session(sess)
        else:
            K.clear_session()

    # Load data -- all three sets share every argument except the files
    def _load(tfr_files):
        return U.load_TFdataset(tfr_files, ncores, batch_size,
                                buffer_size, xlen, ylen,
                                x_mean, x_std, y_mean, y_std,
                                x_min, x_max, y_min, y_max,
                                scalelims, shuffle)

    self.X,    self.Y    = _load(ftrain_TFR)
    self.Xval, self.Yval = _load(fvalid_TFR)
    self.Xte,  self.Yte  = _load(ftest_TFR)

    # Other variables
    self.inD  = xlen
    self.outD = ylen
    self.olog = olog

    self.y_mean    = y_mean
    self.y_std     = y_std
    self.y_min     = y_min
    self.y_max     = y_max
    self.scalelims = scalelims

    self.batch_size    = batch_size
    self.train_batches = nbatches[0]
    self.valid_batches = nbatches[1]
    self.test_batches  = nbatches[2]

    self.weight_file = weight_file
    self.stop_file   = stop_file

    self.lengthscale = lengthscale
    self.max_lr      = max_lr
    self.clr_mode    = clr_mode
    self.clr_steps   = clr_steps

    self.epsilon = epsilon  # To ensure log(0) never happens

    self.train_flag = train_flag
    self.resume     = resume
    self.shuffle    = shuffle

    ### Build model
    # Input layer
    # NOTE(review): feeding the dataset tensor directly into the model is
    # tied to `shuffle` here -- confirm this coupling is intended.
    if shuffle:
        inp = Input(shape=(xlen, ), tensor=self.X)
    else:
        inp = Input(shape=(xlen, ))
    x = inp

    # Hidden layers
    n = 0  # Counter for layers with nodes
    for i, layer in enumerate(layers):
        # None for the first layer, so it never wraps around to layers[-1]
        prev = layers[i - 1] if i > 0 else None

        if layer == 'conv1d':
            if prev != 'conv1d':
                # Add channel for convolution
                tshape = tuple(val for val in K.int_shape(x)
                               if val is not None)
                x = Reshape(tshape + (1, ))(x)
            if isinstance(activations[n], str):
                # Simple activation: pass as layer parameter
                x = Convolution1D(filters=nodes[n],
                                  kernel_size=lay_params[i],
                                  activation=activations[n],
                                  padding='same')(x)
            else:
                # Advanced activation: use as its own layer
                x = Convolution1D(filters=nodes[n],
                                  kernel_size=lay_params[i],
                                  padding='same')(x)
                x = activations[n](x)
            n += 1

        elif layer == 'dense':
            if prev == 'conv1d':
                print('WARNING: Dense layer follows Conv1d layer. '
                      'Flattening.')
                x = Flatten()(x)
            if isinstance(activations[n], str):
                x = Dense(nodes[n], activation=activations[n])(x)
            else:
                x = Dense(nodes[n])(x)
                x = activations[n](x)
            n += 1

        elif layer == 'maxpool1d':
            if prev in (None, 'dense', 'flatten'):
                raise Exception('MaxPool layers must follow Conv1d or '
                                'Pool layer.')
            x = MaxPooling1D(pool_size=lay_params[i])(x)

        elif layer == 'avgpool1d':
            if prev in (None, 'dense', 'flatten'):
                raise Exception('AvgPool layers must follow Conv1d or '
                                'Pool layer.')
            x = AveragePooling1D(pool_size=lay_params[i])(x)

        elif layer == 'dropout':
            # Dropout is skipped entirely when the model is not trained
            if self.train_flag:
                x = Dropout(lay_params[i])(x)

        elif layer == 'flatten':
            x = Flatten()(x)

    # Output layer
    out = Dense(ylen)(x)

    self.model = Model(inp, out)

    # Compile model
    # When the input is a dataset tensor, the targets must be one as well
    compile_kwargs = {}
    if shuffle:
        compile_kwargs['target_tensors'] = [self.Y]
    self.model.compile(optimizer=adam(lr=self.lengthscale, amsgrad=True),
                       loss=keras.losses.mean_squared_error,
                       **compile_kwargs)
    # summary() prints by itself and returns None; do not print its result
    self.model.summary()
def main(_):
    """Train a TextCNN classifier with validation-based early stopping.

    Reads the 'hparams' and 'path' sections of ``config.yaml``, builds the
    training and validation datasets, loads the vocabulary and word
    embeddings, then trains for up to ``hparams.num_epoch`` epochs.  The
    model is checkpointed whenever the validation loss improves, and
    training stops once it has failed to improve for
    ``hparams.earlystop_patience`` consecutive epochs.

    Args:
        _: Unused positional argument (presumably the argv list passed by
            ``tf.app.run`` -- confirm against the caller).
    """
    cfg_file = 'config.yaml'

    # hparams
    hparams = load_config(cfg_file, section='hparams')

    # logger
    logger = create_logger('textcnn')

    # prepare datasets
    files_path = load_config(cfg_file, section='path')
    trainset = Dataset(files_path['train_data_path'], logger)
    num_classes = trainset.num_classes
    # the validation set reuses the class-to-label mapping of the train set
    validset = Dataset(files_path['valid_data_path'], logger,
                       dict_class_to_label=trainset.dict_class_to_label)
    logger.info('dict_class_to_label: %s', trainset.dict_class_to_label)
    logger.info('trainset label_stat: %s', trainset.label_stat)
    logger.info('validset label_stat: %s', validset.label_stat)

    # load vocab, embed
    vocab = load_vocab(files_path['vocab_path'])
    word_embed = load_embed(files_path['word_embed_path'])
    hparams.add_hparam('vocab_size', word_embed.shape[0])
    hparams.add_hparam('embed_size', word_embed.shape[1])

    # load model
    logger.info('loading model...')
    graph = tf.Graph()
    with graph.as_default():
        model = TextCNN(hparams=hparams, num_classes=num_classes,
                        logger=logger)

        # train model
        with tf.Session(graph=graph) as sess:
            # debug
            if FLAGS.debug:
                sess = tf_debug.LocalCLIDebugWrapperSession(
                    sess, dump_root='tfdbg')

            # init model
            sess.run(tf.global_variables_initializer())
            logger.info('params initialized')

            # create a saver
            saver = tf.train.Saver()
            save_path = files_path['save_model_path']
            # dirname is '' for a bare file name, and makedirs('') raises
            save_dir = os.path.dirname(save_path)
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)

            # performance of the model before training
            loss_valid, acc = evaluate(sess, model, validset, vocab,
                                       word_embed)
            logger.info('loss_valid: %.4f\tacc: %.4f', loss_valid, acc)
            best_result = {'loss_valid': loss_valid, 'acc': acc}
            patience = 0

            # train model
            for id_epoch in range(hparams.num_epoch):
                # train epoch
                train_epoch(sess, model, trainset, vocab, word_embed,
                            id_epoch, hparams)
                # evaluate
                loss_valid, acc = evaluate(sess, model, validset, vocab,
                                           word_embed)
                logger.info('Epoch: %d\tloss_valid: %.4f\tacc: %.4f',
                            id_epoch + 1, loss_valid, acc)

                if loss_valid < best_result['loss_valid']:
                    # save model
                    saver.save(sess=sess, save_path=save_path)
                    logger.info('model saved in %s', save_path)
                    best_result = {'loss_valid': loss_valid, 'acc': acc}
                    patience = 0
                else:
                    # early stopping
                    patience += 1
                    if patience >= hparams.earlystop_patience:
                        logger.info('earlystop.')
                        break

            # report the best result whether or not early stopping fired
            logger.info('Best result: loss_valid: %.4f\tacc: %.4f',
                        best_result['loss_valid'], best_result['acc'])