def runner(env, policy_func, load_model_path, timesteps_per_batch, number_trajs,
           stochastic_policy, save=False, reuse=False):
    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    U.initialize()
    policy = build_policy(env, 'mlp', value_network='copy')
    ob = observation_placeholder(ob_space)
    with tf.variable_scope('pi'):
        pi = policy(observ_placeholder=ob)
    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state(load_model_path)
    saver.restore(U.get_session(), ckpt.model_checkpoint_path)

    obs_list = []
    acs_list = []
    len_list = []
    ret_list = []
    from tqdm import tqdm
    for _ in tqdm(range(number_trajs)):
        traj = traj_1_generator(pi, env, timesteps_per_batch, stochastic=stochastic_policy)
        obs, acs, ep_len, ep_ret = traj['ob'], traj['ac'], traj['ep_len'], traj['ep_ret']
        obs_list.append(obs)
        acs_list.append(acs)
        len_list.append(ep_len)
        ret_list.append(ep_ret)
    if stochastic_policy:
        print('stochastic policy:')
    else:
        print('deterministic policy:')
    if save:
        filename = load_model_path.split('/')[-1] + '.' + env.spec.id
        np.savez(filename, obs=np.array(obs_list), acs=np.array(acs_list),
                 lens=np.array(len_list), rets=np.array(ret_list))
    avg_len = sum(len_list) / len(len_list)
    avg_ret = sum(ret_list) / len(ret_list)
    # print("Average length:", avg_len)
    # print("Average return:", avg_ret)
    return avg_len, avg_ret
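
# runner() can dump the collected trajectories via np.savez when save=True. The helper below is
# a minimal, self-contained sketch of reading such an archive back (e.g. as expert data); the
# file name in the usage example is hypothetical, since runner() derives it from load_model_path
# and env.spec.id.
import numpy as np

def load_trajectories(npz_path):
    """Return (obs, acs, lens, rets) arrays stored by runner(save=True)."""
    # allow_pickle=True because variable-length episodes are saved as object arrays
    data = np.load(npz_path, allow_pickle=True)
    return data['obs'], data['acs'], data['lens'], data['rets']

# Example usage (hypothetical path):
# obs, acs, lens, rets = load_trajectories('trpo_model.Hopper-v2.npz')
# print('episodes:', len(lens), 'mean return:', rets.mean())
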
def add_all_summary(self, writer, values, iter):
    # Note that the order of the incoming ```values``` should be the same as that of the
    # ```scalar_keys``` given in ```__init__```
    if np.sum(np.isnan(values) + 0) != 0:
        return
    sess = U.get_session()
    keys = self.scalar_summaries_ph + self.histogram_summaries_ph
    feed_dict = {}
    for k, v in zip(keys, values):
        feed_dict.update({k: v})
    summaries_str = sess.run(self.summaries, feed_dict)
    writer.add_summary(summaries_str, iter)
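
# The attributes used above (scalar_summaries_ph, histogram_summaries_ph, self.summaries) are
# created in this class's __init__, which is not shown here. Below is a minimal sketch of one
# way such a setup could look in TF1, illustrating why `values` must arrive in the same order as
# the keys: each value is fed into the placeholder built for the matching key. All names in this
# sketch are assumptions, not the repo's actual implementation.
import tensorflow as tf

class SummarySketch(object):
    def __init__(self, scalar_keys=(), histogram_keys=()):
        self.scalar_summaries_ph = []
        self.histogram_summaries_ph = []
        summaries = []
        for k in scalar_keys:
            ph = tf.placeholder(tf.float32, name=k + '_ph')
            self.scalar_summaries_ph.append(ph)
            summaries.append(tf.summary.scalar(k, ph))
        for k in histogram_keys:
            ph = tf.placeholder(tf.float32, name=k + '_hist_ph')
            self.histogram_summaries_ph.append(ph)
            summaries.append(tf.summary.histogram(k, ph))
        # merged op that add_all_summary() runs with the per-key feed_dict
        self.summaries = tf.summary.merge(summaries)
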
def run_eval(config):

    def create_loss():
        test_dataset_args = create_dataset(config, split="test", shuffle=True, repeat=True)
        test_inputs, test_targets, test_lengths, _, _, _ = test_dataset_args
        cell = create_cell(config, test_dataset_args)
        test_ll_per_seq, kl, log_weight, log_ess, trajectories = \
            basic_bounds.iwae(cell, (test_inputs, test_targets), test_lengths,
                              num_samples=config.num_samples)
        test_ll_per_t = tf.reduce_mean(test_ll_per_seq / tf.to_float(test_lengths))
        return test_ll_per_t, trajectories, cell, log_weight, test_lengths

    def create_graph():
        global_step = tf.train.get_or_create_global_step()
        test_bound, trajectories, cell, log_weight, test_lengths = create_loss()
        return cell, test_bound, global_step, trajectories, test_lengths

    cell, test_bound, global_step, trajectories, test_lengths = create_graph()
    sess = U.get_session()
    U.initialize()
    cur_step = 0

    # saver
    saver = tf.train.Saver(max_to_keep=1)
    if not os.path.exists(config.logdir):
        assert False, "logdir %s does not exist" % config.logdir
    ckpt = tf.train.get_checkpoint_state(config.logdir + '/valid_best')
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        cur_step = int(ckpt.model_checkpoint_path.split('-')[-1])
        print('Model and log loaded! (checkpoint_path=%s, cur_step=%d)'
              % (ckpt.model_checkpoint_path, cur_step))

    test_bound_value = sess.run([test_bound])
    print("##################################")
    print("VALID_BEST_STEP: %s" % cur_step)
    print("PARTICLE_NUM: %s" % config.num_samples)
    print("TEST_BOUND: %s" % test_bound_value[0])
    print("##################################")
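
# basic_bounds.iwae comes from this repo and its body is not shown here. As a reference point,
# here is a minimal numpy sketch of how an IWAE-style bound is usually assembled from K
# per-particle log-weights log w_k = log p(x, z_k) - log q(z_k | x): the per-sequence bound is
# log(1/K * sum_k w_k), computed stably with a log-sum-exp. Shapes and names are illustrative
# only and not tied to the repo's implementation.
import numpy as np

def iwae_bound_from_log_weights(log_weights):
    """log_weights: array [num_samples, batch]; returns the per-sequence bound [batch]."""
    max_lw = np.max(log_weights, axis=0, keepdims=True)  # subtract the max for numerical stability
    return max_lw.squeeze(0) + np.log(np.mean(np.exp(log_weights - max_lw), axis=0))

# Sanity check: with all log-weights equal to 0 the bound is 0 for every sequence:
# iwae_bound_from_log_weights(np.zeros((4, 2)))  ->  array([0., 0.])
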
def main(_):
    # create visualizer
    # visualizer = TensorboardVisualizer()
    monitor = Monitor(FLAGS)
    # log_dir = monitor.log_dir
    # visualizer.initialize(log_dir, None)
    saved_mean_reward = None
    # openAI logger
    L.configure(monitor.log_dir, format_strs=['stdout', 'csv'])

    # initialize env
    atari_env = AtariEnv(monitor)
    # screen_shot_subgoal(atari_env)

    # follow the DeepMind-style env wrappers: stack 4 frames and scale to float
    env = wrapper.wrap_deepmind(atari_env, frame_stack=True, scale=True)

    # get default tf session
    sess = U.get_session()

    # create q networks for controller
    controller_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    controller_network = Q_network(env.observation_space, env.action_space.n,
                                   controller_optimizer, scope='controller')
    controller = Controller(controller_network, env.action_space.n)

    # create q networks for meta-controller
    num_goals = env.unwrapped.goals_space.n
    metacontroller_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    metacontroller_network = Q_network(env.observation_space, num_goals,
                                       metacontroller_optimizer, scope='metacontroller')
    metacontroller = MetaController(metacontroller_network, num_goals)

    # Create the schedule for exploration starting from 1.
    exploration2 = LinearSchedule(schedule_timesteps=int(EXPLORATION_FRACTION * monitor.num_timesteps),
                                  initial_p=1.0,
                                  final_p=EXPLORATION_FINAL_EPS)

    # initialize experience replay
    controller_replay_buffer = ReplayBuffer(D1_MEMORY_SIZE)
    metacontroller_replay_buffer = ReplayBuffer(D2_MEMORY_SIZE)

    # initialize critic
    critic = Critic(env.unwrapped)

    total_extrinsic_reward = []
    # for success rate
    total_goal_reached = np.zeros(num_goals, dtype=np.int32)
    total_goal_sampled = np.zeros(num_goals, dtype=np.int32)
    total_goal_epsilon = np.ones(num_goals, dtype=np.float32)
    ep = 0
    total_step = 0
    init_ob = env.reset()

    U.initialize()
    # initialize target network in both controller and meta-controller
    sess.run(metacontroller.network.update_target_op)
    sess.run(controller.network.update_target_op)

    # load ckpt if present
    model_path = tf.train.latest_checkpoint(monitor.ckpt_dir)
    model_saved = False
    model_file = os.path.join(monitor.ckpt_dir, 'model')
    if model_path is not None:
        U.load_variables(model_file)
        L.log('loaded model from %s' % model_file)
        model_saved = True

    while ep < MAX_EPISODE:
        # init environment game play variables
        init_ob = env.reset()
        observation = np.reshape(init_ob['observation'], (1, ) + init_ob['observation'].shape)
        desired_goal = metacontroller.sample_act(sess, observation, update_eps=1.0)[0]
        env.unwrapped.desired_goal = desired_goal
        total_goal_sampled[desired_goal] += 1

        # given the predicted goal, encode its bounding mask into the observation array
        ob_with_g = env.unwrapped._add_goal_mask(init_ob['observation'], desired_goal)

        # NOTE: the code below verifies the mask was added correctly
        # for i in range(ob_with_g.shape[-1]):
        #     ob = ob_with_g[:, :, i]
        #     image = Image.fromarray(ob)
        #     image = image.convert('RGB')
        #     image.save('test_%i.png' % i)

        done = False
        reached_goal = False

        while not done:
            extrinsic_rewards = 0
            s0 = init_ob['observation']
            while not (done or reached_goal):
                update_eps1_with_respect_to_g = get_epsilon(total_goal_epsilon, total_goal_reached,
                                                            total_goal_sampled, desired_goal,
                                                            total_step, EXPLORATION_WARM_UP)
                ob_with_g_reshaped = np.reshape(ob_with_g, (1, ) + ob_with_g.shape)
                primitive_action_t = controller.sample_act(sess, ob_with_g_reshaped,
                                                           update_eps=update_eps1_with_respect_to_g)[0]
                # obtain extrinsic reward from environment
                ob_tp1, extrinsic_reward_t, done_t, info = env.step(primitive_action_t)
                reached_goal = env.unwrapped.reached_goal(desired_goal)
                ob_with_g_tp1 = env.unwrapped._add_goal_mask(ob_tp1['observation'], desired_goal)
                intrinsic_reward_t = critic.criticize(desired_goal, reached_goal, primitive_action_t, done_t)
                controller_replay_buffer.add(ob_with_g, primitive_action_t, intrinsic_reward_t,
                                             ob_with_g_tp1, done_t)

                # sample from replay_buffer1 to train controller
                obs_with_g_t, primitive_actions_t, intrinsic_rewards_t, obs_with_g_tp1, dones_t = \
                    controller_replay_buffer.sample(TRAIN_BATCH_SIZE)
                weights, batch_idxes = np.ones_like(intrinsic_rewards_t), None
                # get q estimate for tp1 as 'supervised' target
                ob_with_g_tp1_reshaped = np.reshape(ob_with_g_tp1, (1, ) + ob_with_g.shape)
                q_tp1 = controller.get_q(sess, ob_with_g_tp1_reshaped)[0]
                td_error = controller.train(sess, obs_with_g_t, primitive_actions_t, intrinsic_rewards_t,
                                            obs_with_g_tp1, dones_t, weights, q_tp1)

                # joint training: only sample from replay_buffer2 to train the meta-controller
                if total_step >= WARMUP_STEPS:
                    L.log('joint training has started ----- step %d' % total_step)
                    # sample from replay_buffer2 to train meta-controller
                    init_obs, goals_t, extrinsic_rewards_t, obs_terminate_in_g, dones_t = \
                        metacontroller_replay_buffer.sample(TRAIN_BATCH_SIZE)
                    weights, batch_idxes = np.ones_like(extrinsic_rewards_t), None
                    # get q estimate for tp1 as 'supervised' target
                    obs_terminate_in_g_reshaped = np.reshape(obs_terminate_in_g,
                                                             (1, ) + obs_terminate_in_g.shape)
                    q_tp1 = metacontroller.get_q(sess, obs_terminate_in_g_reshaped)[0]
                    td_error = metacontroller.train(sess, init_obs, goals_t, extrinsic_rewards_t,
                                                    obs_terminate_in_g, dones_t, weights, q_tp1)

                if total_step % UPDATE_TARGET_NETWORK_FREQ == 0:
                    # L.log('UPDATE BOTH CONTROLLER Q NETWORKS ----- step %d', step)
                    sess.run(controller.network.update_target_op)
                    # this is fine; the meta DQN isn't really trained until after WARMUP_STEPS
                    sess.run(metacontroller.network.update_target_op)

                extrinsic_rewards += extrinsic_reward_t
                ob_with_g = ob_with_g_tp1
                done = done_t
                total_step += 1

            # we are done / reached_goal:
            # store the transition of init_ob, goal, accumulated extrinsic reward, current ob in D2
            # print("ep %d : step %d, goal extrinsic total %d" % (ep, step, extrinsic_rewards))
            # store the clean observation without the goal encoded
            metacontroller_replay_buffer.add(init_ob['observation'], desired_goal,
                                             extrinsic_rewards, ob_tp1['observation'], done)

            # if we are here then we have finished the desired goal
            if not done:
                # print("ep %d : goal %d reached, not yet done, extrinsic %d" % (ep, desired_goal, extrinsic_rewards))
                exploration_ep = 1.0
                total_goal_reached[env.unwrapped.achieved_goal] += 1
                if total_step >= WARMUP_STEPS:
                    t = total_step - WARMUP_STEPS
                    exploration_ep = exploration2.value(t)
                ob_with_g_reshaped = np.reshape(ob_with_g, (1, ) + ob_with_g.shape)
                # resample until we get a goal different from the one just achieved
                while env.unwrapped.achieved_goal == desired_goal:
                    desired_goal = metacontroller.sample_act(sess, ob_with_g_reshaped,
                                                             update_eps=exploration_ep)[0]
                env.unwrapped.desired_goal = desired_goal
                total_goal_sampled[desired_goal] += 1
                L.log('ep %d : achieved goal was %d ----- new goal --- %d'
                      % (ep, env.unwrapped.achieved_goal, desired_goal))
                # start again
                reached_goal = False

        # finished an episode
        total_extrinsic_reward.append(extrinsic_rewards)
        ep += 1
        mean_100ep_reward = round(np.mean(total_extrinsic_reward[-101:-1]), 1)
        if ep % monitor.print_freq == 0:
            L.record_tabular("steps", total_step)
            L.record_tabular("episodes", ep)
            L.record_tabular("mean 100 episode reward", mean_100ep_reward)
            L.dump_tabular()

        if total_step % monitor.ckpt_freq == 0:
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                L.log("Saving model due to mean reward increase: {} -> {}".format(
                    saved_mean_reward, mean_100ep_reward))
                U.save_variables(model_file)
                model_saved = True
                saved_mean_reward = mean_100ep_reward

    # if a model was saved, restore the best-performing weights
    if model_saved:
        L.log('restored model with mean reward: %d' % saved_mean_reward)
        U.load_variables(model_file)
def learn(*,
          network,
          env,
          eval_policy,
          total_timesteps,
          timesteps_per_batch=1024,  # what to train on
          max_kl=0.001,
          cg_iters=10,
          gamma=0.99,
          lam=1.0,  # advantage estimation
          seed=None,
          ent_coef=0.0,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          vf_iters=3,
          max_episodes=0, max_iters=0,  # time constraint
          callback=None,
          load_path=None,
          checkpoint_path_in=None,
          checkpoint_dir_out=None,
          checkpoint_freq=100,  # in iterations!
          from_iter=0,
          eval_episodes=20,
          **network_kwargs):
    '''
    learn a policy function with the TRPO algorithm

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for adam optimizer used to optimize value function loss

    vf_iters                number of value function optimization iterations per policy optimization step

    total_timesteps         max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy
                            and arguments to a particular type of network

    Returns:
    -------

    learnt model

    '''
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()

    cpus_per_worker = 1
    U.get_session(config=tf.ConfigProto(
        allow_soft_placement=True,
        inter_op_parallelism_threads=cpus_per_worker,
        intra_op_parallelism_threads=cpus_per_worker))

    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    set_global_seeds(seed)

    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    # ob_space = Box(low=-np.inf, high=np.inf, shape=(env.observation_space.n,))
    ob_space = env.observation_space
    ac_space = env.action_space

    ob = observation_placeholder(ob_space)
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)

    # Loading checkpoint
    if checkpoint_path_in is not None and os.path.isfile(checkpoint_path_in):
        pi.load(checkpoint_path_in)
        logger.log('Loaded policy weights from %s' % checkpoint_path_in)

    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([tf.reduce_sum(g * tangent)
                    for (g, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(oldv, newv)
                 for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi"))])

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()

    # s = env.reset()
    # start = time.time()
    # for i in range(10000):
    #     pi.step(s, stochastic=True)
    # duration = time.time() - start
    # print(duration)
    # return

    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True, gamma=gamma)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    iters_eval = 0
    all_logs = []
    best_rew = -np.inf
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    online_scores = []
    offline_scores = []

    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    while True:
        if callback: callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        if iters_so_far % checkpoint_freq == 0 and checkpoint_dir_out is not None:
            if not os.path.exists(checkpoint_dir_out):
                os.makedirs(checkpoint_dir_out)
            pi.save(os.path.join(checkpoint_dir_out, 'checkpoint_%d' % iters_so_far))
            logger.log('Saved policy weights as %s'
                       % os.path.join(checkpoint_dir_out, 'checkpoint_%d.npy' % iters_so_far))

            def pi_wrapper(ob):
                ac, vpred, _, _ = pi.step(ob, stochastic=True)
                return ac

            rew, _, logs, disc_rets, num_stops, avg_damages = eval_policy(
                pi=pi_wrapper, n_episodes=eval_episodes, verbose=True)
            offline_scores.append([np.mean(disc_rets), np.mean(num_stops), np.mean(avg_damages)])
            np.save(os.path.join(checkpoint_dir_out, 'offline_scores.npy'), offline_scores)
            for log in logs:
                log['iter'] = iters_eval
            all_logs = all_logs + logs
            iters_eval += 1

        with timed("sampling"):
            seg = seg_gen.__next__()

        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
                                                         include_final_partial_batch=False,
                                                         batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        ep_rew_mean = np.mean(rewbuffer)
        online_scores.append(ep_rew_mean)
        np.save(os.path.join(checkpoint_dir_out, 'online_scores.npy'), online_scores)

        # Saving best
        if iters_so_far % checkpoint_freq == 0 and ep_rew_mean > best_rew and checkpoint_dir_out is not None:
            pi.save(os.path.join(checkpoint_dir_out, 'best'))
            best_rew = ep_rew_mean
            logger.log('Saved policy weights as %s' % os.path.join(checkpoint_dir_out, 'best.npy'))

        if rank == 0:
            logger.dump_tabular()

    return pi
def run(config):

    def create_loss():
        train_dataset_args = create_dataset(config, split="train", shuffle=True, repeat=True)
        test_dataset_args = create_dataset(config, split="test", shuffle=True, repeat=True)
        valid_dataset_args = create_dataset(config, split="valid", shuffle=True, repeat=True)
        inputs, targets, lengths, params, _, _ = train_dataset_args
        test_inputs, test_targets, test_lengths, _, _, _ = test_dataset_args
        valid_inputs, valid_targets, valid_lengths, _, _, _ = valid_dataset_args

        cell = create_cell(config, train_dataset_args)

        if config.bound == "iwae":
            ll_per_seq, kl, log_weight, log_ess, trajectories = \
                basic_bounds.iwae(cell, (inputs, targets), lengths, num_samples=config.num_samples)
        else:
            raise ValueError("Undefined bound %s" % config.bound)

        if config.test_bound == "iwae":
            valid_ll_per_seq, _, _, _, _ = \
                basic_bounds.iwae(cell, (valid_inputs, valid_targets), valid_lengths,
                                  num_samples=config.test_num_samples)
        else:
            raise ValueError("Undefined bound %s" % config.test_bound)

        ll_per_t = tf.reduce_mean(ll_per_seq / tf.to_float(lengths))
        valid_ll_per_t = tf.reduce_mean(valid_ll_per_seq / tf.to_float(valid_lengths))
        return cell, ll_per_t, valid_ll_per_t, trajectories, lengths

    def create_graph():
        global_step = tf.train.get_or_create_global_step()
        cell, bound, valid_bound, trajectories, lengths = create_loss()
        loss = -bound
        opt = tf.train.AdamOptimizer(config.learning_rate)

        if config.model_train:
            grad_theta = opt.compute_gradients(
                loss, var_list=tf.trainable_variables("%s/theta" % config.cell))
            train_op_theta = opt.apply_gradients(grad_theta, global_step=global_step)
        else:
            train_op_theta = tf.constant(1)

        if config.algorithm == 'reparam':
            grad_phi = opt.compute_gradients(loss, var_list=tf.trainable_variables('prop_phi'))
            train_op_phi = opt.apply_gradients(grad_phi, global_step=global_step)
        else:
            train_op_phi = tf.constant(1)

        return cell, bound, valid_bound, trajectories, lengths, train_op_theta, train_op_phi, global_step

    valid_best = -1000000
    cell, bound, valid_bound, trajectories, lengths, train_op_theta, train_op_phi, global_step = create_graph()
    sess = U.get_session()
    U.initialize()
    cur_step = 0

    saver = tf.train.Saver(max_to_keep=1)
    valid_saver = tf.train.Saver(max_to_keep=1)
    model_savepath = config.logdir + '/model.ckpt'
    valid_best_model_savepath = config.logdir + '/valid_best/valid_best_model.ckpt'
    if not os.path.exists(config.logdir):
        os.makedirs(config.logdir)
        os.makedirs(config.logdir + '/valid_best')
    ckpt = tf.train.get_checkpoint_state(config.logdir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        valid_saver.restore(sess, ckpt.model_checkpoint_path)
        cur_step = int(ckpt.model_checkpoint_path.split('-')[-1])
        print('Model and log loaded! (checkpoint_path=%s, cur_step=%d)'
              % (ckpt.model_checkpoint_path, cur_step))

    while cur_step < config.max_iter + 1:
        if config.algorithm == 'reparam':
            _, _, bound_value, valid_bound_value = sess.run(
                [train_op_theta, train_op_phi, bound, valid_bound])
        elif ("reinforce" in config.algorithm or "vimco" in config.algorithm
              or "vifle" in config.algorithm or "fr" in config.algorithm):
            _, bound_value, raw_seg, valid_bound_value, run_lengths = sess.run(
                [train_op_theta, bound, trajectories, valid_bound, lengths])
            cell.prop_update.update(raw_seg, run_lengths)
        else:
            raise ValueError("Undefined algorithm %s" % config.algorithm)

        # track the best model on the validation bound
        if valid_bound_value > valid_best and cur_step > config.init_steps:
            valid_best = valid_bound_value
            valid_best_model_saved_path = valid_saver.save(
                sess, valid_best_model_savepath, global_step=cur_step)
            print('Model saved: %s' % valid_best_model_saved_path)

        # periodically save the current model
        if cur_step % config.save_every == 0:
            model_saved_path = saver.save(sess, model_savepath, global_step=cur_step)
            print('Model saved: %s' % model_saved_path)

        cur_step += 1
def main():
    L.configure('/home/metalabadmin/exp/freeway', format_strs=['stdout', 'csv', 'tensorboard'])
    env = gym.make('Freeway-v0')
    env = wrapper.wrap_deepmind(env, frame_stack=True, scale=True)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
    network = Q_network(env.observation_space, env.action_space.n, optimizer, gamma=0.99, scope='freeway')
    m_controller = MetaController(network, env.action_space.n)

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(0.1 * 1e7), initial_p=1.0, final_p=0.02)
    replay = ReplayBuffer(50000)

    # get default tf session
    sess = U.get_session()
    U.initialize()
    sess.run(m_controller.network.update_target_op)

    step = 0
    episodes = 0
    rewards = 0
    mean_100ep_reward = 0
    total_reward = []
    saved_mean_reward = None
    model_saved = False
    ob = env.reset()
    while step <= 1e7:
        ep = exploration.value(step)
        ob_reshaped = np.reshape(ob, (1, ) + env.observation_space.shape)
        act = m_controller.sample_act(sess, ob_reshaped, update_eps=ep)[0]
        ob_tp1, reward_t, done_t, info = env.step(act)
        env.render()
        rewards += reward_t
        replay.add(ob, act, reward_t, ob_tp1, float(done_t))
        ob = ob_tp1

        # train every 4 steps
        if step >= 1000 and step % 4 == 0:
            obs, acts, rewards_t, obs_tp1, dones_t = replay.sample(64)
            weights, batch_idxes = np.ones_like(rewards_t), None
            # get q estimate for tp1 as 'supervised' target
            obs_tp1_reshaped = np.reshape(obs_tp1, (64, ) + env.observation_space.shape)
            q_tp1 = m_controller.get_q(sess, obs_tp1_reshaped)[0]
            td_error = m_controller.train(sess, obs, acts, rewards_t, obs_tp1, dones_t, weights, q_tp1)

        step += 1
        # update the target network periodically
        if step >= 1000 and step % 1000 == 0:
            sess.run(m_controller.network.update_target_op)

        if done_t:
            ob = env.reset()
            total_reward.append(rewards)
            episodes += 1
            rewards = 0
            print('step %d done %s, ep %.2f' % (step, str(done_t), ep))
            mean_100ep_reward = round(np.mean(total_reward[-101:-1]), 1)
            if episodes % 10 == 0 and episodes != 0:
                print('date time %s' % str(datetime.now()))
                L.record_tabular("steps", step)
                L.record_tabular("episodes", episodes)
                L.record_tabular("mean 100 episode reward", mean_100ep_reward)
                L.dump_tabular()

        if step % 1000 == 0:
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                L.log("Saving model due to mean reward increase: {} -> {}".format(
                    saved_mean_reward, mean_100ep_reward))
                U.save_variables('./freewaymodel.ckpt')
                model_saved = True
                saved_mean_reward = mean_100ep_reward
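
# LinearSchedule above is baselines' linear exploration schedule. As a reference, a minimal
# self-contained sketch of the same annealing rule: epsilon is interpolated from initial_p to
# final_p over schedule_timesteps and then held at final_p. This is a sketch for illustration,
# not a drop-in replacement for the imported class.
def linear_epsilon(step, schedule_timesteps, initial_p=1.0, final_p=0.02):
    fraction = min(float(step) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)

# e.g. with schedule_timesteps = int(0.1 * 1e7) as above:
# linear_epsilon(0, 1_000_000) == 1.0
# linear_epsilon(500_000, 1_000_000) == 0.51
# linear_epsilon(2_000_000, 1_000_000) == 0.02
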