def valid(self, name, sess, valid_feed): elbo_losses = [] rc_losses = [] rc_ppls = [] bow_losses = [] kl_losses = [] while True: batch = valid_feed.next_batch() if batch is None: break feed_dict = self.batch_2_feed(batch, None, use_prior=False, repeat=1) elbo_loss, bow_loss, rc_loss, rc_ppl, kl_loss = sess.run( [self.elbo, self.avg_bow_loss, self.avg_rc_loss, self.rc_ppl, self.avg_kld], feed_dict) elbo_losses.append(elbo_loss) rc_losses.append(rc_loss) rc_ppls.append(rc_ppl) bow_losses.append(bow_loss) kl_losses.append(kl_loss) avg_losses = self.print_loss(name, ["elbo_loss", "bow_loss", "rc_loss", "rc_peplexity", "kl_loss"], [elbo_losses, bow_losses, rc_losses, rc_ppls, kl_losses], "") logger.record_tabular("elbo_loss", avg_losses[0]) logger.record_tabular("bow_loss", avg_losses[1]) logger.record_tabular("rc_loss", avg_losses[2] ) logger.record_tabular("rc_peplexity", avg_losses[3]) logger.record_tabular("kl_loss", avg_losses[4]) logger.dump_tabular() return avg_losses[0]
def train(self): self.sess.run(tf.global_variables_initializer()) self.start_worker() start_time = time.time() total_samples = 0 for itr in range(0, self.n_itr): itr_start_time = time.time() logger.info('\n itr #%d' % itr) logger.info("Obtaining samples...") paths, n_samples = self.obtain_samples(itr) total_samples += n_samples logger.info("Processing samples...") samples_data = self.process_samples(itr, paths) logger.info("Optimizing policy...") self.optimize_policy(itr, samples_data) logger.info("Update stats...") self.update_stats(paths) logger.info("Fitting baseline...") self.fit_baseline(paths) logger.record_tabular('Time', time.time() - start_time) logger.record_tabular('ItrTime', time.time() - itr_start_time) logger.dump_tabular() self.shutdown_worker()
def log_tabular_results(returns, itr, train_collection): logger.clear_tabular() logger.record_tabular('Iteration', itr) logger.record_tabular('episode_mean', np.mean(returns)) logger.record_tabular('episode_min', np.min(returns)) logger.record_tabular('episode_max', np.max(returns)) logger.record_tabular('TotalSamples', train_collection.get_total_samples()) logger.dump_tabular()
def log_tabular_results(returns, itr, train_collection): logger.clear_tabular() logger.record_tabular('Iteration', itr) logger.record_tabular('AverageReturn', np.mean(returns)) logger.record_tabular('MinimumReturn', np.min(returns)) logger.record_tabular('MaximumReturn', np.max(returns)) logger.record_tabular('TotalSamples', train_collection.get_total_samples()) logger.dump_tabular()
def validate(val_loader, model, criterion, epoch): batch_time = 0 #AverageMeter() data_time = 0 #AverageMeter() losses = 0 #AverageMeter() all_accs = 0 #AverageMeter() cls_accs = 0 #AverageMeter() # switch to evaluate mode model.eval() end = time.time() for i, (input, target) in enumerate(val_loader): target = target.cuda() #(async=True) #target = target.cuda(async=True) input_var = torch.autograd.Variable(input, volatile=True) target_var = torch.autograd.Variable(target, volatile=True) # compute output output = model(input_var) loss = criterion(output, target_var) # measure accuracy and record loss all_acc, cls_acc = pascal_accuracy(output.data, target) # prec1, prec5 = pascal_accuracy(output.data, target, topk=(1, 5)) losses += loss all_accs += all_acc cls_accs += cls_acc # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: abs_batch_time = batch_time / (i + 1) abs_data_time = data_time / (i + 1) abs_losses = losses.item() / (i + 1) abs_all_accs = all_accs.item() / (i + 1) logger.log( 'Epoch: [{}][{}/{}]\t Time {}\t Data {}\t Loss {}\t All acs {} ' .format(epoch, i, len(train_loader), abs_batch_time, abs_data_time, abs_losses, abs_all_accs)) logger.log((cls_accs / (i + 1))) logger.record_tabular('val/loss', loss.item()) logger.record_tabular('val/accum_loss', abs_losses) logger.record_tabular('val/accum_all_acces', abs_all_accs) for i in range(cls_accs.shape[0]): logger.record_tabular('val/accum_cls_accs_{}'.format(i), cls_accs[i].item() / (i + 1)) logger.record_tabular('val/cls_accs_{}'.format(i), cls_acc[i].item()) logger.dump_tabular() return all_accs.item() / (i + 1)
def main(): logger.configure('logs/simulate') global T, n_bills, n_taxis, occupied results = [] for n_lanes in range(2, 10): bills, n_taxis_left, n_passengers_left = [], [], [] for seed in range(N_RUNS): np.random.seed(seed) occupied = [False for _ in range(n_lanes + 1)] T, n_bills, n_taxis, sta = 0, 0, 0, 0 lanes = [ Lane(i, n_lanes + 1, lam=0.1 / n_lanes) for i in range(n_lanes) ] enter = np.random.poisson(0.1, size=10000) while T < 10000: if sta == 0: if n_taxis < M: n_taxis += enter[T] else: sta = 1 elif n_taxis < N: sta = 0 for lane in lanes: lane.step() T += 1 bills.append(n_bills) n_taxis_left.append(n_taxis) n_passengers_left.append( np.sum([lane.n_passengers for lane in lanes])) results.append(bills) logger.record_tabular('lanes', n_lanes) logger.record_tabular('bills mean', np.mean(bills)) logger.record_tabular('bills std', np.std(bills)) logger.record_tabular('taxis mean', np.mean(n_taxis_left)) logger.record_tabular('passengers mean', np.mean(n_passengers_left)) logger.dump_tabular() df = pd.DataFrame(np.reshape(results, -1)).rename(columns={0: '# bills'}) df.insert(0, '# lanes', [i for i in range(2, 10) for _ in range(N_RUNS)], True) sns.boxplot(x='# lanes', y='# bills', data=df, showmeans=True, meanline=True) plt.grid(linestyle='--') plt.savefig('logs/simulate/boxplot.jpg') plt.show()
def main(policy_file, seed, n_test_rollouts, render): set_global_seeds(seed) # Load policy. with open(policy_file, 'rb') as f: policy = pickle.load(f) env_name = policy.info['env_name'] # Prepare params. params = config.DEFAULT_PARAMS if env_name in config.DEFAULT_ENV_PARAMS: params.update(config.DEFAULT_ENV_PARAMS[env_name] ) # merge env-specific parameters in params['env_name'] = env_name params = config.prepare_params(params) config.log_params(params, logger=logger) dims = config.configure_dims(params) eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], 'compute_Q': True, 'rollout_batch_size': 1, 'render': bool(render), } for name in ['T', 'gamma', 'noise_eps', 'random_eps']: eval_params[name] = params[name] evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) evaluator.seed(seed) # Run evaluation. evaluator.clear_history() for _ in range(n_test_rollouts): evaluator.generate_rollouts() # record logs for key, val in evaluator.logs('test'): logger.record_tabular(key, np.mean(val)) logger.dump_tabular()
def evaluate(env, bc_agent_wrapper, num_trajs, render, exact_model_path=None, model_ckpt_dir=None): """Evaluate a trained SAM agent""" # Only one of the two arguments can be provided assert sum([exact_model_path is None, model_ckpt_dir is None]) == 1 # Rebuild the computational graph pol = bc_agent_wrapper('pol') # Create episode generator traj_gen = traj_ep_generator(env, pol, render) # Initialize and load the previously learned weights into the freshly re-built graph U.initialize() if exact_model_path is not None: U.load_model(exact_model_path) logger.info( "model loaded from exact path:\n {}".format(exact_model_path)) else: # `exact_model_path` is None -> `model_ckpt_dir` is not None U.load_latest_checkpoint(model_ckpt_dir) logger.info("model loaded from ckpt dir:\n {}".format(model_ckpt_dir)) # Initialize the history data structures ep_lens = [] ep_env_rets = [] # Collect trajectories for i in range(num_trajs): logger.info("evaluating [{}/{}]".format(i + 1, num_trajs)) traj = traj_gen.__next__() ep_len, ep_env_ret = traj['ep_len'], traj['ep_env_ret'] # Aggregate to the history data structures ep_lens.append(ep_len) ep_env_rets.append(ep_env_ret) # Log some statistics of the collected trajectories ep_len_mean = np.mean(ep_lens) ep_env_ret_mean = np.mean(ep_env_rets) logger.record_tabular("ep_len_mean", ep_len_mean) logger.record_tabular("ep_env_ret_mean", ep_env_ret_mean) logger.dump_tabular()
def learn(env, model_path, data_path, policy_fn, *, rolloutSize, num_options=4, horizon=80, clip_param=0.025, ent_coeff=0.01, # clipping parameter epsilon, entropy coeff optim_epochs=10, mainlr=3.25e-4, intlr=1e-4, piolr=1e-4, termlr=5e-7, optim_batchsize=100, # optimization hypers gamma=0.99, lam=0.95, # advantage estimation max_iters=20, # time constraint adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) retrain=False, ): """ Core learning function """ ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space, num_options=num_options) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space, num_options=num_options) # Network for old policy atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") option = U.get_placeholder_cached(name="option") term_adv = U.get_placeholder(name='term_adv', dtype=tf.float32, shape=[None]) op_adv = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) betas = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) # Setup losses and stuff kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-ent_coeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] term_loss = pi.tpred * term_adv activated_options = tf.placeholder(dtype=tf.float32, shape=[None, num_options]) pi_w = tf.placeholder(dtype=tf.float32, shape=[None, num_options]) option_hot = tf.one_hot(option, depth=num_options) pi_I = (pi.intfc * activated_options) * pi_w / tf.expand_dims( tf.reduce_sum((pi.intfc * activated_options) * pi_w, axis=1), 1) pi_I = tf.clip_by_value(pi_I, 1e-6, 1 - 1e-6) int_loss = - tf.reduce_sum(betas * tf.reduce_sum(pi_I * option_hot, axis=1) * op_adv) intfc = tf.placeholder(dtype=tf.float32, shape=[None, num_options]) pi_I = (intfc * activated_options) * pi.op_pi / tf.expand_dims( tf.reduce_sum((intfc * activated_options) * pi.op_pi, axis=1), 1) pi_I = tf.clip_by_value(pi_I, 1e-6, 1 - 1e-6) op_loss = - tf.reduce_sum(betas * tf.reduce_sum(pi_I * option_hot, axis=1) * op_adv) log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-20, 1.0)) op_entropy = -tf.reduce_mean(pi.op_pi * log_pi, reduction_indices=1) op_loss -= 0.01 * tf.reduce_sum(op_entropy) var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult, option], losses + [U.flatgrad(total_loss, var_list)]) termgrad = U.function([ob, option, term_adv], [U.flatgrad(term_loss, var_list)]) # Since we will use a different step size. 
opgrad = U.function([ob, option, betas, op_adv, intfc, activated_options], [U.flatgrad(op_loss, var_list)]) # Since we will use a different step size. intgrad = U.function([ob, option, betas, op_adv, pi_w, activated_options], [U.flatgrad(int_loss, var_list)]) # Since we will use a different step size. adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses) U.initialize() adam.sync() episodes_so_far = 0 timesteps_so_far = 0 global iters_so_far iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=5) # rolling buffer for episode lengths rewbuffer = deque(maxlen=5) # rolling buffer for episode rewards datas = [0 for _ in range(num_options)] if retrain: print("Retraining to New Task !! ") time.sleep(2) U.load_state(model_path+'/') p = [] max_timesteps = int(horizon * rolloutSize * max_iters) while True: if max_iters and iters_so_far >= max_iters: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) render = False rollouts = sample_trajectory(pi, env, horizon=horizon, rolloutSize=rolloutSize, render=render) # Save rollouts data = {'rollouts': rollouts} p.append(data) del data data_file_name = data_path + 'rollout_data.pkl' pickle.dump(p, open(data_file_name, "wb")) add_vtarg_and_adv(rollouts, gamma, lam, num_options) opt_d = [] for i in range(num_options): dur = np.mean(rollouts['opt_dur'][i]) if len(rollouts['opt_dur'][i]) > 0 else 0. opt_d.append(dur) ob, ac, opts, atarg, tdlamret = rollouts["ob"], rollouts["ac"], rollouts["opts"], rollouts["adv"], rollouts["tdlamret"] atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values # Optimizing the policy for opt in range(num_options): indices = np.where(opts == opt)[0] print("Option- ", opt, " Batch Size: ", indices.size) opt_d[opt] = indices.size if not indices.size: continue datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent) if indices.size < optim_batchsize: print("Too few samples for opt - ", opt) continue optim_batchsize_corrected = optim_batchsize optim_epochs_corrected = np.clip(np.int(indices.size / optim_batchsize_corrected), 1, optim_epochs) print("Optim Epochs:", optim_epochs_corrected) logger.log("Optimizing...") # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs_corrected): losses = [] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize_corrected): *newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt]) adam.update(grads, mainlr * cur_lrmult) losses.append(newlosses) # Optimize termination functions termg = termgrad(rollouts["ob"], rollouts['opts'], rollouts["op_adv"])[0] adam.update(termg, termlr) # Optimize interest functions intgrads = intgrad(rollouts['ob'], rollouts['opts'], rollouts["last_betas"], rollouts["op_adv"], rollouts["op_probs"], rollouts["activated_options"])[0] adam.update(intgrads, intlr) # Optimize policy over 
options opgrads = opgrad(rollouts['ob'], rollouts['opts'], rollouts["last_betas"], rollouts["op_adv"], rollouts["intfc"], rollouts["activated_options"])[0] adam.update(opgrads, piolr) lrlocal = (rollouts["ep_lens"], rollouts["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("Success", rollouts["success"]) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() return pi
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. 
for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean( epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float( duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array([as_scalar(x) for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
def main(): # config for training config = Config() config.use_bow = False # config for validation valid_config = Config() valid_config.keep_prob = 1.0 valid_config.dec_keep_prob = 1.0 valid_config.batch_size = 60 valid_config.use_bow = False # configuration for testing test_config = Config() test_config.keep_prob = 1.0 test_config.dec_keep_prob = 1.0 test_config.batch_size = 1 test_config.use_bow = False pp(config) # get data set api = SWDADialogCorpus(FLAGS.data_dir, word2vec=FLAGS.word2vec_path, word2vec_dim=config.embed_size) dial_corpus = api.get_dialog_corpus() meta_corpus = api.get_meta_corpus() train_meta, valid_meta, test_meta = meta_corpus.get( "train"), meta_corpus.get("valid"), meta_corpus.get("test") train_dial, valid_dial, test_dial = dial_corpus.get( "train"), dial_corpus.get("valid"), dial_corpus.get("test") # convert to numeric input outputs that fits into TF models train_feed = SWDADataLoader("Train", train_dial, train_meta, config) valid_feed = SWDADataLoader("Valid", valid_dial, valid_meta, config) test_feed = SWDADataLoader("Test", test_dial, test_meta, config) # begin training sess_config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True) # sess_config.gpu_options.allow_growth = True sess_config.gpu_options.per_process_gpu_memory_fraction = 0.45 with tf.Session(config=sess_config) as sess: initializer = tf.random_uniform_initializer(-1.0 * config.init_w, config.init_w) scope = "model" with tf.variable_scope(scope, reuse=None, initializer=initializer): model = KgRnnCVAE(sess, config, api, log_dir=None if FLAGS.forward_only else log_dir, forward=False, scope=scope) with tf.variable_scope(scope, reuse=True, initializer=initializer): valid_model = KgRnnCVAE(sess, valid_config, api, log_dir=None, forward=False, scope=scope) with tf.variable_scope(scope, reuse=True, initializer=initializer): test_model = KgRnnCVAE(sess, test_config, api, log_dir=None, forward=True, scope=scope) test_model.prepare_mul_ref() logger.info("Created computation graphs") if api.word2vec is not None and not FLAGS.forward_only: logger.info("Loaded word2vec") sess.run(model.embedding.assign(np.array(api.word2vec))) # write config to a file for logging if not FLAGS.forward_only: with open(os.path.join(log_dir, "run.log"), "wb") as f: f.write(pp(config, output=False)) # create a folder by force ckp_dir = os.path.join(log_dir, "checkpoints") if not os.path.exists(ckp_dir): os.mkdir(ckp_dir) ckpt = tf.train.get_checkpoint_state(ckp_dir) logger.info("Created models with fresh parameters.") sess.run(tf.global_variables_initializer()) if ckpt: logger.info("Reading dm models parameters from %s" % ckpt.model_checkpoint_path) model.saver.restore(sess, ckpt.model_checkpoint_path) if not FLAGS.forward_only: dm_checkpoint_path = os.path.join( ckp_dir, model.__class__.__name__ + ".ckpt") global_t = 1 patience = 10 # wait for at least 10 epoch before stop dev_loss_threshold = np.inf best_dev_loss = np.inf for epoch in range(config.max_epoch): logger.info(">> Epoch %d with lr %f" % (epoch, sess.run(model.learning_rate_cyc, {model.global_t: global_t}))) # begin training if train_feed.num_batch is None or train_feed.ptr >= train_feed.num_batch: train_feed.epoch_init(config.batch_size, config.backward_size, config.step_size, shuffle=True) global_t, train_loss = model.train( global_t, sess, train_feed, update_limit=config.update_limit) # begin validation logger.record_tabular("Epoch", epoch) logger.record_tabular("Mode", "Val") valid_feed.epoch_init(valid_config.batch_size, 
valid_config.backward_size, valid_config.step_size, shuffle=False, intra_shuffle=False) valid_loss = valid_model.valid("ELBO_VALID", sess, valid_feed) logger.record_tabular("Epoch", epoch) logger.record_tabular("Mode", "Test") test_feed.epoch_init(valid_config.batch_size, valid_config.backward_size, valid_config.step_size, shuffle=False, intra_shuffle=False) valid_model.valid("ELBO_TEST", sess, test_feed) # test_feed.epoch_init(test_config.batch_size, test_config.backward_size, # test_config.step_size, shuffle=True, intra_shuffle=False) # test_model.test_mul_ref(sess, test_feed, num_batch=5) done_epoch = epoch + 1 # only save a models if the dev loss is smaller # Decrease learning rate if no improvement was seen over last 3 times. if config.op == "sgd" and done_epoch > config.lr_hold: sess.run(model.learning_rate_decay_op) if valid_loss < best_dev_loss: if valid_loss <= dev_loss_threshold * config.improve_threshold: patience = max(patience, done_epoch * config.patient_increase) dev_loss_threshold = valid_loss # still save the best train model if FLAGS.save_model: logger.info("Save model!!") model.saver.save(sess, dm_checkpoint_path) best_dev_loss = valid_loss if (epoch % 3) == 2: tmp_model_path = os.path.join( ckp_dir, model.__class__.__name__ + str(epoch) + ".ckpt") model.saver.save(sess, tmp_model_path) if config.early_stop and patience <= done_epoch: logger.info("!!Early stop due to run out of patience!!") break logger.info("Best validation loss %f" % best_dev_loss) logger.info("Done training") else: # begin validation # begin validation valid_feed.epoch_init(valid_config.batch_size, valid_config.backward_size, valid_config.step_size, shuffle=False, intra_shuffle=False) valid_model.valid("ELBO_VALID", sess, valid_feed) test_feed.epoch_init(valid_config.batch_size, valid_config.backward_size, valid_config.step_size, shuffle=False, intra_shuffle=False) valid_model.valid("ELBO_TEST", sess, test_feed) dest_f = open(os.path.join(log_dir, "test.txt"), "wb") test_feed.epoch_init(test_config.batch_size, test_config.backward_size, test_config.step_size, shuffle=False, intra_shuffle=False) test_model.test_mul_ref(sess, test_feed, num_batch=None, repeat=5, dest=dest_f) dest_f.close()
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50): assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. saver = tf.train.Saver() step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. 
for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append(eval_episode_reward) eval_episode_reward = 0. # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
def cartpole_train(self, rank, env, global_brain, agent, optimizer, global_t, send_rev, args): #global_total_loss = [] o = env.reset() step = 0 sum_rewards = 0 max_sum_rewards = 0 vs = [] entropies = [] sum_rewards = 0 done = True #cnt = 0 while global_t[0] < args.epoch: tmp = global_t.clone().item() + 1 #print('cnt:',cnt) agent.local_brain.sync(global_brain) # local policy にコピー observations = [] actions = [] values = [] rewards = [] probs = [] R = 0 for _ in range(args.local_t_max): global_t += 1 step += 1 # Agentのactで行動を取得 p, v = agent.local_brain(Variable(torch.from_numpy(o).float()).unsqueeze(0)) a = agent.act(o) if len(a.data.squeeze().size()) == 0: o, r, done, _ = env.step(a.data.squeeze().item()) else: o, r, done, _ = env.step(a.data.squeeze()[0]) if r != 1: print('-----------------------------------------------------------------------------------------------') if rank == 0: sum_rewards += r if args.render: env.render() observations.append(o) actions.append(a) values.append(v) rewards.append(r) probs.append(p) if done: o = env.reset() #self.total_reward = 0 if rank == 0: print('----------------------------------') print('total reward of the episode:', sum_rewards) print('----------------------------------') if args.save_mode == 'all': torch.save(agent.local_brain, os.path.join(args.log_dir, args.save_name+"_{}.pkl".format(global_t[0]))) elif args.save_mode == 'last': torch.save(agent.local_brain, os.path.join(args.log_dir, args.save_name+'.pkl')) elif args.save_mode == 'max': if max_sum_rewards < sum_rewards: torch.save(agent.local_brain, os.path.join(args.log_dir, args.save_name+'.pkl')) max_sum_rewards = sum_rewards step = 0 break else: #self.total_reward += r _, v = agent.local_brain(torch.from_numpy(o).unsqueeze(0).float()) R += v.data.squeeze().item() # -- Agent advantage_push_agent.local_brain() --- 割引報酬和の計算 returns = [] for r in rewards[::-1]: # 割引報酬和 R = r + 0.99 * R returns.insert(0, R) returns = torch.Tensor(returns) #if len(returns) > 1: # returns = (returns-returns.mean()) / (returns.std()+args.eps) # -- LocalBrain _build_graph() --- lossの計算 loss, v_loss, entropy, p_loss_list = agent._loss_function(actions, values, probs, returns, args) vs.append(v_loss.data.numpy()) entropies.append(entropy.data.numpy()) self.global_history_reward.append([tmp, sum_rewards]) ## 記録 if rank == 0 and done: logger.record_tabular_misc_stat('Entropy', entropies) logger.record_tabular_misc_stat('V', vs) logger.record_tabular('reward', sum_rewards) logger.record_tabular('step', global_t[0]) logger.dump_tabular() del vs[:] del entropies[:] sum_rewards = 0 # 重みの更新(最後まで) optimizer.zero_grad() final_node = [loss] + p_loss_list #print('final_node',final_node) gradients = [torch.ones(1)] + [None] * len(p_loss_list) #print('gradients',gradients) autograd.backward(final_node, gradients) #print('after_final_node',final_node) #print('after_gradients',gradients) #raise # 学習率の更新 new_lr = np.true_divide(args.epoch - global_t[0] , args.epoch * args.lr) optimizer.step(new_lr) # cnt += 1 send_rev.send(self.global_history_reward)
def main(): logger.configure('{}{}_logs'.format(filePath, envName)) for k, v in C.items(): logger.record_tabular(k, v) logger.dump_tabular() logger.log('MsPacman') #Start the session sess = tf.InteractiveSession() train_env = make_env(C['env_id'], C['noop_max']) eval_env = make_env(C['env_id'], C['noop_max']) #Intitialize variables to record outputs train_track = [0.0] eval_track = [] best_reward = 0 train_reward = tf.placeholder(tf.float32) eval_reward = tf.placeholder(tf.float32) train_env = make_env(C['env_id'], C['noop_max']) eval_env = make_env(C['env_id'], C['noop_max']) agent = Agent(train_env, C) train_fs = reset_fs() train_s = train_env.reset() best_reward = 0 train_mean = [] eval_mean = [] train_summary = tf.summary.scalar('train_reward', train_reward) eval_summary = tf.summary.scalar('eval_reward', eval_reward) writer = tf.summary.FileWriter('{}{}_summary'.format(filePath, envName), sess.graph) sess.run(tf.global_variables_initializer()) agent.net.update_target_network() for it in range(C['iterations']): train_fs.append(train_s) train_a = agent.act(np.transpose(train_fs, (1, 2, 0))) ns, train_r, train_d, _ = train_env.step(train_a) #print('Iteration ',it, ' Reward ', train_r) train_track[-1] += train_r agent.record(train_s, train_a, train_r, float(train_d), it) train_s = ns if train_d: if train_env.env.env.was_real_done: # one env for MsPacman, Freeway (No Fire action) if len(train_track) % 100 == 0: mean = np.mean(train_track[-100:]) train_mean.append(mean) summary = sess.run(train_summary, feed_dict={train_reward: mean}) writer.add_summary(summary, it) logger.record_tabular('steps', it) logger.record_tabular('episode', len(train_track)) logger.record_tabular('epsilon', 100 * agent.epsilon) logger.record_tabular('learning rate', agent.lr) logger.record_tabular('Mean Reward 100 episdoes', mean) logger.dump_tabular() with open(resultPath + 'reward_atari_base.pk1', 'wb') as f: pickle.dump(train_track, f, protocol=pickle.HIGHEST_PROTOCOL) train_track.append(0.0) train_fs = reset_fs() train_s = train_env.reset() if (it + 1) % C['eval_freq'] == 0: for i in range(C['eval_episodes']): temp_video = [] eval_track.append(0.0) eval_fs = reset_fs() eval_s = eval_env.reset() while True: temp_video.append(eval_s) eval_fs.append(eval_s) eval_a = agent.greedy_act(np.transpose(eval_fs, (1, 2, 0))) eval_s, eval_r, eval_d, _ = eval_env.step(eval_a) eval_track[-1] += eval_r if eval_env.env.env.was_real_done: break if eval_d: eval_fs = reset_fs() eval_s = eval_env.reset() if eval_track[-1] > best_reward: best_reward = eval_track[-1] best_video = temp_video with open(resultPath + 'video_atari_base.pk1', 'wb') as f: pickle.dump(best_video, f, protocol=pickle.HIGHEST_PROTOCOL) eval_mean.append(np.mean(eval_track[-C['eval_episodes']:])) summary = sess.run(eval_summary, feed_dict={ eval_reward: np.mean(eval_track[-C['eval_episodes']:]) }) writer.add_summary(summary, it) if it == 1000000: outputs = agent.net.get_outputs(np.transpose(train_fs, (1, 2, 0))) with open(resultPath + 'outputs.pk1', 'wb') as f: pickle.dump(outputs, f, protocol=pickle.HIGHEST_PROTOCOL) with open(resultPath + 'outputs_screen.pk1', 'wb') as f: pickle.dump(train_fs, f, protocol=pickle.HIGHEST_PROTOCOL) with open(resultPath + 'reward_atari_base.pk1', 'wb') as f: pickle.dump(train_track, f, protocol=pickle.HIGHEST_PROTOCOL) with open(resultPath + 'trainMean_atari_base.pk1', 'wb') as f: pickle.dump(train_mean, f, protocol=pickle.HIGHEST_PROTOCOL) with open(resultPath + 'evalMean_atari_base.pk1', 'wb') as f: pickle.dump(eval_mean, f, 
protocol=pickle.HIGHEST_PROTOCOL) agent.net.save(filePath + '{}_model2'.format(C['env_id'])) sess.close()
def testing( save_path, network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=50, nb_rollout_steps=3, reward_scale=1.0, render=False, render_eval=False, # no noise for test # noise_type='adaptive-param_0.2', # noise_type='normal_0.9', # noise_type='ou_0.9', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, # actor_lr=1e-6, # critic_lr=1e-5, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=3, # per epoch cycle and MPI worker, 50 nb_eval_steps=1, batch_size=64, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=3, # **network_kwargs): if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 rank = MPI.COMM_WORLD.Get_rank() # nb_actions = env.action_space.shape[-1] # nb_actions = 2*env.grid_size nb_actions = env.grid_size action_shape = np.array(nb_actions * [0]).shape nb_features = (4 + 1) * env.grid_size observation_shape = np.array(nb_features * [0]).shape grid_x = env.grid_x grid_y = env.grid_y x = [] y = [] for i in range(grid_x): x.append(i + 1) for i in range(grid_y): y.append(i + 1) # assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) memory = Memory(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None '''no noise for test''' # if noise_type is not None: # for current_noise_type in noise_type.split(','): # current_noise_type = current_noise_type.strip() # if current_noise_type == 'none': # pass # elif 'adaptive-param' in current_noise_type: # _, stddev = current_noise_type.split('_') # param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) # elif 'normal' in current_noise_type: # _, stddev = current_noise_type.split('_') # action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) # elif 'ou' in current_noise_type: # _, stddev = current_noise_type.split('_') # action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) # else: # raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) # max_action = env.action_space.high # logger.info('scaling actions by {} before executing in env'.format(max_action)) # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, agent = DDPG(actor, critic, memory, observation_shape, action_shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() # Prepare everything. 
# agent.initialize(sess) # sess.graph.finalize() agent.load(sess, save_path) agent.reset() obs, env_state = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] episode_reward = np.zeros(nenvs, dtype=np.float32) #vector episode_step = np.zeros(nenvs, dtype=int) # vector episodes = 0 #scalar t = 0 # scalar step_set = [] reward_set = [] epoch = 0 start_time = time.time() epoch_episode_rewards = [] average_reward = [] mean_epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_state = [] epoch_episodes = 0 #record the car numbers in each step car_num_set = {} t_set = [i for i in range(total_timesteps)] for xx in x: for yy in y: lab = str(xx) + str(yy) car_num_set[lab] = [[0 for i in range(total_timesteps)] for j in range(4)] for epoch in range(nb_epochs): obs, env_state = env.reset() epoch_actions = [] epoch_state = [] average_car_num_set = [] last_action = 1 for cycle in range(nb_epoch_cycles): # Perform rollouts. action, q, _, _ = agent.step(obs, apply_noise=False, compute_Q=True) '''random action''' # if np.random.rand()>0.5: # action=[1] # else: # action=[0] '''cycle light state''' # action=[0] '''cycle action (should cycle state instead of action)''' # if last_action==1: # action=[0] # else: # action=[1] # last_action=action[0] if nenvs > 1: # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each # of the environments, so resetting here instead agent.reset() for t_rollout in range(nb_rollout_steps): new_obs, r, env_state, done = env.step(action, env_state) epoch_state.append(env_state['11'].light_state) for xx in x: for yy in y: lab = str(xx) + str(yy) for i in range(4): car_num_set[lab][i][t] = ( env_state['11'].car_nums[i]) t += 1 episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) b = 1. agent.store_transition( obs, action, r, new_obs, done ) #the batched data will be unrolled in memory.py's append. obs = new_obs for d in range(len(done)): if done[d]: print('done') # Episode done. epoch_episode_rewards.append(episode_reward[d]) episode_rewards_history.append(episode_reward[d]) epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 if nenvs == 1: agent.reset() epoch_episode_rewards.append(episode_reward) average_reward.append(episode_reward / nb_rollout_steps) episode_reward = np.zeros(nenvs, dtype=np.float32) #vector # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] # for t_train in range(nb_train_steps): # # Adapt param noise, if necessary. # if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: # distance = agent.adapt_param_noise() # epoch_adaptive_distances.append(distance) # # print('Train!') # cl, al = agent.train() # epoch_critic_losses.append(cl) # epoch_actor_losses.append(al) # agent.update_target_net() # Evaluate. 
eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) eval_obs, eval_r, eval_done, eval_info = eval_env.step( eval_action) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append( eval_episode_reward[d]) eval_episode_reward[d] = 0.0 step_set.append(t) mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards)) # print(step_set,mean_epoch_episode_rewards) # plt.figure(figsize=(8,5)) '''plot rewards-steps''' ax1 = plt.subplot(2, 1, 1) plt.sca(ax1) plt.plot(step_set, average_reward, color='b') # plt.xlabel('Steps') plt.ylabel('Mean Reward', fontsize=12) # plt.ylim(-15000,0) '''plot queueing car numbers-steps''' ax2 = plt.subplot(2, 1, 2) plt.sca(ax2) print(np.shape(t_set), np.shape(car_num_set['11'][i])) for i in range(4): if i == 0: plt.plot(t_set, car_num_set['11'][i], '--', label=i, color='b') elif i == 1: plt.plot(t_set, car_num_set['11'][i], '--', label=i, color='orange') elif i == 2: plt.plot(t_set, car_num_set['11'][i], label=i, color='g') else: plt.plot(t_set, car_num_set['11'][i], label=i, color='r') plt.ylim(0, 100) #sum among roads sum_car_num = np.sum(car_num_set['11'], axis=0) #average among time steps average_car_num = np.average(sum_car_num) average_car_num_set.append(average_car_num) plt.xlabel('Steps', fontsize=12) plt.ylabel('Cars Numbers', fontsize=12) # set legend handles, labels = plt.gca().get_legend_handles_labels() by_label = OrderedDict(zip(labels, handles)) leg = plt.legend(by_label.values(), by_label.keys(), loc=1) # leg = plt.legend(loc=4) legfm = leg.get_frame() legfm.set_edgecolor('black') # set legend fame color legfm.set_linewidth(0.5) # set legend fame linewidth plt.savefig('ddpg_mean_test.pdf') plt.show() print(epoch_state) # Evaluation statistics. 
if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array( [np.array(x).flatten()[0] for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) print('average queueing car numbers: ', np.average(average_car_num_set)) return agent
def train(args): config = tf.ConfigProto() config.gpu_options.allow_growth = True dataset = pickle.load( open("data/" + args.expert_file + "_" + str(args.num_sampled), "rb")) dataset.min_reward = 0 dataset.max_reward = 1 action_getter = utils.ActionGetter( atari.env.action_space.n, replay_memory_start_size=REPLAY_MEMORY_START_SIZE, max_frames=MAX_FRAMES, eps_initial=args.initial_exploration) utils.generate_weights(dataset) saver = tf.train.Saver(max_to_keep=10) sess = tf.Session(config=config) sess.run(init) fixed_state = np.expand_dims(atari.fixed_state(sess), axis=0) if args.checkpoint_index >= 0: saver.restore( sess, args.checkpoint_dir + args.env_id + "/" + "seed_" + str(args.seed) + "/" + "model--" + str(args.checkpoint_index)) print( "Loaded Model ... ", args.checkpoint_dir + args.env_id + "seed_" + str(args.seed) + "/" + "model--" + str(args.checkpoint_index)) logger.configure(args.log_dir + args.env_id + "/" + "seed_" + str(args.seed) + "/") if not os.path.exists(args.gif_dir + args.env_id + "/" + "seed_" + str(args.seed) + "/"): os.makedirs(args.gif_dir + args.env_id + "/" + "seed_" + str(args.seed) + "/") if not os.path.exists(args.checkpoint_dir + args.env_id + "/" + "seed_" + str(args.seed) + "/"): os.makedirs(args.checkpoint_dir + args.env_id + "/" + "seed_" + str(args.seed) + "/") frame_number = 0 loss_list = [] epoch = 0 while frame_number < MAX_FRAMES: print("Training Model ...") epoch_frame = 0 start_time = time.time() for j in tqdm(range(EVAL_FREQUENCY // BS)): loss = learn(sess, dataset, MAIN_DQN, TARGET_DQN, BS, gamma=DISCOUNT_FACTOR) # (8★) loss_list.append(loss) # Output the progress: # logger.log("Runing frame number {0}".format(frame_number)) logger.record_tabular("frame_number", frame_number) logger.record_tabular("td loss", np.mean(loss_list[-100:])) q_vals = sess.run(MAIN_DQN.action_prob, feed_dict={MAIN_DQN.input: fixed_state}) for i in range(atari.env.action_space.n): logger.record_tabular("q_val action {0}".format(i), q_vals[0, i]) utils.test_q_values(sess, dataset, atari, action_getter, MAIN_DQN, MAIN_DQN.input, MAIN_DQN.action_prob_expert, BS) print("Current Frame: ", frame_number) print("TD Loss: ", np.mean(loss_list[-100:])) # Evaluation ... gif = True frames_for_gif = [] eval_rewards = [] evaluate_frame_number = 0 print("Evaluating Model.... ") while evaluate_frame_number < EVAL_STEPS: terminal_life_lost = atari.reset(sess, evaluation=True) episode_reward_sum = 0 for _ in range(MAX_EPISODE_LENGTH): # Fire (action 1), when a life was lost or the game just started, # so that the agent does not stand around doing nothing. When playing # with other environments, you might want to change this... 
action = 1 if terminal_life_lost and args.env_id == "BreakoutDeterministic-v4" else action_getter.get_action( sess, frame_number, atari.state, MAIN_DQN, evaluation=True) processed_new_frame, reward, terminal, terminal_life_lost, new_frame = atari.step( sess, action) evaluate_frame_number += 1 episode_reward_sum += reward if gif: frames_for_gif.append(new_frame) if terminal: eval_rewards.append(episode_reward_sum) gif = False # Save only the first game of the evaluation as a gif break if len(eval_rewards) % 10 == 0: print("Evaluation Completion: ", str(evaluate_frame_number) + "/" + str(EVAL_STEPS)) print("Evaluation score:\n", np.mean(eval_rewards)) try: utils.generate_gif( frame_number, frames_for_gif, eval_rewards[0], args.gif_dir + args.env_id + "/" + "seed_" + str(args.seed) + "/") except IndexError: print("No evaluation game finished") logger.log("Average Evaluation Reward", np.mean(eval_rewards)) logger.log("Average Sequence Length", evaluate_frame_number / len(eval_rewards)) # Save the network parameters saver.save(sess, args.checkpoint_dir + args.env_id + "/" + "seed_" + str(args.seed) + "/" + 'model-', global_step=frame_number) print("Runtime: ", time.time() - start_time) print("Epoch: ", epoch, "Total Frames: ", frame_number) epoch += 1 logger.dumpkvs()
def train(env, nb_epochs, nb_epoch_cycles, normalize_observations, actor_lr, critic_lr, action_noise, gamma, nb_train_steps, nb_rollout_steps, batch_size, memory, tau=0.01): max_action = env.action_space.high agent = DDPG(memory, env.observation_space.shape[0], env.action_space.shape[0], gamma=gamma, tau=tau, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, actor_lr=actor_lr, critic_lr=critic_lr, ) if USE_CUDA: agent.cuda() # Set up logging stuff only for a single worker. step = 0 episode = 0 episode_rewards_history = deque(maxlen=100) # Prepare everything. agent.reset() obs = env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. assert max_action.shape == action.shape new_obs, r, done, info = env.step(max_action * action) t += 1 episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] for t_train in range(nb_train_steps): cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time combined_stats = dict() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('')
def process_samples(self, itr, paths): baselines = [] returns = [] all_path_baselines = [ self.algo.baseline.predict(path) for path in paths ] for idx, path in enumerate(paths): path_baselines = np.append(all_path_baselines[idx], 0) deltas = path["rewards"] + \ self.algo.discount * path_baselines[1:] - \ path_baselines[:-1] path["advantages"] = misc_utils.discount_cumsum( deltas, self.algo.discount * self.algo.gae_lambda) path["returns"] = misc_utils.discount_cumsum( path["rewards"], self.algo.discount) baselines.append(path_baselines[:-1]) returns.append(path["returns"]) # a trick to reduce variance but gives biased gradient path["value_targets"] = path["advantages"] + np.array( path_baselines[:-1]) ev = misc_utils.explained_variance_1d(np.concatenate(baselines), np.concatenate(returns)) max_path_length = max([len(path["advantages"]) for path in paths]) # make all paths the same length (pad extra advantages with 0) obs = [path["observations"] for path in paths] obs = tensor_utils.pad_tensor_n(obs, max_path_length) if self.algo.center_adv: raw_adv = np.concatenate([path["advantages"] for path in paths]) adv_mean = np.mean(raw_adv) adv_std = np.std(raw_adv) + 1e-8 adv = [(path["advantages"] - adv_mean) / adv_std for path in paths] else: adv = [path["advantages"] for path in paths] adv = np.asarray( [tensor_utils.pad_tensor(a, max_path_length) for a in adv]) actions = [path["actions"] for path in paths] actions = tensor_utils.pad_tensor_n(actions, max_path_length) rewards = [path["rewards"] for path in paths] rewards = tensor_utils.pad_tensor_n(rewards, max_path_length) returns = [path["returns"] for path in paths] returns = tensor_utils.pad_tensor_n(returns, max_path_length) agent_infos = [path["agent_infos"] for path in paths] agent_infos = tensor_utils.stack_tensor_dict_list([ tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos ]) env_infos = [path["env_infos"] for path in paths] env_infos = tensor_utils.stack_tensor_dict_list([ tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos ]) valids = [np.ones_like(path["returns"]) for path in paths] valids = tensor_utils.pad_tensor_n(valids, max_path_length) average_discounted_return = \ np.mean([path["returns"][0] for path in paths]) undiscounted_returns = [sum(path["rewards"]) for path in paths] samples_data = dict( observations=obs, actions=actions, advantages=adv, rewards=rewards, returns=returns, valids=valids, agent_infos=agent_infos, env_infos=env_infos, paths=paths, ) logger.record_tabular('Iteration', itr) logger.record_tabular('AverageDiscountedReturn', average_discounted_return) logger.record_tabular('AverageReturn', np.mean(undiscounted_returns)) logger.record_tabular('ExplainedVariance', ev) logger.record_tabular('NumTrajs', len(paths)) logger.record_tabular('StdReturn', np.std(undiscounted_returns)) logger.record_tabular('MaxReturn', np.max(undiscounted_returns)) logger.record_tabular('MinReturn', np.min(undiscounted_returns)) return samples_data
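# process_samples above relies on misc_utils.discount_cumsum to turn TD residuals into
# GAE advantages and rewards into discounted returns. A plain-numpy sketch of that
# computation (a stand-in, not the project's actual helper):
import numpy as np

def discount_cumsum(x, discount):
    """y[t] = x[t] + discount * x[t+1] + discount**2 * x[t+2] + ..."""
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out

def gae_advantages(rewards, baselines, discount=0.99, gae_lambda=0.95):
    """Generalized Advantage Estimation for one trajectory (terminal value assumed 0)."""
    path_baselines = np.append(baselines, 0.0)
    deltas = rewards + discount * path_baselines[1:] - path_baselines[:-1]
    advantages = discount_cumsum(deltas, discount * gae_lambda)
    returns = discount_cumsum(rewards, discount)
    return advantages, returns

adv, ret = gae_advantages(np.array([1.0, 0.0, 1.0]), np.array([0.5, 0.4, 0.3]))
print(adv, ret)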
def obtain_samples(self, itr, dynamics=None): logger.info("Obtaining samples for iteration %d..." % itr) paths = [] n_samples = 0 obses = self.vec_env.reset() # dones = np.asarray([True] * self.vec_env.num_envs) running_paths = [None] * self.vec_env.num_envs policy_time = 0 env_time = 0 process_time = 0 policy = self.algo.policy import time while n_samples < self.algo.batch_size: t = time.time() actions, agent_infos = policy.get_actions(obses) policy_time += time.time() - t t = time.time() next_obses, rewards, dones, env_infos = self.vec_env.step(actions) if dynamics: rewards = dynamics.process_rewards(rewards, obses, actions, next_obses) env_time += time.time() - t agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) if env_infos is None: env_infos = [dict() for _ in range(self.vec_env.num_envs)] if agent_infos is None: agent_infos = [dict() for _ in range(self.vec_env.num_envs)] for idx, obs, action, reward, env_info, agent_info, done in zip( itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones): if running_paths[idx] is None: running_paths[idx] = dict( observations=[], actions=[], rewards=[], env_infos=[], agent_infos=[], ) running_paths[idx]["observations"].append(obs) running_paths[idx]["actions"].append(action) running_paths[idx]["rewards"].append(reward) running_paths[idx]["env_infos"].append(env_info) running_paths[idx]["agent_infos"].append(agent_info) if done: paths.append( dict( observations=self.algo.env.observation_space. flatten_n(running_paths[idx]["observations"]), actions=self.algo.env.action_space.flatten_n( running_paths[idx]["actions"]), rewards=tensor_utils.stack_tensor_list( running_paths[idx]["rewards"]), env_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]["env_infos"]), agent_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]["agent_infos"]), )) n_samples += len(running_paths[idx]["rewards"]) running_paths[idx] = None process_time += time.time() - t obses = next_obses logger.record_tabular("PolicyExecTime", policy_time) logger.record_tabular("EnvExecTime", env_time) logger.record_tabular("ProcessExecTime", process_time) return paths, n_samples
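# Both samplers above lean on tensor_utils helpers to pad variable-length trajectories
# to a common length and to stack per-step dicts. Rough numpy stand-ins, assuming the
# helpers pad along axis 0 with zeros (these are sketches, not the library's code):
import numpy as np

def pad_tensor(x, max_len):
    x = np.asarray(x)
    padded = np.zeros((max_len,) + x.shape[1:], dtype=x.dtype)
    padded[:len(x)] = x
    return padded

def pad_tensor_n(xs, max_len):
    return np.stack([pad_tensor(x, max_len) for x in xs])

def stack_tensor_dict_list(dict_list):
    """Turn a list of dicts of arrays into a dict of stacked arrays."""
    return {k: np.stack([d[k] for d in dict_list]) for k in dict_list[0]}

paths = [{"rewards": np.ones(3)}, {"rewards": np.ones(5)}]
print(pad_tensor_n([p["rewards"] for p in paths], 5).shape)   # (2, 5)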
def cartpole_train_3_5(self, rank, args): torch.manual_seed(args.seed + rank) self.agent.local_brain.train() step = 0 sum_rewards = 0 max_sum_rewards = 0 vs = [] entropies = [] cnt = 0 while self.g_ep.value < args.epoch: #tmp = 0 o = self.env.reset() #o = torch.from_numpy(state) #print('cnt:',cnt) # self.agent.local_brain.sync(self.global_brain) # local policy にコピー observations, actions, values, rewards, probs = [], [], [], [], [] #R = 0 #done = True ep_r = 0. while True: step += 1 # Agentのactで行動を取得 p, v = self.agent.local_brain(Variable(torch.from_numpy(o).float()).unsqueeze(0)) a = self.agent.act(o) if len(a.data.squeeze().size()) == 0: o, r, done, _ = self.env.step(a.data.squeeze().item()) else: o, r, done, _ = self.env.step(a.data.squeeze()[0]) if done: r = -1 if rank == 0: sum_rewards += r if args.render: self.env.render() ep_r += r observations.append(o) actions.append(a) values.append(v) rewards.append(r) probs.append(p) if step % args.local_t_max == 0 or done: if done: R = 0 else: _, v = self.agent.local_brain(torch.from_numpy(observations[-1]).unsqueeze(0).float()) R = v.data.squeeze().item() returns = [] for r in rewards[::-1]: # 割引報酬和 R = r + 0.99 * R returns.insert(0, R) returns = torch.Tensor(returns) loss, v_loss, entropy, _ = self.agent._loss_function(actions, values, probs, returns, args) vs.append(v_loss.data.numpy()) entropies.append(entropy.data.numpy()) ## 記録 if rank == 0 and done: logger.record_tabular_misc_stat('Entropy', entropies) logger.record_tabular_misc_stat('V', vs) logger.record_tabular('reward', sum_rewards) logger.record_tabular('step', self.g_ep.value) logger.dump_tabular() del vs[:] del entropies[:] self.optimizer.zero_grad() loss.backward(retain_graph=True) for lp, gp in zip(self.agent.local_brain.parameters(), self.global_brain.parameters()): gp._grad = lp.grad self.optimizer.step() self.agent.local_brain.sync(self.global_brain) # local policy にコピー observations, actions, values, rewards, probs = [], [], [], [], [] if done: with self.g_ep.get_lock(): self.g_ep.value += 1 with self.g_ep_r.get_lock(): if self.g_ep_r.value == 0.: self.g_ep_r.value = ep_r else: self.g_ep_r.value = self.g_ep_r.value * 0.99 + ep_r * 0.01 self.res_queue.put(self.g_ep_r.value) o = self.env.reset() #self.global_history_reward.append([tmp, self.total_reward]) self.total_reward = 0 if rank == 0: print('----------------------------------') print('total reward of the episode:', sum_rewards) print('----------------------------------') if args.save_mode == 'all': torch.save(self.agent.local_brain, os.path.join(args.log_dir, args.save_name+"_{}.pkl".format(self.g_ep.value))) elif args.save_mode == 'last': torch.save(self.agent.local_brain, os.path.join(args.log_dir, args.save_name+'.pkl')) elif args.save_mode == 'max': if max_sum_rewards < sum_rewards: torch.save(self.agent.local_brain, os.path.join(args.log_dir, args.save_name+'.pkl')) max_sum_rewards = sum_rewards #step = 0 sum_rewards = 0 break #raise # 学習率の更新 # new_lr = np.true_divide(args.epoch - global_t[0] , args.epoch * args.lr) # self.optimizer.step(new_lr) cnt += 1 #send_rev.send(self.global_history_reward) self.res_queue.put(None)
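# The inner "for r in rewards[::-1]" loop above computes n-step bootstrapped returns,
# seeding R with the critic's value of the last observation unless the episode ended.
# Isolated as a small helper (illustrative, not the original code):
def nstep_returns(rewards, bootstrap_value, done, gamma=0.99):
    """Discounted n-step returns; bootstrap with V(s_T) only if the rollout was cut off."""
    R = 0.0 if done else bootstrap_value
    returns = []
    for r in reversed(rewards):
        R = r + gamma * R
        returns.insert(0, R)
    return returns

print(nstep_returns([1.0, 1.0, 1.0], bootstrap_value=0.5, done=False))
print(nstep_returns([1.0, 1.0, -1.0], bootstrap_value=0.5, done=True))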
def train(policy, planner, rollout_worker, evaluator, n_epochs, n_test_rollouts, n_cycles, n_batches, policy_save_interval, save_path, **kwargs): rank = MPI.COMM_WORLD.Get_rank() if save_path: latest_mdl_path = save_path + '_latest' best_mdl_path = save_path periodic_policy_path = save_path + '_{}' best_success_rate = -1 logger.info('Training......') # num_timesteps = n_epochs * n_cycles * rollout_length * number of rollout workers for epoch in range(n_epochs): logger.info('========== epoch {} ========='.format(epoch)) logger.record_tabular('epoch', epoch) # train rollout_worker.clear_history() for _ in range(n_cycles): # logger.info('collect rollouts...') episode_for_act, episode_for_pln = rollout_worker.generate_rollouts( cur_progress=(epoch / n_epochs)) # logger.info('store rollouts for policy') policy.store_episode(episode_for_act) # logger.info('store rollouts for planner, episodes_for_pln shape:', episode_for_pln.shape) planner.store_episode(episode_for_pln) # logger.info('training policy') for _ in range(n_batches): policy.train() policy.update_target_net() # logger.info('training planner') for _ in range(n_batches): planner.train(use_buffer=True) # test # logger.info("evaluate...") evaluator.clear_history() for ro in range(n_test_rollouts): evaluator.generate_rollouts() for key, val in evaluator.logs('test'): logger.record_tabular(key, mpi_average(val)) for key, val in rollout_worker.logs('train'): logger.record_tabular(key, mpi_average(val)) for key, val in policy.logs(): logger.record_tabular(key, mpi_average(val)) for key, val in planner.logs(): logger.record_tabular(key, mpi_average(val)) if rank == 0: logger.dump_tabular() success_rate = mpi_average(evaluator.current_success_rate()) if rank == 0 and success_rate >= best_success_rate and save_path: best_success_rate = success_rate # logger.info('New best success rate: {}. Saving policy to {} ...'.format(best_success_rate, best_policy_path)) # evaluator.save_policy(latest_mdl_path) logger.info( 'Saving best policy+planner to {} ...'.format(best_mdl_path)) evaluator.save_policy(best_mdl_path) evaluator.save_planner(best_mdl_path) if rank == 0 and policy_save_interval > 0 and epoch % policy_save_interval == 0 and save_path: # policy_path = periodic_policy_path.format(epoch) logger.info('Saving lastest policy+planner to {} ...'.format( latest_mdl_path)) evaluator.save_policy(latest_mdl_path) evaluator.save_planner(latest_mdl_path) elif rank == 0 and policy_save_interval < 0 and epoch % ( -policy_save_interval) == 0 and save_path: periodic_mdl_path = periodic_policy_path.format(epoch) logger.info('Saving periodic policy+planner to {} ...'.format( periodic_mdl_path)) evaluator.save_policy(periodic_mdl_path) evaluator.save_planner(periodic_mdl_path) local_uniform = np.random.uniform(size=(1, )) root_uniform = local_uniform.copy() MPI.COMM_WORLD.Bcast(root_uniform, root=0) if rank != 0: assert local_uniform[0] != root_uniform[0] return policy, planner
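# The loop above logs mpi_average(val) so that every worker reports the same number.
# A stand-in built on mpi4py (which the function already uses via MPI.COMM_WORLD); run
# with a single process it simply returns the local mean. This is a sketch of the
# assumed behaviour, not the project's own mpi_average:
import numpy as np
from mpi4py import MPI

def mpi_average(value):
    if not isinstance(value, (list, tuple, np.ndarray)):
        value = [value]
    local = np.asarray(value, dtype=np.float64)
    local_sum = np.array([local.sum(), local.size], dtype=np.float64)
    global_sum = np.zeros_like(local_sum)
    MPI.COMM_WORLD.Allreduce(local_sum, global_sum, op=MPI.SUM)  # elementwise sum across ranks
    return global_sum[0] / global_sum[1]

print(mpi_average([1.0, 2.0, 3.0]))   # 2.0 when run on one rank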
def main(): logger.configure('logs/long_short') global T, PRIORITY, DICHOTOMY, ENTROPY for PRIORITY, DICHOTOMY, ENTROPY in [ (True, False, True), (True, False, False), (False, True, False), (False, False, False), ]: income_means, income_stds = [], [] short_ratios, long_ratios = [], [] short_passengers, long_passengers = [], [] for seed in range(N_RUNS): np.random.seed(seed) T = 0 g_lanes.clear() g_lanes.update({'short': Lane(), 'long': Lane()}) # short_passengers, long_passengers = [], [] enter_passengers = np.random.poisson(0.1, size=LENGTH) g_taxis.clear() for i in range(N_TAXIS // 2): g_taxis.append(Taxi(i)) enter(g_taxis[-1], g_lanes['short']) for i in range(N_TAXIS // 2): g_taxis.append(Taxi(i + N_TAXIS // 2)) enter(g_taxis[-1], g_lanes['long']) while T < LENGTH: if enter_passengers[T]: dist = max( 2, np.random.choice(range(len(DISTANCES)), p=DISTANCES)) p = Passenger(dist) if not DICHOTOMY: lane = RANDOM_LANE() elif p.distance <= THRESHOLD: lane = g_lanes['short'] else: lane = g_lanes['long'] lane.passengers.append(p) g_lanes['short'].step() g_lanes['long'].step() for taxi in g_taxis: taxi.step() short_passengers.append(len(g_lanes['short'].passengers)) long_passengers.append(len(g_lanes['long'].passengers)) T += 1 incomes = [np.sum(t.incomes) for t in g_taxis] income_means.append(np.mean(incomes)) income_stds.append(np.std(incomes)) short_ratios.append( np.mean([r for t in g_taxis for r in t.income_ratio['short']])) long_ratios.append( np.mean([r for t in g_taxis for r in t.income_ratio['long']])) # logger.info(income_means) # logger.info(income_stds) logger.record_tabular('*priority*', PRIORITY) logger.record_tabular('*dichotomy*', DICHOTOMY) logger.record_tabular('*entropy*', ENTROPY) logger.record_tabular('income mean', np.mean(income_means)) logger.record_tabular('income std', np.mean(income_stds)) logger.record_tabular('queuing time mean', np.mean([t.queue_time for t in g_taxis])) logger.record_tabular('short income ratio mean', np.mean(short_ratios) * 3600) logger.record_tabular('short income ratio std', np.std(short_ratios) * 3600) logger.record_tabular('long income ratio mean', np.mean(long_ratios) * 3600) logger.record_tabular('long income ratio std', np.std(long_ratios) * 3600) logger.record_tabular('# short lane passengers', np.mean(short_passengers)) logger.record_tabular('# long lane passengers', np.mean(long_passengers)) logger.dump_tabular()
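# main() above repeats the simulation over several seeds per configuration and logs one
# tabular row of mean/std statistics per configuration. The aggregation pattern, with a
# placeholder simulate() standing in for the taxi model:
import numpy as np

def simulate(config, seed):
    """Placeholder for one seeded simulation run returning a scalar outcome."""
    rng = np.random.RandomState(seed)
    return config + rng.randn()

N_RUNS = 5
for config in [0.0, 1.0]:
    outcomes = [simulate(config, seed) for seed in range(N_RUNS)]
    print('config=%.1f  mean=%.3f  std=%.3f' % (config, np.mean(outcomes), np.std(outcomes)))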
def learn(env, sess, seed, nsteps=5, total_timesteps=int(80e4), discount=0.5, entropy_coeff=0.01, lr=7e-4, lr_decay=0.99, fuzz_factor=0.00001, max_grad_norm=0.5, log_interval=100): env.init() action_set = env.getActionSet() n_actions = len(action_set) state_dim = env.getGameState().size # Reset environment total_returns = [] # Init actorCritic actor_critic = ActorCritic(state_dim, n_actions, nsteps, discount, entropy_coeff, lr, lr_decay, fuzz_factor, total_timesteps, max_grad_norm) sim = Simulation(env, actor_critic, nsteps=nsteps, discount=discount) sim.start_episode() e_cnt = 0 for nupdate in range(int(total_timesteps / nsteps)): if env.game_over(): # done = True total_returns.append(sim.total_return) sim.start_episode() e_cnt = e_cnt + 1 # Collect n-step trajectories obs, rewards, actions, values, dones, states = sim.run_nsteps() # Update train_model policy_loss, value_loss, policy_entropy, a_dist = \ actor_critic.train(obs, actions, rewards, values, dones, states) # print('action probs:') # print(ap[0], a) if nupdate % log_interval == 0 or nupdate == 1: # ev = explained_variance(values, rewards) logger.record_tabular("nupdates", nupdate) logger.record_tabular("nepisode", e_cnt) # logger.record_tabular("total_timesteps", nupdate * nsteps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("policy_loss", float(policy_loss)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular( "avg. total return", np.mean(total_returns[-(min(len(total_returns), 100)):])) # logger.record_tabular("explained_variance", float(ev)) logger.dump_tabular() return actor_critic
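# learn() above reports the mean of the last (up to) 100 episode returns every
# log_interval updates. A deque with maxlen keeps the same rolling window without
# re-slicing a growing list (a sketch of the idea, not the original code):
from collections import deque
import numpy as np

recent_returns = deque(maxlen=100)   # rolling window of the last 100 episode returns
log_interval = 100
for nupdate in range(1, 501):
    recent_returns.append(float(np.random.randn()))   # stand-in for a finished episode's return
    if nupdate % log_interval == 0:
        print('nupdates=%d  avg. total return=%.3f' % (nupdate, np.mean(recent_returns)))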
def learn(env, policy_func, reward_giver, reward_guidance, expert_dataset, rank, pretrained, pretrained_weight, *, g_step, d_step, entcoeff, save_per_iter, ckpt_dir, log_dir, timesteps_per_batch, task_name, gamma, lam, algo, max_kl, cg_iters, cg_damping=1e-2, vf_stepsize=3e-4, d_stepsize=1e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, loss_percent=0.0, callback=None): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space policy = build_policy(env, 'mlp', value_network='copy') ob = observation_placeholder(ob_space) with tf.variable_scope('pi'): pi = policy(observ_placeholder=ob) with tf.variable_scope('oldpi'): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables('pi') # var_list = [v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")] # vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") # assert len(var_list) == len(vf_var_list) + 1 d_adam = MpiAdam(reward_giver.get_trainable_variables()) guidance_adam = MpiAdam(reward_guidance.get_trainable_variables()) vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) # pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables('oldpi'), get_variables('pi')) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) d_adam.sync() guidance_adam.sync() vfadam.sync() if rank == 0: 
print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, reward_giver, reward_guidance, timesteps_per_batch, stochastic=True, algo=algo, loss_percent=loss_percent) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards true_rewbuffer = deque(maxlen=40) assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 g_loss_stats = stats(loss_names) d_loss_stats = stats(reward_giver.loss_name) ep_stats = stats(["True_rewards", "Rewards", "Episode_length"]) # if provide pretrained weight if pretrained_weight is not None: U.load_state(pretrained_weight, var_list=pi.get_variables()) while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break # Save model if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None: fname = os.path.join(ckpt_dir, task_name) os.makedirs(os.path.dirname(fname), exist_ok=True) saver = tf.train.Saver() saver.save(tf.get_default_session(), fname) logger.log("********** Iteration %i ************" % iters_so_far) # global flag_render # if iters_so_far > 0 and iters_so_far % 10 ==0: # flag_render = True # else: # flag_render = False def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p # ------------------ Update G ------------------ logger.log("Optimizing Policy...") for _ in range(g_step): with timed("sampling"): seg = seg_gen.__next__() print('rewards', seg['rew']) add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg[ "vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] assign_old_eq_new( ) # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. 
shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=128): if hasattr(pi, "ob_rms"): pi.ob_rms.update( mbob) # update running mean/std for policy g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) g_losses = meanlosses for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) # ------------------ Update D ------------------ logger.log("Optimizing Discriminator...") logger.log(fmt_row(13, reward_giver.loss_name)) ob_expert, ac_expert = expert_dataset.get_next_batch( batch_size=len(ob)) batch_size = 128 d_losses = [ ] # list of tuples, each of which gives the loss for a minibatch with timed("Discriminator"): for (ob_batch, ac_batch) in dataset.iterbatches( (ob, ac), include_final_partial_batch=False, batch_size=batch_size): ob_expert, ac_expert = expert_dataset.get_next_batch( batch_size=batch_size) # update running mean/std for reward_giver if hasattr(reward_giver, "obs_rms"): reward_giver.obs_rms.update( np.concatenate((ob_batch, ob_expert), 0)) *newlosses, g = reward_giver.lossandgrad(ob_batch, ob_expert) d_adam.update(allmean(g), d_stepsize) d_losses.append(newlosses) logger.log(fmt_row(13, np.mean(d_losses, axis=0))) # ------------------ Update Guidance ------------ logger.log("Optimizing Guidance...") logger.log(fmt_row(13, reward_guidance.loss_name)) batch_size = 128 guidance_losses = [ ] # list of tuples, each of which gives the loss for a minibatch with timed("Guidance"): for ob_batch, ac_batch in dataset.iterbatches( (ob, ac), include_final_partial_batch=False, batch_size=batch_size): ob_expert, ac_expert = expert_dataset.get_next_batch( batch_size=batch_size) idx_condition = process_expert(ob_expert, ac_expert) pick_idx = (idx_condition >= loss_percent) # pick_idx = idx_condition ob_expert_p = ob_expert[pick_idx] ac_expert_p = ac_expert[pick_idx] ac_batch_p = [] for each_ob in ob_expert_p: tmp_ac, _, _, _ = pi.step(each_ob, stochastic=True) ac_batch_p.append(tmp_ac) # update running mean/std for reward_giver if hasattr(reward_guidance, "obs_rms"): reward_guidance.obs_rms.update(ob_expert_p) # reward_guidance.train(expert_s=ob_batch_p, agent_a=ac_batch_p, expert_a=ac_expert_p) *newlosses, g = reward_guidance.lossandgrad( ob_expert_p, ac_batch_p, ac_expert_p) guidance_adam.update(allmean(g), d_stepsize) guidance_losses.append(newlosses) logger.log(fmt_row(13, np.mean(guidance_losses, axis=0))) lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"] ) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs)) true_rewbuffer.extend(true_rets) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) * g_step iters_so_far += 1 
logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular()
def train(rank, global_policy, local_policy, optimizer, env, global_t, args): o = env.reset() step = 0 sum_rewards = 0 max_sum_rewards = 0 vs = [] entropies = [] sum_rewards = 0 while global_t[0] < args.epoch: local_policy.sync(global_policy) observations = [] actions = [] values = [] rewards = [] probs = [] R = 0 for i in range(args.local_t_max): global_t += 1 step += 1 p, v = local_policy( Variable(torch.from_numpy(o).float()).unsqueeze(0)) a = p.multinomial() o, r, done, _ = env.step(a.data.squeeze()[0]) if rank == 0: sum_rewards += r if args.render: env.render() observations.append(o) actions.append(a) values.append(v) rewards.append(r) probs.append(p) if done: o = env.reset() if rank == 0: print('----------------------------------') print('total reward of the episode:', sum_rewards) print('----------------------------------') if args.save_mode == 'all': torch.save( local_policy, os.path.join( args.log_dir, args.save_name + "_{}.pkl".format(global_t[0]))) elif args.save_mode == 'last': torch.save( local_policy, os.path.join(args.log_dir, args.save_name + '.pkl')) elif args.save_mode == 'max': if max_sum_rewards < sum_rewards: torch.save( local_policy, os.path.join(args.log_dir, args.save_name + '.pkl')) max_sum_rewards = sum_rewards step = 0 break else: _, v = local_policy( Variable(torch.from_numpy(o).unsqueeze(0).float())) R += v.data.squeeze()[0] returns = [] for r in rewards[::-1]: R = r + 0.99 * R returns.insert(0, R) returns = torch.Tensor(returns) #if len(returns) > 1: # returns = (returns-returns.mean()) / (returns.std()+args.eps) v_loss = 0 entropy = 0 for a, v, p, r in zip(actions, values, probs, returns): a.reinforce(r - v.data.squeeze()) _v_loss = nn.MSELoss()(v, Variable(torch.Tensor([r]))) v_loss += _v_loss entropy += -(p * (p + args.eps).log()).sum() v_loss = v_loss * 0.5 * args.v_loss_coeff entropy = entropy * args.entropy_beta loss = v_loss - entropy vs.append(v_loss.data.numpy()) entropies.append(entropy.data.numpy()) if rank == 0 and done: logger.record_tabular_misc_stat('Entropy', entropies) logger.record_tabular_misc_stat('V', vs) logger.record_tabular('reward', sum_rewards) logger.record_tabular('step', global_t[0]) logger.dump_tabular() del vs[:] del entropies[:] sum_rewards = 0 optimizer.zero_grad() final_node = [loss] + actions gradients = [torch.ones(1)] + [None] * len(actions) autograd.backward(final_node, gradients) new_lr = (args.epoch - global_t[0]) / args.epoch * args.lr optimizer.step(new_lr)
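# a.reinforce(...) and the stochastic-node form of autograd.backward used above come
# from the pre-0.4 PyTorch API. In current PyTorch the same actor-critic loss is usually
# written explicitly with log-probabilities; a sketch under that assumption:
import torch
import torch.nn.functional as F

def a3c_loss(logits, values, returns, v_loss_coeff=0.5, entropy_beta=0.01):
    """logits: [T, n_actions]; values, returns: [T] (returns are n-step targets)."""
    dist = torch.distributions.Categorical(logits=logits)
    actions = dist.sample()
    advantages = returns - values.detach()                    # no gradient through the baseline here
    policy_loss = -(dist.log_prob(actions) * advantages).mean()
    value_loss = F.mse_loss(values, returns)
    entropy = dist.entropy().mean()
    return policy_loss + v_loss_coeff * value_loss - entropy_beta * entropy

print(a3c_loss(torch.randn(5, 2), torch.randn(5), torch.randn(5)).item())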
weights, batch_indxes = np.ones_like(rewards), None obses_t, obses_tp1 = tf.constant(obses_t), tf.constant(obses_tp1) actions, rewards, dones = tf.constant( actions, dtype=tf.int64), tf.constant(rewards), tf.constant(dones) weights = tf.constant(weights) td_errors = agent.train(obses_t, actions, rewards, obses_tp1, dones, weights) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. agent.update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) # TODO: log once per episode if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() plt.figure() plt.plot(range(len(duration)), duration) plt.figure() plt.plot(range(len(episode_rewards)), episode_rewards) plt.show()
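# "exploration.value(t)" above is assumed to be a schedule object that maps the step
# counter to the current exploration rate, in the spirit of the linear schedules used by
# DQN implementations. A minimal stand-in (not the project's actual class):
class LinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

exploration = LinearSchedule(schedule_timesteps=10000, final_p=0.02)
print([int(100 * exploration.value(t)) for t in (0, 5000, 20000)])   # [100, 51, 2]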
def learn( env, policy_func, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -U.mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() U.load_state("save/Humanoid-v1") # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], 
seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] #if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() U.save_state("save/Humanoid-v1")
def retraining( save_path, network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=4, #50 nb_rollout_steps=3, #100 reward_scale=1.0, render=False, render_eval=False, # noise_type='adaptive-param_0.2', noise_type='normal_0.2', # noise_type='ou_0.9', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-4, # actor_lr=1e-6, # critic_lr=1e-5, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=3, # per epoch cycle and MPI worker, 50 nb_eval_steps=1, #100 batch_size=640, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=3, #50 **network_kwargs): if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 rank = MPI.COMM_WORLD.Get_rank() # nb_actions = env.action_space.shape[-1] nb_actions = env.num_actions # nb_actions=3 # print(nb_actions) action_shape = np.array(nb_actions * [0]).shape #4 pairs pos + 3 link length # nb_features = 2*(env.num_actions+1)+env.num_actions #4 pairs pos + 1 pair target pos nb_features = 2 * (env.num_actions + 2) observation_shape = np.array(nb_features * [0]).shape # assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) memory = Memory(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None # nb_actions = env.action_space.shape[-1] if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, agent = DDPG(actor, critic, memory, observation_shape, action_shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() # Prepare everything. 
agent.initialize(sess) # sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] episode_reward = np.zeros(nenvs, dtype=np.float32) #vector episode_step = np.zeros(nenvs, dtype=int) # vector episodes = 0 #scalar t = 0 # scalar step_set = [] reward_set = [] epoch = 0 start_time = time.time() epoch_episode_rewards = [] mean_epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 #load the initialization policy agent.load_ini(sess, save_path) # agent.memory.clear(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape) for epoch in range(nb_epochs): print(nb_epochs) # obs, env_state = env.reset() obs = env.reset() agent.save(save_path) epoch_episode_rewards = [] '''check if the actor initialization policy has been loaded correctly, i.e. equal to directly ouput values in checkpoint files ''' # loaded_weights=tf.get_default_graph().get_tensor_by_name('target_actor/mlp_fc0/w:0') # print('loaded_weights:', sess.run(loaded_weights)) for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True) print('action:', action) new_obs, r, done = env.step(action) # time.sleep(0.2) t += 1 episode_reward += r episode_step += 1 # print('episode_re: ', episode_reward) #[1.] # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) b = 1. agent.store_transition( obs, action, r, new_obs, done ) #the batched data will be unrolled in memory.py's append. obs = new_obs epoch_episode_rewards.append(episode_reward) episode_reward = np.zeros(nenvs, dtype=np.float32) #vector # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) # print('Train!') cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) eval_obs, eval_r, eval_done, eval_info = eval_env.step( eval_action) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append( eval_episode_reward[d]) eval_episode_reward[d] = 0.0 mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. 
# XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards)) # print(step_set,mean_epoch_episode_rewards) step_set.append(t) plt.plot(step_set, mean_epoch_episode_rewards, color='r', label='Initialization') plt.xlabel('Steps') plt.ylabel('Mean Episode Reward') plt.savefig('ddpg_mean_retrain.png') # plt.show() # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array( [np.array(x).flatten()[0] for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) print('stepset: ', step_set) print('rewards: ', mean_epoch_episode_rewards) return agent
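# retraining() above parses a comma-separated noise_type string such as 'normal_0.2' or
# 'adaptive-param_0.2,ou_0.9' into noise objects. A stripped-down parser with a Gaussian
# action-noise stand-in (the 'ou' branch and AdaptiveParamNoiseSpec are omitted here;
# the names are illustrative, not the library's classes):
import numpy as np

class NormalActionNoise:
    def __init__(self, mu, sigma):
        self.mu, self.sigma = mu, sigma
    def __call__(self):
        return np.random.normal(self.mu, self.sigma)

def parse_noise_types(noise_type, nb_actions):
    action_noise, param_noise_stddev = None, None
    for current in noise_type.split(','):
        current = current.strip()
        if current == 'none':
            continue
        kind, stddev = current.rsplit('_', 1)
        if kind == 'adaptive-param':
            param_noise_stddev = float(stddev)
        elif kind == 'normal':
            action_noise = NormalActionNoise(np.zeros(nb_actions),
                                             float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current))
    return action_noise, param_noise_stddev

noise, _ = parse_noise_types('normal_0.2', nb_actions=3)
print(noise())   # one sample of exploration noise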
def train(env, nb_epochs, nb_epoch_cycles, normalize_observations, actor_lr, critic_lr, action_noise, gamma, nb_train_steps, nb_rollout_steps, batch_size, memory, tau=0.01): max_action = env.action_space.high agent = DDPG( memory, env.observation_space.shape[0], env.action_space.shape[0], gamma=gamma, tau=tau, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, actor_lr=actor_lr, critic_lr=critic_lr, ) if USE_CUDA: agent.cuda() # Set up logging stuff only for a single worker. step = 0 episode = 0 episode_rewards_history = deque(maxlen=100) # Prepare everything. agent.reset() obs = env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi( obs, apply_noise=True, compute_Q=True) # select the action from the policy assert action.shape == env.action_space.shape # Execute next action. assert max_action.shape == action.shape new_obs, r, done, info = env.step(max_action * action) # environment step t += 1 episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] for t_train in range(nb_train_steps): cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time combined_stats = dict() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('')
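# Both DDPG loops above call env.step(max_action * action), which assumes the policy
# outputs actions in [-1, 1] and that the action space is symmetric (low == -high). For
# an asymmetric Box space the usual affine rescaling is (illustrative helper):
import numpy as np

def rescale_action(action, low, high):
    """Map an action in [-1, 1] elementwise into [low, high]."""
    action = np.clip(action, -1.0, 1.0)
    return low + 0.5 * (action + 1.0) * (high - low)

print(rescale_action(np.array([-1.0, 0.0, 1.0]),
                     low=np.array([0.0, 0.0, 0.0]),
                     high=np.array([2.0, 4.0, 6.0])))   # [0. 2. 6.]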
def eval(data, model, meta_optimizer): model.eval() criterion = nn.NLLLoss().cuda() num_sents = 0 num_words = 0 total_nll_autoreg = 0. total_nll_vae = 0. total_kl_vae = 0. total_nll_svi = 0. total_kl_svi = 0. best_svi_loss = 0. for i in range(len(data)): sents, length, batch_size = data[i] num_words += batch_size * length num_sents += batch_size if args.gpu >= 0: sents = sents.cuda() if args.model == 'autoreg': preds = model._dec_forward(sents, None, True) nll_autoreg = sum([ criterion(preds[:, l], sents[:, l + 1]) for l in range(length) ]) total_nll_autoreg += nll_autoreg.data[0] * batch_size elif args.model == 'svi': mean_svi = Variable( 0.1 * torch.randn(batch_size, args.latent_dim).cuda(), requires_grad=True) logvar_svi = Variable( 0.1 * torch.randn(batch_size, args.latent_dim).cuda(), requires_grad=True) var_params_svi = meta_optimizer.forward([mean_svi, logvar_svi], sents) mean_svi_final, logvar_svi_final = var_params_svi z_samples = model._reparameterize(mean_svi_final.detach(), logvar_svi_final.detach()) preds = model._dec_forward(sents, z_samples) nll_svi = sum([ criterion(preds[:, l], sents[:, l + 1]) for l in range(length) ]) total_nll_svi += nll_svi.data[0] * batch_size kl_svi = utils.kl_loss_diag(mean_svi_final, logvar_svi_final) total_kl_svi += kl_svi.data[0] * batch_size mean, logvar = mean_svi_final, logvar_svi_final else: mean, logvar = model._enc_forward(sents) z_samples = model._reparameterize(mean, logvar) preds = model._dec_forward(sents, z_samples) nll_vae = sum([ criterion(preds[:, l], sents[:, l + 1]) for l in range(length) ]) total_nll_vae += nll_vae.data[0] * batch_size kl_vae = utils.kl_loss_diag(mean, logvar) total_kl_vae += kl_vae.data[0] * batch_size if args.model == 'savae': mean_svi = Variable(mean.data, requires_grad=True) logvar_svi = Variable(logvar.data, requires_grad=True) var_params_svi = meta_optimizer.forward([mean_svi, logvar_svi], sents) mean_svi_final, logvar_svi_final = var_params_svi z_samples = model._reparameterize(mean_svi_final, logvar_svi_final) preds = model._dec_forward(sents, z_samples) nll_svi = sum([ criterion(preds[:, l], sents[:, l + 1]) for l in range(length) ]) total_nll_svi += nll_svi.data[0] * batch_size kl_svi = utils.kl_loss_diag(mean_svi_final, logvar_svi_final) total_kl_svi += kl_svi.data[0] * batch_size mean, logvar = mean_svi_final, logvar_svi_final nll_autoreg = total_nll_autoreg / num_sents ppl_autoreg = np.exp(total_nll_autoreg / num_words) nll_vae = (total_nll_vae + total_kl_vae) / num_sents rec_vae = total_nll_vae / num_sents kl_vae = total_kl_vae / num_sents ppl_bound_vae = np.exp((total_nll_vae + total_kl_vae) / num_words) nll_svi = (total_nll_svi + total_kl_svi) / num_sents rec_svi = total_nll_svi / num_sents kl_svi = total_kl_svi / num_sents ppl_bound_svi = np.exp((total_nll_svi + total_kl_svi) / num_words) logger.record_tabular('AR NLL', nll_autoreg) logger.record_tabular('AR PPL', ppl_autoreg) logger.record_tabular('VAE NLL', nll_vae) logger.record_tabular('VAE REC', rec_vae) logger.record_tabular('VAE KL', kl_vae) logger.record_tabular('VAE PPL', ppl_bound_vae) logger.record_tabular('SVI NLL', nll_svi) logger.record_tabular('SVI REC', rec_svi) logger.record_tabular('SVI KL', kl_svi) logger.record_tabular('SVI PPL', ppl_bound_svi) logger.dump_tabular() logger.info( 'AR NLL: %.4f, AR PPL: %.4f, VAE NLL: %.4f, VAE REC: %.4f, VAE KL: %.4f, VAE PPL: %.4f, SVI NLL: %.4f, SVI REC: %.4f, SVI KL: %.4f, SVI PPL: %.4f' % (nll_autoreg, ppl_autoreg, nll_vae, rec_vae, kl_vae, ppl_bound_vae, nll_svi, rec_svi, kl_svi, 
ppl_bound_svi)) model.train() if args.model == 'autoreg': return ppl_autoreg elif args.model == 'vae': return ppl_bound_vae elif args.model == 'savae' or args.model == 'svi': return ppl_bound_svi
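# eval() above turns the accumulated reconstruction and KL terms into a per-sentence
# negative ELBO and a per-word perplexity bound. The arithmetic in isolation
# (illustrative numbers, not results from the model):
import numpy as np

def elbo_and_ppl_bound(total_nll, total_kl, num_sents, num_words):
    """NLL upper bound per sentence and the corresponding perplexity bound per word."""
    nll_bound = (total_nll + total_kl) / num_sents           # -ELBO averaged over sentences
    ppl_bound = np.exp((total_nll + total_kl) / num_words)
    return nll_bound, ppl_bound

print(elbo_and_ppl_bound(total_nll=52000.0, total_kl=3000.0, num_sents=1000, num_words=12000))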
def learn( env, model_path, data_path, policy_fn, *, horizon=150, # timesteps per actor per update rolloutSize=50, clip_param=0.2, entcoeff=0.02, # clipping parameter epsilon, entropy coeff optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=32, # optimization hypers gamma=0.99, lam=0.95, # advantage estimation max_iters=0, # time constraint adam_epsilon=1e-4, schedule='constant', # annealing for stepsize parameters (epsilon and adam) retrain=False): # Setup losses and policy ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Prepare for rollouts episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=5) # rolling buffer for episode lengths rewbuffer = deque(maxlen=5) # rolling buffer for episode rewards p = [] # for saving the rollouts if retrain == True: print("Retraining the policy from saved path") time.sleep(2) U.load_state(model_path) max_timesteps = int(horizon * rolloutSize * max_iters) while True: if max_iters and iters_so_far >= max_iters: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) print("Collecting samples for policy optimization !! 
") if iters_so_far > 70: render = True else: render = False rollouts = sample_trajectory(pi, env, horizon=horizon, rolloutSize=rolloutSize, stochastic=True, render=render) # Save rollouts data = {'rollouts': rollouts} p.append(data) del data data_file_name = data_path + 'rollout_data.pkl' pickle.dump(p, open(data_file_name, "wb")) add_vtarg_and_adv(rollouts, gamma, lam) ob, ac, atarg, tdlamret = rollouts["ob"], rollouts["ac"], rollouts[ "adv"], rollouts["tdlamret"] atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), deterministic=pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) lrlocal = (rollouts["ep_lens"], rollouts["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("Success", rollouts["success"]) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() return pi
def main(args):
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    train_data = Dataset(args.train_file)
    val_data = Dataset(args.val_file)
    test_data = Dataset(args.test_file)
    train_sents = train_data.batch_size.sum()
    vocab_size = int(train_data.vocab_size)
    logger.info('Train data: %d batches' % len(train_data))
    logger.info('Val data: %d batches' % len(val_data))
    logger.info('Test data: %d batches' % len(test_data))
    logger.info('Word vocab size: %d' % vocab_size)

    checkpoint_dir = args.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    suffix = "%s_%s.pt" % (args.model, 'cyc')
    checkpoint_path = os.path.join(checkpoint_dir, suffix)

    if args.slurm == 0:
        cuda.set_device(args.gpu)

    if args.train_from == '':
        model = RNNVAE(vocab_size=vocab_size,
                       enc_word_dim=args.enc_word_dim,
                       enc_h_dim=args.enc_h_dim,
                       enc_num_layers=args.enc_num_layers,
                       dec_word_dim=args.dec_word_dim,
                       dec_h_dim=args.dec_h_dim,
                       dec_num_layers=args.dec_num_layers,
                       dec_dropout=args.dec_dropout,
                       latent_dim=args.latent_dim,
                       mode=args.model)
        for param in model.parameters():
            param.data.uniform_(-0.1, 0.1)
    else:
        logger.info('loading model from ' + args.train_from)
        checkpoint = torch.load(args.train_from)
        model = checkpoint['model']

    logger.info("model architecture")
    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # KL weight: fixed at 1 if warm-up is disabled, otherwise start low and anneal
    if args.warmup == 0:
        args.beta = 1.
    else:
        args.beta = 0.1

    criterion = nn.NLLLoss()
    model.cuda()
    criterion.cuda()
    model.train()

    def variational_loss(input, sents, model, z=None):
        mean, logvar = input
        z_samples = model._reparameterize(mean, logvar, z)
        preds = model._dec_forward(sents, z_samples)
        nll = sum([criterion(preds[:, l], sents[:, l + 1])
                   for l in range(preds.size(1))])
        kl = utils.kl_loss_diag(mean, logvar)
        return nll + args.beta * kl

    update_params = list(model.dec.parameters())
    meta_optimizer = OptimN2N(variational_loss,
                              model,
                              update_params,
                              eps=args.eps,
                              lr=[args.svi_lr1, args.svi_lr2],
                              iters=args.svi_steps,
                              momentum=args.momentum,
                              acc_param_grads=args.train_n2n == 1,
                              max_grad_norm=args.svi_max_grad_norm)

    if args.test == 1:
        args.beta = 1
        test_data = Dataset(args.test_file)
        eval(test_data, model, meta_optimizer)
        exit()

    t = 0
    best_val_nll = 1e5
    best_epoch = 0
    val_stats = []
    epoch = 0
    while epoch < args.num_epochs:
        start_time = time.time()
        epoch += 1
        logger.info('Starting epoch %d' % epoch)
        train_nll_vae = 0.
        train_nll_autoreg = 0.
        train_kl_vae = 0.
        train_nll_svi = 0.
        train_kl_svi = 0.
        train_kl_init_final = 0.
        num_sents = 0
        num_words = 0
        b = 0

        # Cyclical cosine learning-rate schedule over args.cycle epochs
        tmp = float((epoch - 1) % args.cycle) / args.cycle
        cur_lr = args.lr * 0.5 * (1 + np.cos(tmp * np.pi))
        for param_group in optimizer.param_groups:
            param_group['lr'] = cur_lr
        # Cyclical KL annealing: reset beta at the start of each cycle
        if (epoch - 1) % args.cycle == 0:
            args.beta = 0.1
            logger.info('KL annealing restart')

        for i in np.random.permutation(len(train_data)):
            if args.warmup > 0:
                args.beta = min(1, args.beta + 1. / (args.warmup * len(train_data)))

            sents, length, batch_size = train_data[i]
            if args.gpu >= 0:
                sents = sents.cuda()
            b += 1
            optimizer.zero_grad()

            if args.model == 'autoreg':
                preds = model._dec_forward(sents, None, True)
                nll_autoreg = sum([criterion(preds[:, l], sents[:, l + 1])
                                   for l in range(length)])
                train_nll_autoreg += nll_autoreg.data[0] * batch_size
                nll_autoreg.backward()
            elif args.model == 'svi':
                mean_svi = Variable(0.1 * torch.zeros(batch_size, args.latent_dim).cuda(),
                                    requires_grad=True)
                logvar_svi = Variable(0.1 * torch.zeros(batch_size, args.latent_dim).cuda(),
                                      requires_grad=True)
                var_params_svi = meta_optimizer.forward([mean_svi, logvar_svi], sents,
                                                        b % args.print_every == 0)
                mean_svi_final, logvar_svi_final = var_params_svi
                z_samples = model._reparameterize(mean_svi_final.detach(),
                                                  logvar_svi_final.detach())
                preds = model._dec_forward(sents, z_samples)
                nll_svi = sum([criterion(preds[:, l], sents[:, l + 1])
                               for l in range(length)])
                train_nll_svi += nll_svi.data[0] * batch_size
                kl_svi = utils.kl_loss_diag(mean_svi_final, logvar_svi_final)
                train_kl_svi += kl_svi.data[0] * batch_size
                var_loss = nll_svi + args.beta * kl_svi
                var_loss.backward(retain_graph=True)
            else:
                # 'vae' and 'savae' share the amortized encoder forward pass
                mean, logvar = model._enc_forward(sents)
                z_samples = model._reparameterize(mean, logvar)
                preds = model._dec_forward(sents, z_samples)
                nll_vae = sum([criterion(preds[:, l], sents[:, l + 1])
                               for l in range(length)])
                train_nll_vae += nll_vae.data[0] * batch_size
                kl_vae = utils.kl_loss_diag(mean, logvar)
                train_kl_vae += kl_vae.data[0] * batch_size

                if args.model == 'vae':
                    vae_loss = nll_vae + args.beta * kl_vae
                    vae_loss.backward(retain_graph=True)

                if args.model == 'savae':
                    var_params = torch.cat([mean, logvar], 1)
                    mean_svi = Variable(mean.data, requires_grad=True)
                    logvar_svi = Variable(logvar.data, requires_grad=True)
                    var_params_svi = meta_optimizer.forward([mean_svi, logvar_svi], sents,
                                                            b % args.print_every == 0)
                    mean_svi_final, logvar_svi_final = var_params_svi
                    z_samples = model._reparameterize(mean_svi_final, logvar_svi_final)
                    preds = model._dec_forward(sents, z_samples)
                    nll_svi = sum([criterion(preds[:, l], sents[:, l + 1])
                                   for l in range(length)])
                    train_nll_svi += nll_svi.data[0] * batch_size
                    kl_svi = utils.kl_loss_diag(mean_svi_final, logvar_svi_final)
                    train_kl_svi += kl_svi.data[0] * batch_size
                    var_loss = nll_svi + args.beta * kl_svi
                    var_loss.backward(retain_graph=True)

                    if args.train_n2n == 0:
                        if args.train_kl == 1:
                            mean_final = mean_svi_final.detach()
                            logvar_final = logvar_svi_final.detach()
                            kl_init_final = utils.kl_loss(mean, logvar,
                                                          mean_final, logvar_final)
                            train_kl_init_final += kl_init_final.data[0] * batch_size
                            kl_init_final.backward(retain_graph=True)
                        else:
                            vae_loss = nll_vae + args.beta * kl_vae
                            var_param_grads = torch.autograd.grad(vae_loss, [mean, logvar],
                                                                  retain_graph=True)
                            var_param_grads = torch.cat(var_param_grads, 1)
                            var_params.backward(var_param_grads, retain_graph=True)
                    else:
                        var_param_grads = meta_optimizer.backward(
                            [mean_svi_final.grad, logvar_svi_final.grad],
                            b % args.print_every == 0)
                        var_param_grads = torch.cat(var_param_grads, 1)
                        var_params.backward(var_param_grads)

            if args.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm(model.parameters(), args.max_grad_norm)
            optimizer.step()
            num_sents += batch_size
            num_words += batch_size * length

            if b % args.print_every == 0:
                param_norm = sum([p.norm()**2 for p in model.parameters()]).data[0]**0.5
                logger.info(
                    'Iters: %d, Epoch: %d, Batch: %d/%d, LR: %.4f, '
                    'TrainARNLL: %.4f, TrainARPPL: %.2f, '
                    'TrainVAE_NLL: %.4f, TrainVAE_REC: %.4f, TrainVAE_KL: %.4f, TrainVAE_PPL: %.2f, '
                    'TrainSVI_NLL: %.2f, TrainSVI_REC: %.2f, TrainSVI_KL: %.4f, TrainSVI_PPL: %.2f, '
                    'KLInitFinal: %.2f, |Param|: %.4f, BestValPerf: %.2f, BestEpoch: %d, '
                    'Beta: %.4f, Throughput: %.2f examples/sec' %
                    (t, epoch, b + 1, len(train_data), cur_lr,
                     train_nll_autoreg / num_sents,
                     np.exp(train_nll_autoreg / num_words),
                     (train_nll_vae + train_kl_vae) / num_sents,
                     train_nll_vae / num_sents,
                     train_kl_vae / num_sents,
                     np.exp((train_nll_vae + train_kl_vae) / num_words),
                     (train_nll_svi + train_kl_svi) / num_sents,
                     train_nll_svi / num_sents,
                     train_kl_svi / num_sents,
                     np.exp((train_nll_svi + train_kl_svi) / num_words),
                     train_kl_init_final / num_sents,
                     param_norm, best_val_nll, best_epoch, args.beta,
                     num_sents / (time.time() - start_time)))

        epoch_train_time = time.time() - start_time
        logger.info('Time Elapsed: %.1fs' % epoch_train_time)

        logger.info('--------------------------------')
        logger.info('Checking validation perf...')
        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('Mode', 'Val')
        logger.record_tabular('LR', cur_lr)
        logger.record_tabular('Epoch Train Time', epoch_train_time)
        val_nll = eval(val_data, model, meta_optimizer)
        val_stats.append(val_nll)

        logger.info('--------------------------------')
        logger.info('Checking test perf...')
        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('Mode', 'Test')
        logger.record_tabular('LR', cur_lr)
        logger.record_tabular('Epoch Train Time', epoch_train_time)
        test_nll = eval(test_data, model, meta_optimizer)

        # Checkpoint on validation improvement; otherwise flag LR decay
        if val_nll < best_val_nll:
            best_val_nll = val_nll
            best_epoch = epoch
            model.cpu()
            checkpoint = {
                'args': args.__dict__,
                'model': model,
                'val_stats': val_stats
            }
            logger.info('Save checkpoint to %s' % checkpoint_path)
            torch.save(checkpoint, checkpoint_path)
            model.cuda()
        else:
            if epoch >= args.min_epochs:
                args.decay = 1