Example #1
    def valid(self, name, sess, valid_feed):
        elbo_losses = []
        rc_losses = []
        rc_ppls = []
        bow_losses = []
        kl_losses = []

        while True:
            batch = valid_feed.next_batch()
            if batch is None:
                break
            feed_dict = self.batch_2_feed(batch, None, use_prior=False, repeat=1)

            elbo_loss, bow_loss, rc_loss, rc_ppl, kl_loss = sess.run(
                [self.elbo, self.avg_bow_loss, self.avg_rc_loss,
                 self.rc_ppl, self.avg_kld], feed_dict)
            elbo_losses.append(elbo_loss)
            rc_losses.append(rc_loss)
            rc_ppls.append(rc_ppl)
            bow_losses.append(bow_loss)
            kl_losses.append(kl_loss)

        avg_losses = self.print_loss(name, ["elbo_loss", "bow_loss", "rc_loss", "rc_perplexity", "kl_loss"],
                                     [elbo_losses, bow_losses, rc_losses, rc_ppls, kl_losses], "")
        logger.record_tabular("elbo_loss", avg_losses[0])
        logger.record_tabular("bow_loss", avg_losses[1])
        logger.record_tabular("rc_loss", avg_losses[2])
        logger.record_tabular("rc_perplexity", avg_losses[3])
        logger.record_tabular("kl_loss", avg_losses[4])
        logger.dump_tabular()

        return avg_losses[0]
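The examples on this page all follow the same two-step logging pattern: stage values with logger.record_tabular(key, value), then flush them as a single row with logger.dump_tabular(). A minimal, self-contained sketch of that pattern, assuming an OpenAI-baselines-style logger (the metric values here are placeholders):

import numpy as np
from baselines import logger  # assumption: a baselines-style logger module

logger.configure('logs/demo')                      # pick an output directory
for epoch in range(3):
    fake_losses = np.random.rand(10)               # placeholder metrics
    logger.record_tabular('epoch', epoch)
    logger.record_tabular('loss_mean', float(np.mean(fake_losses)))
    logger.record_tabular('loss_std', float(np.std(fake_losses)))
    logger.dump_tabular()                          # write one tabular row per epoch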
Example #2
    def train(self):
        self.sess.run(tf.global_variables_initializer())
        self.start_worker()
        start_time = time.time()
        total_samples = 0
        for itr in range(0, self.n_itr):
            itr_start_time = time.time()
            logger.info('\n itr #%d' % itr)
            logger.info("Obtaining samples...")
            paths, n_samples = self.obtain_samples(itr)
            total_samples += n_samples

            logger.info("Processing samples...")
            samples_data = self.process_samples(itr, paths)

            logger.info("Optimizing policy...")
            self.optimize_policy(itr, samples_data)

            logger.info("Update stats...")
            self.update_stats(paths)

            logger.info("Fitting baseline...")
            self.fit_baseline(paths)

            logger.record_tabular('Time', time.time() - start_time)
            logger.record_tabular('ItrTime', time.time() - itr_start_time)
            logger.dump_tabular()

        self.shutdown_worker()
Example #3
def log_tabular_results(returns, itr, train_collection):
    logger.clear_tabular()
    logger.record_tabular('Iteration', itr)
    logger.record_tabular('episode_mean', np.mean(returns))
    logger.record_tabular('episode_min', np.min(returns))
    logger.record_tabular('episode_max', np.max(returns))
    logger.record_tabular('TotalSamples', train_collection.get_total_samples())

    logger.dump_tabular()
Example #4
def log_tabular_results(returns, itr, train_collection):
    logger.clear_tabular()
    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageReturn', np.mean(returns))
    logger.record_tabular('MinimumReturn', np.min(returns))
    logger.record_tabular('MaximumReturn', np.max(returns))
    logger.record_tabular('TotalSamples', train_collection.get_total_samples())

    logger.dump_tabular()
Example #5
def validate(val_loader, model, criterion, epoch):
    batch_time = 0  #AverageMeter()
    data_time = 0  #AverageMeter()
    losses = 0  #AverageMeter()
    all_accs = 0  #AverageMeter()
    cls_accs = 0  #AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    for i, (input, target) in enumerate(val_loader):
        target = target.cuda()  #(async=True)
        #target = target.cuda(async=True)
        input_var = torch.autograd.Variable(input, volatile=True)
        target_var = torch.autograd.Variable(target, volatile=True)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        all_acc, cls_acc = pascal_accuracy(output.data, target)
        # prec1, prec5 = pascal_accuracy(output.data, target, topk=(1, 5))
        losses += loss
        all_accs += all_acc
        cls_accs += cls_acc

        # measure elapsed time
        batch_time += time.time() - end  # plain accumulator (AverageMeter was replaced above)
        end = time.time()
        if i % args.print_freq == 0:
            abs_batch_time = batch_time / (i + 1)
            abs_data_time = data_time / (i + 1)
            abs_losses = losses.item() / (i + 1)
            abs_all_accs = all_accs.item() / (i + 1)
            logger.log(
                'Epoch: [{}][{}/{}]\t Time {}\t Data {}\t Loss {}\t All acs {} '
                .format(epoch, i, len(val_loader), abs_batch_time,
                        abs_data_time, abs_losses, abs_all_accs))

            logger.log((cls_accs / (i + 1)))

            logger.record_tabular('val/loss', loss.item())
            logger.record_tabular('val/accum_loss', abs_losses)
            logger.record_tabular('val/accum_all_acces', abs_all_accs)
            for j in range(cls_accs.shape[0]):  # use j so the outer batch index i is not shadowed
                logger.record_tabular('val/accum_cls_accs_{}'.format(j),
                                      cls_accs[j].item() / (i + 1))
                logger.record_tabular('val/cls_accs_{}'.format(j),
                                      cls_acc[j].item())

            logger.dump_tabular()

    return all_accs.item() / (i + 1)
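The #AverageMeter() comments in Example #5 suggest the original code tracked running averages with a small helper class that was later replaced by plain counters. For reference, a minimal sketch of such a helper, a common PyTorch-tutorial idiom; this class is an assumption and not part of the example above:

class AverageMeter:
    """Running sum and average of a scalar metric (assumed helper, sketch only)."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, value, n=1):
        self.sum += value * n
        self.count += n
        self.avg = self.sum / self.count

# usage sketch: batch_time = AverageMeter(); batch_time.update(time.time() - end); batch_time.avg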
Example #6
def main():
    logger.configure('logs/simulate')
    global T, n_bills, n_taxis, occupied
    results = []
    for n_lanes in range(2, 10):
        bills, n_taxis_left, n_passengers_left = [], [], []
        for seed in range(N_RUNS):
            np.random.seed(seed)
            occupied = [False for _ in range(n_lanes + 1)]
            T, n_bills, n_taxis, sta = 0, 0, 0, 0
            lanes = [
                Lane(i, n_lanes + 1, lam=0.1 / n_lanes) for i in range(n_lanes)
            ]
            enter = np.random.poisson(0.1, size=10000)
            while T < 10000:
                if sta == 0:
                    if n_taxis < M:
                        n_taxis += enter[T]
                    else:
                        sta = 1
                elif n_taxis < N:
                    sta = 0
                for lane in lanes:
                    lane.step()
                T += 1
            bills.append(n_bills)
            n_taxis_left.append(n_taxis)
            n_passengers_left.append(
                np.sum([lane.n_passengers for lane in lanes]))

        results.append(bills)

        logger.record_tabular('lanes', n_lanes)
        logger.record_tabular('bills mean', np.mean(bills))
        logger.record_tabular('bills std', np.std(bills))
        logger.record_tabular('taxis mean', np.mean(n_taxis_left))
        logger.record_tabular('passengers mean', np.mean(n_passengers_left))
        logger.dump_tabular()

    df = pd.DataFrame(np.reshape(results, -1)).rename(columns={0: '# bills'})
    df.insert(0, '# lanes', [i for i in range(2, 10) for _ in range(N_RUNS)],
              True)
    sns.boxplot(x='# lanes',
                y='# bills',
                data=df,
                showmeans=True,
                meanline=True)
    plt.grid(linestyle='--')
    plt.savefig('logs/simulate/boxplot.jpg')
    plt.show()
Example #7
def main(policy_file, seed, n_test_rollouts, render):
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name]
                      )  # merge env-specific parameters in
    params['env_name'] = env_name
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'compute_Q': True,
        'rollout_batch_size': 1,
        'render': bool(render),
    }

    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()
    for _ in range(n_test_rollouts):
        evaluator.generate_rollouts()

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()
Example #8
def evaluate(env,
             bc_agent_wrapper,
             num_trajs,
             render,
             exact_model_path=None,
             model_ckpt_dir=None):
    """Evaluate a trained SAM agent"""

    # Only one of the two arguments can be provided
    assert sum([exact_model_path is None, model_ckpt_dir is None]) == 1

    # Rebuild the computational graph
    pol = bc_agent_wrapper('pol')
    # Create episode generator
    traj_gen = traj_ep_generator(env, pol, render)
    # Initialize and load the previously learned weights into the freshly re-built graph
    U.initialize()
    if exact_model_path is not None:
        U.load_model(exact_model_path)
        logger.info(
            "model loaded from exact path:\n  {}".format(exact_model_path))
    else:  # `exact_model_path` is None -> `model_ckpt_dir` is not None
        U.load_latest_checkpoint(model_ckpt_dir)
        logger.info("model loaded from ckpt dir:\n  {}".format(model_ckpt_dir))
    # Initialize the history data structures
    ep_lens = []
    ep_env_rets = []
    # Collect trajectories
    for i in range(num_trajs):
        logger.info("evaluating [{}/{}]".format(i + 1, num_trajs))
        traj = traj_gen.__next__()
        ep_len, ep_env_ret = traj['ep_len'], traj['ep_env_ret']
        # Aggregate to the history data structures
        ep_lens.append(ep_len)
        ep_env_rets.append(ep_env_ret)
    # Log some statistics of the collected trajectories
    ep_len_mean = np.mean(ep_lens)
    ep_env_ret_mean = np.mean(ep_env_rets)
    logger.record_tabular("ep_len_mean", ep_len_mean)
    logger.record_tabular("ep_env_ret_mean", ep_env_ret_mean)
    logger.dump_tabular()
Example #9
def learn(env, model_path, data_path, policy_fn, *,
          rolloutSize, num_options=4, horizon=80,
          clip_param=0.025, ent_coeff=0.01,  # clipping parameter epsilon, entropy coeff
          optim_epochs=10, mainlr=3.25e-4, intlr=1e-4, piolr=1e-4, termlr=5e-7, optim_batchsize=100,  # optimization hypers
          gamma=0.99, lam=0.95,  # advantage estimation
          max_iters=20,  # time constraint
          adam_epsilon=1e-5,
          schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
          retrain=False,
          ):
    """
        Core learning function
    """
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space, num_options=num_options)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space, num_options=num_options)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32,
                            shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    term_adv = U.get_placeholder(name='term_adv', dtype=tf.float32, shape=[None])
    op_adv = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    betas = tf.placeholder(dtype=tf.float32, shape=[None])  # Option termination probabilities

    ac = pi.pdtype.sample_placeholder([None])

    # Setup losses and stuff
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-ent_coeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)

    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    term_loss = pi.tpred * term_adv

    activated_options = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    pi_w = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    option_hot = tf.one_hot(option, depth=num_options)
    pi_I = (pi.intfc * activated_options) * pi_w / tf.expand_dims(
        tf.reduce_sum((pi.intfc * activated_options) * pi_w, axis=1), 1)
    pi_I = tf.clip_by_value(pi_I, 1e-6, 1 - 1e-6)
    int_loss = - tf.reduce_sum(betas * tf.reduce_sum(pi_I * option_hot, axis=1) * op_adv)

    intfc = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    pi_I = (intfc * activated_options) * pi.op_pi / tf.expand_dims(
        tf.reduce_sum((intfc * activated_options) * pi.op_pi, axis=1), 1)
    pi_I = tf.clip_by_value(pi_I, 1e-6, 1 - 1e-6)
    op_loss = - tf.reduce_sum(betas * tf.reduce_sum(pi_I * option_hot, axis=1) * op_adv)

    log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-20, 1.0))
    op_entropy = -tf.reduce_mean(pi.op_pi * log_pi, reduction_indices=1)
    op_loss -= 0.01 * tf.reduce_sum(op_entropy)

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult, option], losses + [U.flatgrad(total_loss, var_list)])
    termgrad = U.function([ob, option, term_adv],
                          [U.flatgrad(term_loss, var_list)])  # Since we will use a different step size.
    opgrad = U.function([ob, option, betas, op_adv, intfc, activated_options],
                        [U.flatgrad(op_loss, var_list)])  # Since we will use a different step size.
    intgrad = U.function([ob, option, betas, op_adv, pi_w, activated_options],
                         [U.flatgrad(int_loss, var_list)])  # Since we will use a different step size.
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
                                                    for (oldv, newv) in
                                                    zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses)

    U.initialize()
    adam.sync()

    episodes_so_far = 0
    timesteps_so_far = 0
    global iters_so_far
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=5)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=5)  # rolling buffer for episode rewards

    datas = [0 for _ in range(num_options)]

    if retrain:
        print("Retraining to New Task !! ")
        time.sleep(2)
        U.load_state(model_path+'/')

    p = []
    max_timesteps = int(horizon * rolloutSize * max_iters)
    while True:
        if max_iters and iters_so_far >= max_iters:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)
        render = False

        rollouts = sample_trajectory(pi, env, horizon=horizon, rolloutSize=rolloutSize, render=render)
        # Save rollouts
        data = {'rollouts': rollouts}
        p.append(data)
        del data
        data_file_name = data_path + 'rollout_data.pkl'
        pickle.dump(p, open(data_file_name, "wb"))

        add_vtarg_and_adv(rollouts, gamma, lam, num_options)

        opt_d = []
        for i in range(num_options):
            dur = np.mean(rollouts['opt_dur'][i]) if len(rollouts['opt_dur'][i]) > 0 else 0.
            opt_d.append(dur)

        ob, ac, opts, atarg, tdlamret = rollouts["ob"], rollouts["ac"], rollouts["opts"], rollouts["adv"], rollouts["tdlamret"]
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob)  # update running mean/std for policy
        assign_old_eq_new()  # set old parameter values to new parameter values

        # Optimizing the policy
        for opt in range(num_options):
            indices = np.where(opts == opt)[0]
            print("Option- ", opt, " Batch Size: ", indices.size)
            opt_d[opt] = indices.size
            if not indices.size:
                continue

            datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent)

            if indices.size < optim_batchsize:
                print("Too few samples for opt - ", opt)
                continue

            optim_batchsize_corrected = optim_batchsize
            optim_epochs_corrected = np.clip(int(indices.size / optim_batchsize_corrected), 1, optim_epochs)
            print("Optim Epochs:", optim_epochs_corrected)
            logger.log("Optimizing...")
            # Here we do a bunch of optimization epochs over the data

            for _ in range(optim_epochs_corrected):
                losses = []  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize_corrected):
                    *newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"],
                                                    cur_lrmult, [opt])
                    adam.update(grads, mainlr * cur_lrmult)
                    losses.append(newlosses)

            # Optimize termination functions
            termg = termgrad(rollouts["ob"], rollouts['opts'], rollouts["op_adv"])[0]
            adam.update(termg, termlr)

            # Optimize interest functions
            intgrads = intgrad(rollouts['ob'], rollouts['opts'], rollouts["last_betas"], rollouts["op_adv"], rollouts["op_probs"], rollouts["activated_options"])[0]
            adam.update(intgrads, intlr)

        # Optimize policy over options
        opgrads = opgrad(rollouts['ob'], rollouts['opts'], rollouts["last_betas"], rollouts["op_adv"], rollouts["intfc"], rollouts["activated_options"])[0]
        adam.update(opgrads, piolr)

        lrlocal = (rollouts["ep_lens"], rollouts["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("Success", rollouts["success"])
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    return pi
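Example #9 builds PPO's clipped surrogate objective in TensorFlow (surr1, surr2, pol_surr). The same objective written in a few lines of NumPy, purely as an illustrative sketch of what those tensors compute:

import numpy as np

def ppo_clip_loss(logp_new, logp_old, advantages, clip_param=0.2):
    """Pessimistic clipped surrogate: -E[min(r*A, clip(r, 1-eps, 1+eps)*A)] (sketch)."""
    ratio = np.exp(logp_new - logp_old)          # pi_new / pi_old
    surr1 = ratio * advantages                   # unclipped surrogate
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    return -np.mean(np.minimum(surr1, surr2))    # negated so it can be minimized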
Example #10
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(
                    eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
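Example #10 averages its logged statistics across MPI workers by summing them with allreduce and dividing by the world size. A minimal mpi4py sketch of that reduction, illustrative only and assuming mpi4py is available and the script is launched under mpirun:

import numpy as np
from mpi4py import MPI  # assumption: mpi4py is installed

def average_stats_across_workers(stats):
    """Average a dict of scalar stats over all MPI ranks (sketch of the Example #10 pattern)."""
    comm = MPI.COMM_WORLD
    local = np.array([float(v) for v in stats.values()])
    summed = comm.allreduce(local)               # default reduction op is SUM
    return {k: v / comm.Get_size() for k, v in zip(stats.keys(), summed)}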
Example #11
def main():
    # config for training
    config = Config()
    config.use_bow = False

    # config for validation
    valid_config = Config()
    valid_config.keep_prob = 1.0
    valid_config.dec_keep_prob = 1.0
    valid_config.batch_size = 60
    valid_config.use_bow = False

    # configuration for testing
    test_config = Config()
    test_config.keep_prob = 1.0
    test_config.dec_keep_prob = 1.0
    test_config.batch_size = 1
    test_config.use_bow = False

    pp(config)

    # get data set
    api = SWDADialogCorpus(FLAGS.data_dir,
                           word2vec=FLAGS.word2vec_path,
                           word2vec_dim=config.embed_size)
    dial_corpus = api.get_dialog_corpus()
    meta_corpus = api.get_meta_corpus()

    train_meta, valid_meta, test_meta = meta_corpus.get(
        "train"), meta_corpus.get("valid"), meta_corpus.get("test")
    train_dial, valid_dial, test_dial = dial_corpus.get(
        "train"), dial_corpus.get("valid"), dial_corpus.get("test")

    # convert to numeric input outputs that fits into TF models
    train_feed = SWDADataLoader("Train", train_dial, train_meta, config)
    valid_feed = SWDADataLoader("Valid", valid_dial, valid_meta, config)
    test_feed = SWDADataLoader("Test", test_dial, test_meta, config)

    # begin training
    sess_config = tf.ConfigProto(log_device_placement=False,
                                 allow_soft_placement=True)
    # sess_config.gpu_options.allow_growth = True
    sess_config.gpu_options.per_process_gpu_memory_fraction = 0.45
    with tf.Session(config=sess_config) as sess:
        initializer = tf.random_uniform_initializer(-1.0 * config.init_w,
                                                    config.init_w)
        scope = "model"
        with tf.variable_scope(scope, reuse=None, initializer=initializer):
            model = KgRnnCVAE(sess,
                              config,
                              api,
                              log_dir=None if FLAGS.forward_only else log_dir,
                              forward=False,
                              scope=scope)
        with tf.variable_scope(scope, reuse=True, initializer=initializer):
            valid_model = KgRnnCVAE(sess,
                                    valid_config,
                                    api,
                                    log_dir=None,
                                    forward=False,
                                    scope=scope)
        with tf.variable_scope(scope, reuse=True, initializer=initializer):
            test_model = KgRnnCVAE(sess,
                                   test_config,
                                   api,
                                   log_dir=None,
                                   forward=True,
                                   scope=scope)

        test_model.prepare_mul_ref()

        logger.info("Created computation graphs")
        if api.word2vec is not None and not FLAGS.forward_only:
            logger.info("Loaded word2vec")
            sess.run(model.embedding.assign(np.array(api.word2vec)))

        # write config to a file for logging
        if not FLAGS.forward_only:
            with open(os.path.join(log_dir, "run.log"), "wb") as f:
                f.write(pp(config, output=False))

        # create a folder by force
        ckp_dir = os.path.join(log_dir, "checkpoints")
        if not os.path.exists(ckp_dir):
            os.mkdir(ckp_dir)

        ckpt = tf.train.get_checkpoint_state(ckp_dir)
        logger.info("Created models with fresh parameters.")
        sess.run(tf.global_variables_initializer())

        if ckpt:
            logger.info("Reading dm models parameters from %s" %
                        ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)

        if not FLAGS.forward_only:
            dm_checkpoint_path = os.path.join(
                ckp_dir, model.__class__.__name__ + ".ckpt")
            global_t = 1
            patience = 10  # wait for at least 10 epoch before stop
            dev_loss_threshold = np.inf
            best_dev_loss = np.inf
            for epoch in range(config.max_epoch):
                logger.info(">> Epoch %d with lr %f" %
                            (epoch,
                             sess.run(model.learning_rate_cyc,
                                      {model.global_t: global_t})))

                # begin training
                if train_feed.num_batch is None or train_feed.ptr >= train_feed.num_batch:
                    train_feed.epoch_init(config.batch_size,
                                          config.backward_size,
                                          config.step_size,
                                          shuffle=True)
                global_t, train_loss = model.train(
                    global_t,
                    sess,
                    train_feed,
                    update_limit=config.update_limit)

                # begin validation
                logger.record_tabular("Epoch", epoch)
                logger.record_tabular("Mode", "Val")
                valid_feed.epoch_init(valid_config.batch_size,
                                      valid_config.backward_size,
                                      valid_config.step_size,
                                      shuffle=False,
                                      intra_shuffle=False)
                valid_loss = valid_model.valid("ELBO_VALID", sess, valid_feed)

                logger.record_tabular("Epoch", epoch)
                logger.record_tabular("Mode", "Test")
                test_feed.epoch_init(valid_config.batch_size,
                                     valid_config.backward_size,
                                     valid_config.step_size,
                                     shuffle=False,
                                     intra_shuffle=False)
                valid_model.valid("ELBO_TEST", sess, test_feed)

                # test_feed.epoch_init(test_config.batch_size, test_config.backward_size,
                #                      test_config.step_size, shuffle=True, intra_shuffle=False)
                # test_model.test_mul_ref(sess, test_feed, num_batch=5)

                done_epoch = epoch + 1
                # only save a models if the dev loss is smaller
                # Decrease learning rate if no improvement was seen over last 3 times.
                if config.op == "sgd" and done_epoch > config.lr_hold:
                    sess.run(model.learning_rate_decay_op)

                if valid_loss < best_dev_loss:
                    if valid_loss <= dev_loss_threshold * config.improve_threshold:
                        patience = max(patience,
                                       done_epoch * config.patient_increase)
                        dev_loss_threshold = valid_loss

                    # still save the best train model
                    if FLAGS.save_model:
                        logger.info("Save model!!")
                        model.saver.save(sess, dm_checkpoint_path)
                    best_dev_loss = valid_loss

                    if (epoch % 3) == 2:
                        tmp_model_path = os.path.join(
                            ckp_dir,
                            model.__class__.__name__ + str(epoch) + ".ckpt")
                        model.saver.save(sess, tmp_model_path)

                if config.early_stop and patience <= done_epoch:
                    logger.info("!!Early stop due to run out of patience!!")
                    break
            logger.info("Best validation loss %f" % best_dev_loss)
            logger.info("Done training")
        else:
            # begin validation
            valid_feed.epoch_init(valid_config.batch_size,
                                  valid_config.backward_size,
                                  valid_config.step_size,
                                  shuffle=False,
                                  intra_shuffle=False)
            valid_model.valid("ELBO_VALID", sess, valid_feed)

            test_feed.epoch_init(valid_config.batch_size,
                                 valid_config.backward_size,
                                 valid_config.step_size,
                                 shuffle=False,
                                 intra_shuffle=False)
            valid_model.valid("ELBO_TEST", sess, test_feed)

            dest_f = open(os.path.join(log_dir, "test.txt"), "wb")
            test_feed.epoch_init(test_config.batch_size,
                                 test_config.backward_size,
                                 test_config.step_size,
                                 shuffle=False,
                                 intra_shuffle=False)
            test_model.test_mul_ref(sess,
                                    test_feed,
                                    num_batch=None,
                                    repeat=5,
                                    dest=dest_f)
            dest_f.close()
Example #12
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
          normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
          popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
          tau=0.01, eval_env=None, param_noise_adaption_interval=50):

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    saver = tf.train.Saver()

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
Example #13
    def cartpole_train(self, rank, env, global_brain, agent, optimizer, global_t, send_rev, args):
        #global_total_loss = []
        o = env.reset()
        step = 0
        sum_rewards = 0
        max_sum_rewards = 0
        vs = []
        entropies = []
        sum_rewards = 0
        done = True
        #cnt = 0
        while global_t[0] < args.epoch:
            tmp = global_t.clone().item() + 1
            #print('cnt:',cnt)
            agent.local_brain.sync(global_brain)  # copy the global weights into the local policy
            observations = []
            actions = []
            values = []
            rewards = []
            probs = []
            R = 0
            for _ in range(args.local_t_max):
                global_t += 1
                step += 1
                # Get an action via the agent's act()
                p, v = agent.local_brain(Variable(torch.from_numpy(o).float()).unsqueeze(0))
                a = agent.act(o)
                if len(a.data.squeeze().size()) == 0:
                    o, r, done, _ = env.step(a.data.squeeze().item())
                else:
                    o, r, done, _ = env.step(a.data.squeeze()[0])
                if r != 1:
                    print('-----------------------------------------------------------------------------------------------')
                if rank == 0:
                    sum_rewards += r
                    if args.render:
                        env.render()

                observations.append(o)
                actions.append(a)
                values.append(v)
                rewards.append(r)
                probs.append(p)
                if done:
                    o = env.reset()
                    #self.total_reward = 0
                    if rank == 0:
                        print('----------------------------------')
                        print('total reward of the episode:', sum_rewards)
                        print('----------------------------------')
                        if args.save_mode == 'all':
                            torch.save(agent.local_brain, os.path.join(args.log_dir, args.save_name+"_{}.pkl".format(global_t[0])))
                        elif args.save_mode == 'last':
                            torch.save(agent.local_brain, os.path.join(args.log_dir, args.save_name+'.pkl'))
                        elif args.save_mode == 'max':
                            if max_sum_rewards < sum_rewards:
                                torch.save(agent.local_brain, os.path.join(args.log_dir, args.save_name+'.pkl'))
                                max_sum_rewards = sum_rewards
                        step = 0
                    break
                else:
                    #self.total_reward += r
                    _, v = agent.local_brain(torch.from_numpy(o).unsqueeze(0).float())
                    R += v.data.squeeze().item()

            # -- Agent advantage_push_agent.local_brain() --- compute the discounted sum of rewards
            
            returns = []
            for r in rewards[::-1]:  # discounted sum of rewards
                R = r + 0.99 * R
                returns.insert(0, R)
            returns = torch.Tensor(returns)


            #if len(returns) > 1:
            #    returns = (returns-returns.mean()) / (returns.std()+args.eps)

            # -- LocalBrain _build_graph() --- compute the loss

            loss, v_loss, entropy, p_loss_list = agent._loss_function(actions, values, probs, returns, args)

            vs.append(v_loss.data.numpy())
            entropies.append(entropy.data.numpy())

            self.global_history_reward.append([tmp, sum_rewards])

            ## Logging
            if rank == 0 and done:
                logger.record_tabular_misc_stat('Entropy', entropies)
                logger.record_tabular_misc_stat('V', vs)
                logger.record_tabular('reward', sum_rewards)
                logger.record_tabular('step', global_t[0])
                logger.dump_tabular()
                del vs[:]
                del entropies[:]
                sum_rewards = 0

            # Update the weights (backpropagate through to the end)
            optimizer.zero_grad()
            final_node = [loss] + p_loss_list
            #print('final_node',final_node)
            gradients = [torch.ones(1)] + [None] * len(p_loss_list)
            #print('gradients',gradients)
            autograd.backward(final_node, gradients)
            #print('after_final_node',final_node)
            #print('after_gradients',gradients)

            #raise
            # Update the learning rate
            new_lr = np.true_divide(args.epoch - global_t[0], args.epoch * args.lr)
            optimizer.step(new_lr)

            # cnt += 1

        send_rev.send(self.global_history_reward)
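Example #13 computes the discounted return by walking the reward list backwards, seeding the recursion with the bootstrap value estimate R. A standalone sketch of that computation, illustrative only, with gamma fixed at 0.99 as in the example:

def discounted_returns(rewards, bootstrap_value, gamma=0.99):
    """G_t = r_t + gamma * G_{t+1}, seeded with a bootstrap value (sketch)."""
    R = bootstrap_value
    returns = []
    for r in reversed(rewards):
        R = r + gamma * R
        returns.insert(0, R)
    return returns

# e.g. discounted_returns([1.0, 1.0, 1.0], 0.0) -> [2.9701, 1.99, 1.0]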
Example #14
def main():

    logger.configure('{}{}_logs'.format(filePath, envName))
    for k, v in C.items():
        logger.record_tabular(k, v)
    logger.dump_tabular()

    logger.log('MsPacman')

    #Start the session
    sess = tf.InteractiveSession()

    train_env = make_env(C['env_id'], C['noop_max'])
    eval_env = make_env(C['env_id'], C['noop_max'])

    #Intitialize variables to record outputs
    train_track = [0.0]
    eval_track = []
    best_reward = 0

    train_reward = tf.placeholder(tf.float32)
    eval_reward = tf.placeholder(tf.float32)
    train_env = make_env(C['env_id'], C['noop_max'])
    eval_env = make_env(C['env_id'], C['noop_max'])
    agent = Agent(train_env, C)

    train_fs = reset_fs()
    train_s = train_env.reset()
    best_reward = 0
    train_mean = []
    eval_mean = []

    train_summary = tf.summary.scalar('train_reward', train_reward)
    eval_summary = tf.summary.scalar('eval_reward', eval_reward)
    writer = tf.summary.FileWriter('{}{}_summary'.format(filePath, envName),
                                   sess.graph)
    sess.run(tf.global_variables_initializer())

    agent.net.update_target_network()

    for it in range(C['iterations']):

        train_fs.append(train_s)

        train_a = agent.act(np.transpose(train_fs, (1, 2, 0)))
        ns, train_r, train_d, _ = train_env.step(train_a)
        #print('Iteration ',it, ' Reward ', train_r)
        train_track[-1] += train_r
        agent.record(train_s, train_a, train_r, float(train_d), it)
        train_s = ns

        if train_d:
            if train_env.env.env.was_real_done:  # one env for MsPacman, Freeway (No Fire action)
                if len(train_track) % 100 == 0:
                    mean = np.mean(train_track[-100:])
                    train_mean.append(mean)
                    summary = sess.run(train_summary,
                                       feed_dict={train_reward: mean})
                    writer.add_summary(summary, it)
                    logger.record_tabular('steps', it)
                    logger.record_tabular('episode', len(train_track))
                    logger.record_tabular('epsilon', 100 * agent.epsilon)
                    logger.record_tabular('learning rate', agent.lr)
                    logger.record_tabular('Mean Reward 100 episodes', mean)
                    logger.dump_tabular()
                    with open(resultPath + 'reward_atari_base.pk1', 'wb') as f:
                        pickle.dump(train_track,
                                    f,
                                    protocol=pickle.HIGHEST_PROTOCOL)

                train_track.append(0.0)

            train_fs = reset_fs()
            train_s = train_env.reset()

        if (it + 1) % C['eval_freq'] == 0:

            for i in range(C['eval_episodes']):
                temp_video = []
                eval_track.append(0.0)
                eval_fs = reset_fs()
                eval_s = eval_env.reset()
                while True:
                    temp_video.append(eval_s)
                    eval_fs.append(eval_s)
                    eval_a = agent.greedy_act(np.transpose(eval_fs, (1, 2, 0)))
                    eval_s, eval_r, eval_d, _ = eval_env.step(eval_a)
                    eval_track[-1] += eval_r

                    if eval_env.env.env.was_real_done:
                        break
                    if eval_d:
                        eval_fs = reset_fs()
                        eval_s = eval_env.reset()

                if eval_track[-1] > best_reward:
                    best_reward = eval_track[-1]
                    best_video = temp_video
                    with open(resultPath + 'video_atari_base.pk1', 'wb') as f:
                        pickle.dump(best_video,
                                    f,
                                    protocol=pickle.HIGHEST_PROTOCOL)

            eval_mean.append(np.mean(eval_track[-C['eval_episodes']:]))
            summary = sess.run(eval_summary,
                               feed_dict={
                                   eval_reward:
                                   np.mean(eval_track[-C['eval_episodes']:])
                               })
            writer.add_summary(summary, it)

        if it == 1000000:
            outputs = agent.net.get_outputs(np.transpose(train_fs, (1, 2, 0)))
            with open(resultPath + 'outputs.pk1', 'wb') as f:
                pickle.dump(outputs, f, protocol=pickle.HIGHEST_PROTOCOL)
            with open(resultPath + 'outputs_screen.pk1', 'wb') as f:
                pickle.dump(train_fs, f, protocol=pickle.HIGHEST_PROTOCOL)

    with open(resultPath + 'reward_atari_base.pk1', 'wb') as f:
        pickle.dump(train_track, f, protocol=pickle.HIGHEST_PROTOCOL)
    with open(resultPath + 'trainMean_atari_base.pk1', 'wb') as f:
        pickle.dump(train_mean, f, protocol=pickle.HIGHEST_PROTOCOL)
    with open(resultPath + 'evalMean_atari_base.pk1', 'wb') as f:
        pickle.dump(eval_mean, f, protocol=pickle.HIGHEST_PROTOCOL)
    agent.net.save(filePath + '{}_model2'.format(C['env_id']))
    sess.close()
Example #15
def testing(
        save_path,
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=50,
        nb_rollout_steps=3,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        # no noise for test
        #   noise_type='adaptive-param_0.2',
        #   noise_type='normal_0.9',
        #   noise_type='ou_0.9',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        #   actor_lr=1e-6,
        #   critic_lr=1e-5,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=3,  # per epoch cycle and MPI worker,  50
        nb_eval_steps=1,
        batch_size=64,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=3,  #
        **network_kwargs):

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles *
                                             nb_rollout_steps)
    else:
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()
    # nb_actions = env.action_space.shape[-1]
    # nb_actions = 2*env.grid_size
    nb_actions = env.grid_size
    action_shape = np.array(nb_actions * [0]).shape
    nb_features = (4 + 1) * env.grid_size
    observation_shape = np.array(nb_features * [0]).shape
    grid_x = env.grid_x
    grid_y = env.grid_y
    x = []
    y = []
    for i in range(grid_x):
        x.append(i + 1)
    for i in range(grid_y):
        y.append(i + 1)
    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.

    # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    memory = Memory(limit=int(1e6),
                    action_shape=action_shape,
                    observation_shape=observation_shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    '''no noise for test'''
    # if noise_type is not None:
    #     for current_noise_type in noise_type.split(','):
    #         current_noise_type = current_noise_type.strip()
    #         if current_noise_type == 'none':
    #             pass
    #         elif 'adaptive-param' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
    #         elif 'normal' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    #         elif 'ou' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    #         else:
    #             raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # max_action = env.action_space.high
    # logger.info('scaling actions by {} before executing in env'.format(max_action))

    # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
    agent = DDPG(actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    # Prepare everything.
    # agent.initialize(sess)
    # sess.graph.finalize()
    agent.load(sess, save_path)

    agent.reset()

    obs, env_state = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  #scalar
    t = 0  # scalar
    step_set = []
    reward_set = []

    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    average_reward = []
    mean_epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_state = []
    epoch_episodes = 0
    #record the car numbers in each step
    car_num_set = {}
    t_set = [i for i in range(total_timesteps)]
    for xx in x:
        for yy in y:
            lab = str(xx) + str(yy)
            car_num_set[lab] = [[0 for i in range(total_timesteps)]
                                for j in range(4)]

    for epoch in range(nb_epochs):
        obs, env_state = env.reset()
        epoch_actions = []
        epoch_state = []
        average_car_num_set = []
        last_action = 1
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            action, q, _, _ = agent.step(obs,
                                         apply_noise=False,
                                         compute_Q=True)
            '''random action'''
            # if np.random.rand()>0.5:
            #     action=[1]
            # else:
            #     action=[0]
            '''cycle light state'''
            # action=[0]
            '''cycle action (should cycle state instead of action)'''
            # if last_action==1:
            #     action=[0]
            # else:
            #     action=[1]
            # last_action=action[0]

            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each
                # of the environments, so resetting here instead
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                new_obs, r, env_state, done = env.step(action, env_state)
                epoch_state.append(env_state['11'].light_state)
                for xx in x:
                    for yy in y:
                        lab = str(xx) + str(yy)
                        for i in range(4):
                            car_num_set[lab][i][t] = (
                                env_state['11'].car_nums[i])
                t += 1
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                b = 1.
                agent.store_transition(
                    obs, action, r, new_obs, done
                )  #the batched data will be unrolled in memory.py's append.
                obs = new_obs

                for d in range(len(done)):
                    if done[d]:
                        print('done')
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward[d])
                        episode_rewards_history.append(episode_reward[d])
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1
                        if nenvs == 1:
                            agent.reset()

            epoch_episode_rewards.append(episode_reward)
            average_reward.append(episode_reward / nb_rollout_steps)

            episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            # for t_train in range(nb_train_steps):
            #     # Adapt param noise, if necessary.
            #     if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
            #         distance = agent.adapt_param_noise()
            #         epoch_adaptive_distances.append(distance)
            #     # print('Train!')
            #     cl, al = agent.train()
            #     epoch_critic_losses.append(cl)
            #     epoch_actor_losses.append(al)
            #     agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(
                                eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0
            step_set.append(t)

        mpi_size = MPI.COMM_WORLD.Get_size()
        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(
            epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards))
        # print(step_set,mean_epoch_episode_rewards)
        # plt.figure(figsize=(8,5))
        '''plot rewards-steps'''
        ax1 = plt.subplot(2, 1, 1)
        plt.sca(ax1)
        plt.plot(step_set, average_reward, color='b')
        # plt.xlabel('Steps')
        plt.ylabel('Mean Reward', fontsize=12)
        # plt.ylim(-15000,0)
        '''plot queueing car numbers-steps'''
        ax2 = plt.subplot(2, 1, 2)
        plt.sca(ax2)
        print(np.shape(t_set), np.shape(car_num_set['11'][0]))
        for i in range(4):
            if i == 0:
                plt.plot(t_set, car_num_set['11'][i], '--', label=i, color='b')
            elif i == 1:
                plt.plot(t_set,
                         car_num_set['11'][i],
                         '--',
                         label=i,
                         color='orange')
            elif i == 2:
                plt.plot(t_set, car_num_set['11'][i], label=i, color='g')
            else:
                plt.plot(t_set, car_num_set['11'][i], label=i, color='r')
        plt.ylim(0, 100)
        #sum among roads
        sum_car_num = np.sum(car_num_set['11'], axis=0)
        #average among time steps
        average_car_num = np.average(sum_car_num)
        average_car_num_set.append(average_car_num)

        plt.xlabel('Steps', fontsize=12)
        plt.ylabel('Number of Cars', fontsize=12)
        # set legend
        handles, labels = plt.gca().get_legend_handles_labels()
        by_label = OrderedDict(zip(labels, handles))
        leg = plt.legend(by_label.values(), by_label.keys(), loc=1)
        # leg = plt.legend(loc=4)
        legfm = leg.get_frame()
        legfm.set_edgecolor('black')  # set legend frame color
        legfm.set_linewidth(0.5)  # set legend frame linewidth
        plt.savefig('ddpg_mean_test.pdf')
        plt.show()
        print(epoch_state)

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(
                eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

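        # average each scalar stat across MPI workers: sum with allreduce, then divide by the world size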
        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array(
                [np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'),
                          'wb') as f:
                    pickle.dump(eval_env.get_state(), f)
    print('average queueing car numbers: ', np.average(average_car_num_set))

    return agent
Example #16
0
def train(args):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    dataset = pickle.load(
        open("data/" + args.expert_file + "_" + str(args.num_sampled), "rb"))
    dataset.min_reward = 0
    dataset.max_reward = 1
    action_getter = utils.ActionGetter(
        atari.env.action_space.n,
        replay_memory_start_size=REPLAY_MEMORY_START_SIZE,
        max_frames=MAX_FRAMES,
        eps_initial=args.initial_exploration)

    utils.generate_weights(dataset)
    saver = tf.train.Saver(max_to_keep=10)
    sess = tf.Session(config=config)
    sess.run(init)
    fixed_state = np.expand_dims(atari.fixed_state(sess), axis=0)

    if args.checkpoint_index >= 0:
        saver.restore(
            sess, args.checkpoint_dir + args.env_id + "/" + "seed_" +
            str(args.seed) + "/" + "model--" + str(args.checkpoint_index))
        print(
            "Loaded Model ... ", args.checkpoint_dir + args.env_id + "/" +
            "seed_" + str(args.seed) + "/" + "model--" +
            str(args.checkpoint_index))
    logger.configure(args.log_dir + args.env_id + "/" + "seed_" +
                     str(args.seed) + "/")
    if not os.path.exists(args.gif_dir + args.env_id + "/" + "seed_" +
                          str(args.seed) + "/"):
        os.makedirs(args.gif_dir + args.env_id + "/" + "seed_" +
                    str(args.seed) + "/")
    if not os.path.exists(args.checkpoint_dir + args.env_id + "/" + "seed_" +
                          str(args.seed) + "/"):
        os.makedirs(args.checkpoint_dir + args.env_id + "/" + "seed_" +
                    str(args.seed) + "/")

    frame_number = 0
    loss_list = []
    epoch = 0
    while frame_number < MAX_FRAMES:
        print("Training Model ...")
        epoch_frame = 0
        start_time = time.time()
        for j in tqdm(range(EVAL_FREQUENCY // BS)):
            loss = learn(sess,
                         dataset,
                         MAIN_DQN,
                         TARGET_DQN,
                         BS,
                         gamma=DISCOUNT_FACTOR)  # (8★)
            loss_list.append(loss)
            # Output the progress:
        # logger.log("Runing frame number {0}".format(frame_number))
        logger.record_tabular("frame_number", frame_number)
        logger.record_tabular("td loss", np.mean(loss_list[-100:]))
        q_vals = sess.run(MAIN_DQN.action_prob,
                          feed_dict={MAIN_DQN.input: fixed_state})
        for i in range(atari.env.action_space.n):
            logger.record_tabular("q_val action {0}".format(i), q_vals[0, i])
        utils.test_q_values(sess, dataset, atari, action_getter, MAIN_DQN,
                            MAIN_DQN.input, MAIN_DQN.action_prob_expert, BS)
        print("Current Frame: ", frame_number)
        print("TD Loss: ", np.mean(loss_list[-100:]))

        # Evaluation ...
        gif = True
        frames_for_gif = []
        eval_rewards = []
        evaluate_frame_number = 0
        print("Evaluating Model.... ")
        while evaluate_frame_number < EVAL_STEPS:
            terminal_life_lost = atari.reset(sess, evaluation=True)
            episode_reward_sum = 0
            for _ in range(MAX_EPISODE_LENGTH):
                # Fire (action 1), when a life was lost or the game just started,
                # so that the agent does not stand around doing nothing. When playing
                # with other environments, you might want to change this...
                action = 1 if terminal_life_lost and args.env_id == "BreakoutDeterministic-v4" else action_getter.get_action(
                    sess, frame_number, atari.state, MAIN_DQN, evaluation=True)
                processed_new_frame, reward, terminal, terminal_life_lost, new_frame = atari.step(
                    sess, action)
                evaluate_frame_number += 1
                episode_reward_sum += reward
                if gif:
                    frames_for_gif.append(new_frame)
                if terminal:
                    eval_rewards.append(episode_reward_sum)
                    gif = False  # Save only the first game of the evaluation as a gif
                    break
            if len(eval_rewards) % 10 == 0:
                print("Evaluation Completion: ",
                      str(evaluate_frame_number) + "/" + str(EVAL_STEPS))
        print("Evaluation score:\n", np.mean(eval_rewards))
        try:
            utils.generate_gif(
                frame_number, frames_for_gif, eval_rewards[0], args.gif_dir +
                args.env_id + "/" + "seed_" + str(args.seed) + "/")
        except IndexError:
            print("No evaluation game finished")
        logger.log("Average Evaluation Reward", np.mean(eval_rewards))
        logger.log("Average Sequence Length",
                   evaluate_frame_number / len(eval_rewards))
        # Save the network parameters
        saver.save(sess,
                   args.checkpoint_dir + args.env_id + "/" + "seed_" +
                   str(args.seed) + "/" + 'model-',
                   global_step=frame_number)
        print("Runtime: ", time.time() - start_time)
        print("Epoch: ", epoch, "Total Frames: ", frame_number)
        epoch += 1
        logger.dumpkvs()
Example #17
0
def train(env, nb_epochs, nb_epoch_cycles, normalize_observations, actor_lr, critic_lr, action_noise,
          gamma, nb_train_steps, nb_rollout_steps, batch_size, memory, tau=0.01):

    max_action = env.action_space.high
    agent = DDPG(memory, env.observation_space.shape[0], env.action_space.shape[0],
                 gamma=gamma, tau=tau,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 actor_lr=actor_lr, critic_lr=critic_lr,
                 )
    if USE_CUDA:
        agent.cuda()
    # Set up logging stuff only for a single worker.
    step = 0
    episode = 0
    episode_rewards_history = deque(maxlen=100)
    # Prepare everything.

    agent.reset()
    obs = env.reset()
    done = False
    episode_reward = 0.
    episode_step = 0
    episodes = 0
    t = 0

    epoch = 0
    start_time = time.time()

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_start_time = time.time()
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                assert action.shape == env.action_space.shape

                # Execute next action.
                assert max_action.shape == action.shape
                new_obs, r, done, info = env.step(max_action * action)
                t += 1
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs

                if done:
                    # Episode done.
                    epoch_episode_rewards.append(episode_reward)
                    episode_rewards_history.append(episode_reward)
                    epoch_episode_steps.append(episode_step)
                    episode_reward = 0.
                    episode_step = 0
                    epoch_episodes += 1
                    episodes += 1

                    agent.reset()
                    obs = env.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            for t_train in range(nb_train_steps):
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        combined_stats = dict()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])
        logger.dump_tabular()
        logger.info('')
Example #18
0
    def process_samples(self, itr, paths):
        baselines = []
        returns = []

        all_path_baselines = [
            self.algo.baseline.predict(path) for path in paths
        ]

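        # generalized advantage estimation: discount the one-step TD residuals by gamma * gae_lambda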
        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                self.algo.discount * path_baselines[1:] - \
                path_baselines[:-1]
            path["advantages"] = misc_utils.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            path["returns"] = misc_utils.discount_cumsum(
                path["rewards"], self.algo.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

            # a trick to reduce variance but gives biased gradient
            path["value_targets"] = path["advantages"] + np.array(
                path_baselines[:-1])

        ev = misc_utils.explained_variance_1d(np.concatenate(baselines),
                                              np.concatenate(returns))

        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]

        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])

        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
Example #19
0
    def obtain_samples(self, itr, dynamics=None):
        logger.info("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        # dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time
        while n_samples < self.algo.batch_size:
            t = time.time()
            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            if dynamics:
                rewards = dynamics.process_rewards(rewards, obses, actions,
                                                   next_obses)
            env_time += time.time() - t

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, obs, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):

                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(obs)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(
                        dict(
                            observations=self.algo.env.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            actions=self.algo.env.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            obses = next_obses

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths, n_samples
Example #20
0
    def cartpole_train_3_5(self, rank, args):
        torch.manual_seed(args.seed + rank)

        self.agent.local_brain.train()

        step = 0
        sum_rewards = 0
        max_sum_rewards = 0
        vs = []
        entropies = []
        cnt = 0

        while self.g_ep.value < args.epoch:
            #tmp = 0
            o = self.env.reset()
            #o = torch.from_numpy(state)
            #print('cnt:',cnt)
            # self.agent.local_brain.sync(self.global_brain)  # copy global weights to the local policy
            observations, actions, values, rewards, probs = [], [], [], [], []
            #R = 0
            #done = True
            ep_r = 0.
            while True:
                step += 1
                # get an action via the agent's act()
                p, v = self.agent.local_brain(Variable(torch.from_numpy(o).float()).unsqueeze(0))
                a = self.agent.act(o)
                if len(a.data.squeeze().size()) == 0:
                    o, r, done, _ = self.env.step(a.data.squeeze().item())
                else:
                    o, r, done, _ = self.env.step(a.data.squeeze()[0])
                if done: r = -1
                if rank == 0:
                    sum_rewards += r
                    if args.render:
                        self.env.render()
                ep_r += r
                observations.append(o)
                actions.append(a)
                values.append(v)
                rewards.append(r)
                probs.append(p)

                if step % args.local_t_max == 0 or done:
                    if done:
                        R = 0
                    else:
                        _, v = self.agent.local_brain(torch.from_numpy(observations[-1]).unsqueeze(0).float())
                        R = v.data.squeeze().item()

                    returns = []
                    for r in rewards[::-1]:  # discounted return
                        R = r + 0.99 * R
                        returns.insert(0, R)
                    returns = torch.Tensor(returns)


                    loss, v_loss, entropy, _ = self.agent._loss_function(actions, values, probs, returns, args)
                    vs.append(v_loss.data.numpy())
                    entropies.append(entropy.data.numpy())

                    ## logging
                    if rank == 0 and done:
                        logger.record_tabular_misc_stat('Entropy', entropies)
                        logger.record_tabular_misc_stat('V', vs)
                        logger.record_tabular('reward', sum_rewards)
                        logger.record_tabular('step', self.g_ep.value)
                        logger.dump_tabular()
                        del vs[:]
                        del entropies[:]
                    self.optimizer.zero_grad()
                    loss.backward(retain_graph=True)
                    for lp, gp in zip(self.agent.local_brain.parameters(), self.global_brain.parameters()):
                        gp._grad = lp.grad

                    self.optimizer.step()
                    self.agent.local_brain.sync(self.global_brain)  # copy global weights to the local policy

                    observations, actions, values, rewards, probs = [], [], [], [], []

                if done:
                    with self.g_ep.get_lock():
                        self.g_ep.value += 1
                    with self.g_ep_r.get_lock():
                        if self.g_ep_r.value == 0.:
                            self.g_ep_r.value = ep_r
                        else:
                            self.g_ep_r.value = self.g_ep_r.value * 0.99 + ep_r * 0.01
                    self.res_queue.put(self.g_ep_r.value)

                    o = self.env.reset()
                    #self.global_history_reward.append([tmp, self.total_reward])
                    self.total_reward = 0
                    if rank == 0:
                        print('----------------------------------')
                        print('total reward of the episode:', sum_rewards)
                        print('----------------------------------')
                        if args.save_mode == 'all':
                            torch.save(self.agent.local_brain, os.path.join(args.log_dir, args.save_name+"_{}.pkl".format(self.g_ep.value)))
                        elif args.save_mode == 'last':
                            torch.save(self.agent.local_brain, os.path.join(args.log_dir, args.save_name+'.pkl'))
                        elif args.save_mode == 'max':
                            if max_sum_rewards < sum_rewards:
                                torch.save(self.agent.local_brain, os.path.join(args.log_dir, args.save_name+'.pkl'))
                                max_sum_rewards = sum_rewards
                        #step = 0
                        sum_rewards = 0
                    break

            #raise
            # update the learning rate
            # new_lr = np.true_divide(args.epoch - global_t[0] , args.epoch * args.lr)
            # self.optimizer.step(new_lr)

            cnt += 1

        #send_rev.send(self.global_history_reward)
        self.res_queue.put(None)
Example #21
0
def train(policy, planner, rollout_worker, evaluator, n_epochs,
          n_test_rollouts, n_cycles, n_batches, policy_save_interval,
          save_path, **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()

    if save_path:
        latest_mdl_path = save_path + '_latest'
        best_mdl_path = save_path
        periodic_policy_path = save_path + '_{}'

    best_success_rate = -1

    logger.info('Training......')
    # num_timesteps = n_epochs * n_cycles * rollout_length * number of rollout workers
    for epoch in range(n_epochs):
        logger.info('========== epoch {} ========='.format(epoch))
        logger.record_tabular('epoch', epoch)

        # train
        rollout_worker.clear_history()
        for _ in range(n_cycles):
            # logger.info('collect rollouts...')
            episode_for_act, episode_for_pln = rollout_worker.generate_rollouts(
                cur_progress=(epoch / n_epochs))
            # logger.info('store rollouts for policy')
            policy.store_episode(episode_for_act)
            # logger.info('store rollouts for planner, episodes_for_pln shape:', episode_for_pln.shape)
            planner.store_episode(episode_for_pln)
            # logger.info('training policy')
            for _ in range(n_batches):
                policy.train()
            policy.update_target_net()
            # logger.info('training planner')
            for _ in range(n_batches):
                planner.train(use_buffer=True)

        # test
        # logger.info("evaluate...")
        evaluator.clear_history()
        for ro in range(n_test_rollouts):
            evaluator.generate_rollouts()

        for key, val in evaluator.logs('test'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in rollout_worker.logs('train'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in policy.logs():
            logger.record_tabular(key, mpi_average(val))
        for key, val in planner.logs():
            logger.record_tabular(key, mpi_average(val))
        if rank == 0:
            logger.dump_tabular()

        success_rate = mpi_average(evaluator.current_success_rate())
        if rank == 0 and success_rate >= best_success_rate and save_path:
            best_success_rate = success_rate
            # logger.info('New best success rate: {}. Saving policy to {} ...'.format(best_success_rate, best_policy_path))
            # evaluator.save_policy(latest_mdl_path)
            logger.info(
                'Saving best policy+planner to {} ...'.format(best_mdl_path))
            evaluator.save_policy(best_mdl_path)
            evaluator.save_planner(best_mdl_path)
        if rank == 0 and policy_save_interval > 0 and epoch % policy_save_interval == 0 and save_path:
            # policy_path = periodic_policy_path.format(epoch)
            logger.info('Saving latest policy+planner to {} ...'.format(
                latest_mdl_path))
            evaluator.save_policy(latest_mdl_path)
            evaluator.save_planner(latest_mdl_path)
        elif rank == 0 and policy_save_interval < 0 and epoch % (
                -policy_save_interval) == 0 and save_path:
            periodic_mdl_path = periodic_policy_path.format(epoch)
            logger.info('Saving periodic policy+planner to {} ...'.format(
                periodic_mdl_path))
            evaluator.save_policy(periodic_mdl_path)
            evaluator.save_planner(periodic_mdl_path)

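        # sanity check that non-root workers draw different random numbers than root (i.e. their seeds differ)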
        local_uniform = np.random.uniform(size=(1, ))
        root_uniform = local_uniform.copy()
        MPI.COMM_WORLD.Bcast(root_uniform, root=0)
        if rank != 0:
            assert local_uniform[0] != root_uniform[0]

    return policy, planner
Example #22
0
File: long_short.py  Project: gohsyi/taxi
def main():
    logger.configure('logs/long_short')

    global T, PRIORITY, DICHOTOMY, ENTROPY

    for PRIORITY, DICHOTOMY, ENTROPY in [
        (True, False, True),
        (True, False, False),
        (False, True, False),
        (False, False, False),
    ]:
        income_means, income_stds = [], []
        short_ratios, long_ratios = [], []
        short_passengers, long_passengers = [], []
        for seed in range(N_RUNS):
            np.random.seed(seed)
            T = 0
            g_lanes.clear()
            g_lanes.update({'short': Lane(), 'long': Lane()})
            # short_passengers, long_passengers = [], []
            enter_passengers = np.random.poisson(0.1, size=LENGTH)

            g_taxis.clear()
            for i in range(N_TAXIS // 2):
                g_taxis.append(Taxi(i))
                enter(g_taxis[-1], g_lanes['short'])
            for i in range(N_TAXIS // 2):
                g_taxis.append(Taxi(i + N_TAXIS // 2))
                enter(g_taxis[-1], g_lanes['long'])

            while T < LENGTH:
                if enter_passengers[T]:
                    dist = max(
                        2, np.random.choice(range(len(DISTANCES)),
                                            p=DISTANCES))
                    p = Passenger(dist)
                    if not DICHOTOMY:
                        lane = RANDOM_LANE()
                    elif p.distance <= THRESHOLD:
                        lane = g_lanes['short']
                    else:
                        lane = g_lanes['long']
                    lane.passengers.append(p)

                g_lanes['short'].step()
                g_lanes['long'].step()
                for taxi in g_taxis:
                    taxi.step()

                short_passengers.append(len(g_lanes['short'].passengers))
                long_passengers.append(len(g_lanes['long'].passengers))

                T += 1

            incomes = [np.sum(t.incomes) for t in g_taxis]

            income_means.append(np.mean(incomes))
            income_stds.append(np.std(incomes))
            short_ratios.append(
                np.mean([r for t in g_taxis for r in t.income_ratio['short']]))
            long_ratios.append(
                np.mean([r for t in g_taxis for r in t.income_ratio['long']]))

        # logger.info(income_means)
        # logger.info(income_stds)
        logger.record_tabular('*priority*', PRIORITY)
        logger.record_tabular('*dichotomy*', DICHOTOMY)
        logger.record_tabular('*entropy*', ENTROPY)
        logger.record_tabular('income mean', np.mean(income_means))
        logger.record_tabular('income std', np.mean(income_stds))
        logger.record_tabular('queuing time mean',
                              np.mean([t.queue_time for t in g_taxis]))
        logger.record_tabular('short income ratio mean',
                              np.mean(short_ratios) * 3600)
        logger.record_tabular('short income ratio std',
                              np.std(short_ratios) * 3600)
        logger.record_tabular('long income ratio mean',
                              np.mean(long_ratios) * 3600)
        logger.record_tabular('long income ratio std',
                              np.std(long_ratios) * 3600)
        logger.record_tabular('# short lane passengers',
                              np.mean(short_passengers))
        logger.record_tabular('# long lane passengers',
                              np.mean(long_passengers))
        logger.dump_tabular()
Example #23
0
def learn(env,
          sess,
          seed,
          nsteps=5,
          total_timesteps=int(80e4),
          discount=0.5,
          entropy_coeff=0.01,
          lr=7e-4,
          lr_decay=0.99,
          fuzz_factor=0.00001,
          max_grad_norm=0.5,
          log_interval=100):
    env.init()
    action_set = env.getActionSet()
    n_actions = len(action_set)
    state_dim = env.getGameState().size  # Reset environment

    total_returns = []

    # Init actorCritic
    actor_critic = ActorCritic(state_dim, n_actions, nsteps, discount,
                               entropy_coeff, lr, lr_decay, fuzz_factor,
                               total_timesteps, max_grad_norm)
    sim = Simulation(env, actor_critic, nsteps=nsteps, discount=discount)
    sim.start_episode()
    e_cnt = 0
    for nupdate in range(int(total_timesteps / nsteps)):
        if env.game_over():
            # done = True
            total_returns.append(sim.total_return)
            sim.start_episode()
            e_cnt = e_cnt + 1

        # Collect n-step trajectories
        obs, rewards, actions, values, dones, states = sim.run_nsteps()

        # Update train_model
        policy_loss, value_loss, policy_entropy, a_dist = \
            actor_critic.train(obs, actions, rewards, values, dones, states)
        # print('action probs:')
        # print(ap[0], a)

        if nupdate % log_interval == 0 or nupdate == 1:
            # ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", nupdate)
            logger.record_tabular("nepisode", e_cnt)
            # logger.record_tabular("total_timesteps", nupdate * nsteps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular(
                "avg. total return",
                np.mean(total_returns[-(min(len(total_returns), 100)):]))
            # logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()
    return actor_critic
Example #24
0
def learn(env,
          policy_func,
          reward_giver,
          reward_guidance,
          expert_dataset,
          rank,
          pretrained,
          pretrained_weight,
          *,
          g_step,
          d_step,
          entcoeff,
          save_per_iter,
          ckpt_dir,
          log_dir,
          timesteps_per_batch,
          task_name,
          gamma,
          lam,
          algo,
          max_kl,
          cg_iters,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          d_stepsize=1e-4,
          vf_iters=3,
          max_timesteps=0,
          max_episodes=0,
          max_iters=0,
          loss_percent=0.0,
          callback=None):

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    policy = build_policy(env, 'mlp', value_network='copy')

    ob = observation_placeholder(ob_space)
    with tf.variable_scope('pi'):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope('oldpi'):
        oldpi = policy(observ_placeholder=ob)

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables('pi')
    # var_list = [v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")]
    # vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")
    # assert len(var_list) == len(vf_var_list) + 1
    d_adam = MpiAdam(reward_giver.get_trainable_variables())
    guidance_adam = MpiAdam(reward_guidance.get_trainable_variables())

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
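    # Fisher-vector product: differentiate (grad KL . tangent) to get H*v without forming the Hessian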
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(get_variables('oldpi'), get_variables('pi'))
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    guidance_adam.sync()
    vfadam.sync()
    if rank == 0:
        print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     reward_giver,
                                     reward_guidance,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     algo=algo,
                                     loss_percent=loss_percent)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(reward_giver.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # if provide pretrained weight
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        # Save model
        if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            fname = os.path.join(ckpt_dir, task_name)
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            saver = tf.train.Saver()
            saver.save(tf.get_default_session(), fname)

        logger.log("********** Iteration %i ************" % iters_so_far)

        # global flag_render
        # if iters_so_far > 0 and iters_so_far % 10 ==0:
        #     flag_render = True
        # else:
        #     flag_render = False

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()
            print('rewards', seg['rew'])
            add_vtarg_and_adv(seg, gamma, lam)
            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before udpate
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=rank == 0)
                assert np.isfinite(stepdir).all()
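                # scale the conjugate-gradient direction so the quadratic KL model equals max_kl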
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
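                # backtracking line search: halve the step until the surrogate improves and KL stays bounded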
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(),
                         vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=128):
                        if hasattr(pi, "ob_rms"):
                            pi.ob_rms.update(
                                mbob)  # update running mean/std for policy
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

        g_losses = meanlosses
        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, reward_giver.loss_name))
        ob_expert, ac_expert = expert_dataset.get_next_batch(
            batch_size=len(ob))
        batch_size = 128
        d_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        with timed("Discriminator"):
            for (ob_batch, ac_batch) in dataset.iterbatches(
                (ob, ac),
                    include_final_partial_batch=False,
                    batch_size=batch_size):
                ob_expert, ac_expert = expert_dataset.get_next_batch(
                    batch_size=batch_size)
                # update running mean/std for reward_giver
                if hasattr(reward_giver, "obs_rms"):
                    reward_giver.obs_rms.update(
                        np.concatenate((ob_batch, ob_expert), 0))
                *newlosses, g = reward_giver.lossandgrad(ob_batch, ob_expert)
                d_adam.update(allmean(g), d_stepsize)
                d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        # ------------------ Update Guidance ------------
        logger.log("Optimizing Guidance...")

        logger.log(fmt_row(13, reward_guidance.loss_name))
        batch_size = 128
        guidance_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        with timed("Guidance"):
            for ob_batch, ac_batch in dataset.iterbatches(
                (ob, ac),
                    include_final_partial_batch=False,
                    batch_size=batch_size):
                ob_expert, ac_expert = expert_dataset.get_next_batch(
                    batch_size=batch_size)

                idx_condition = process_expert(ob_expert, ac_expert)
                pick_idx = (idx_condition >= loss_percent)
                # pick_idx = idx_condition

                ob_expert_p = ob_expert[pick_idx]
                ac_expert_p = ac_expert[pick_idx]

                ac_batch_p = []
                for each_ob in ob_expert_p:
                    tmp_ac, _, _, _ = pi.step(each_ob, stochastic=True)
                    ac_batch_p.append(tmp_ac)

                # update running mean/std for reward_guidance
                if hasattr(reward_guidance, "obs_rms"):
                    reward_guidance.obs_rms.update(ob_expert_p)
                # reward_guidance.train(expert_s=ob_batch_p, agent_a=ac_batch_p, expert_a=ac_expert_p)
                *newlosses, g = reward_guidance.lossandgrad(
                    ob_expert_p, ac_batch_p, ac_expert_p)
                guidance_adam.update(allmean(g), d_stepsize)
                guidance_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(guidance_losses, axis=0)))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]
                   )  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens) * g_step
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
示例#25
0
def train(rank, global_policy, local_policy, optimizer, env, global_t, args):
    o = env.reset()
    step = 0
    sum_rewards = 0
    max_sum_rewards = 0
    vs = []
    entropies = []
    sum_rewards = 0
    while global_t[0] < args.epoch:
        local_policy.sync(global_policy)
        observations = []
        actions = []
        values = []
        rewards = []
        probs = []
        R = 0
        for i in range(args.local_t_max):
            global_t += 1
            step += 1
            p, v = local_policy(
                Variable(torch.from_numpy(o).float()).unsqueeze(0))
            a = p.multinomial()
            o, r, done, _ = env.step(a.data.squeeze()[0])
            if rank == 0:
                sum_rewards += r
                if args.render:
                    env.render()
            observations.append(o)
            actions.append(a)
            values.append(v)
            rewards.append(r)
            probs.append(p)
            if done:
                o = env.reset()
                if rank == 0:
                    print('----------------------------------')
                    print('total reward of the episode:', sum_rewards)
                    print('----------------------------------')
                    if args.save_mode == 'all':
                        torch.save(
                            local_policy,
                            os.path.join(
                                args.log_dir, args.save_name +
                                "_{}.pkl".format(global_t[0])))
                    elif args.save_mode == 'last':
                        torch.save(
                            local_policy,
                            os.path.join(args.log_dir,
                                         args.save_name + '.pkl'))
                    elif args.save_mode == 'max':
                        if max_sum_rewards < sum_rewards:
                            torch.save(
                                local_policy,
                                os.path.join(args.log_dir,
                                             args.save_name + '.pkl'))
                            max_sum_rewards = sum_rewards
                    step = 0
                break
        else:
            _, v = local_policy(
                Variable(torch.from_numpy(o).unsqueeze(0).float()))
            R += v.data.squeeze()[0]

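        # Discounted returns (gamma = 0.99), bootstrapped from the last value estimate when the rollout did not terminate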
        returns = []
        for r in rewards[::-1]:
            R = r + 0.99 * R
            returns.insert(0, R)
        returns = torch.Tensor(returns)
        #if len(returns) > 1:
        #    returns = (returns-returns.mean()) / (returns.std()+args.eps)
        v_loss = 0
        entropy = 0
        for a, v, p, r in zip(actions, values, probs, returns):
            a.reinforce(r - v.data.squeeze())
            _v_loss = nn.MSELoss()(v, Variable(torch.Tensor([r])))
            v_loss += _v_loss
            entropy += -(p * (p + args.eps).log()).sum()
        v_loss = v_loss * 0.5 * args.v_loss_coeff
        entropy = entropy * args.entropy_beta
        loss = v_loss - entropy
        vs.append(v_loss.data.numpy())
        entropies.append(entropy.data.numpy())
        if rank == 0 and done:
            logger.record_tabular_misc_stat('Entropy', entropies)
            logger.record_tabular_misc_stat('V', vs)
            logger.record_tabular('reward', sum_rewards)
            logger.record_tabular('step', global_t[0])
            logger.dump_tabular()
            del vs[:]
            del entropies[:]
            sum_rewards = 0
        optimizer.zero_grad()
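        # Backward pass: the value/entropy loss plus the REINFORCE gradients attached to the sampled actions (legacy PyTorch .reinforce API)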
        final_node = [loss] + actions
        gradients = [torch.ones(1)] + [None] * len(actions)
        autograd.backward(final_node, gradients)
        new_lr = (args.epoch - global_t[0]) / args.epoch * args.lr
        optimizer.step(new_lr)
示例#26
0
            weights, batch_indxes = np.ones_like(rewards), None
            obses_t, obses_tp1 = tf.constant(obses_t), tf.constant(obses_tp1)
            actions, rewards, dones = tf.constant(
                actions,
                dtype=tf.int64), tf.constant(rewards), tf.constant(dones)
            weights = tf.constant(weights)

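            # One gradient step on this replay batch; weights are all ones, i.e. uniform (non-prioritized) sampling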
            td_errors = agent.train(obses_t, actions, rewards, obses_tp1,
                                    dones, weights)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network periodically.
            agent.update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        # TODO: log once per episode
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
            logger.record_tabular("% time spent exploring",
                                  int(100 * exploration.value(t)))
            logger.dump_tabular()

    plt.figure()
    plt.plot(range(len(duration)), duration)
    plt.figure()
    plt.plot(range(len(episode_rewards)), episode_rewards)
    plt.show()
示例#27
0
def learn(
    env,
    policy_func,
    *,
    timesteps_per_batch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    U.load_state("save/Humanoid-v1")

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
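        # add_vtarg_and_adv fills seg with GAE(lambda) advantages ("adv") and TD(lambda) return targets ("tdlamret")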
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        #if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
        U.save_state("save/Humanoid-v1")
示例#28
0
def retraining(
        save_path,
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=4,  #50
        nb_rollout_steps=3,  #100
        reward_scale=1.0,
        render=False,
        render_eval=False,
        #   noise_type='adaptive-param_0.2',
        noise_type='normal_0.2',
        #   noise_type='ou_0.9',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-4,
        #   actor_lr=1e-6,
        #   critic_lr=1e-5,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=3,  # per epoch cycle and MPI worker,  50
        nb_eval_steps=1,  #100
        batch_size=640,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=3,  #50
        **network_kwargs):

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles *
                                             nb_rollout_steps)
    else:
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()
    # nb_actions = env.action_space.shape[-1]
    nb_actions = env.num_actions

    # nb_actions=3
    # print(nb_actions)
    action_shape = np.array(nb_actions * [0]).shape

    #4 pairs pos + 3 link length
    # nb_features = 2*(env.num_actions+1)+env.num_actions

    #4 pairs pos + 1 pair target pos
    nb_features = 2 * (env.num_actions + 2)
    observation_shape = np.array(nb_features * [0]).shape
    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.

    # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    memory = Memory(limit=int(1e6),
                    action_shape=action_shape,
                    observation_shape=observation_shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    # nb_actions = env.action_space.shape[-1]
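    # Parse the noise_type spec (e.g. 'normal_0.2') into parameter-space or action-space noise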
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
    agent = DDPG(actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    # Prepare everything.
    agent.initialize(sess)
    # sess.graph.finalize()

    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  #scalar
    t = 0  # scalar
    step_set = []
    reward_set = []

    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    mean_epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    #load the initialization policy
    agent.load_ini(sess, save_path)
    # agent.memory.clear(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape)
    for epoch in range(nb_epochs):
        print(nb_epochs)
        # obs, env_state = env.reset()
        obs = env.reset()
        agent.save(save_path)
        epoch_episode_rewards = []
        '''Check that the actor initialization policy has been loaded correctly,
        i.e. that it equals the values output directly from the checkpoint files.'''
        # loaded_weights=tf.get_default_graph().get_tensor_by_name('target_actor/mlp_fc0/w:0')
        # print('loaded_weights:', sess.run(loaded_weights))
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.

            for t_rollout in range(nb_rollout_steps):
                # Predict next action
                action, q, _, _ = agent.step(obs,
                                             apply_noise=True,
                                             compute_Q=True)
                print('action:', action)

                new_obs, r, done = env.step(action)
                # time.sleep(0.2)
                t += 1

                episode_reward += r
                episode_step += 1
                # print('episode_re: ', episode_reward) #[1.]

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                b = 1.
                agent.store_transition(
                    obs, action, r, new_obs, done
                )  #the batched data will be unrolled in memory.py's append.

                obs = new_obs

            epoch_episode_rewards.append(episode_reward)
            episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)
                # print('Train!')
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(
                                eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        mpi_size = MPI.COMM_WORLD.Get_size()
        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(
            epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards))
        # print(step_set,mean_epoch_episode_rewards)
        step_set.append(t)
        plt.plot(step_set,
                 mean_epoch_episode_rewards,
                 color='r',
                 label='Initialization')
        plt.xlabel('Steps')
        plt.ylabel('Mean Episode Reward')
        plt.savefig('ddpg_mean_retrain.png')
        # plt.show()

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(
                eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array(
                [np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'),
                          'wb') as f:
                    pickle.dump(eval_env.get_state(), f)
    print('stepset: ', step_set)
    print('rewards: ', mean_epoch_episode_rewards)

    return agent
示例#29
0
File: main.py  Project: msh0576/RL_WCPS
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          normalize_observations,
          actor_lr,
          critic_lr,
          action_noise,
          gamma,
          nb_train_steps,
          nb_rollout_steps,
          batch_size,
          memory,
          tau=0.01):

    max_action = env.action_space.high
    agent = DDPG(
        memory,
        env.observation_space.shape[0],
        env.action_space.shape[0],
        gamma=gamma,
        tau=tau,
        normalize_observations=normalize_observations,
        batch_size=batch_size,
        action_noise=action_noise,
        actor_lr=actor_lr,
        critic_lr=critic_lr,
    )
    if USE_CUDA:
        agent.cuda()
    # Set up logging stuff only for a single worker.
    step = 0
    episode = 0
    episode_rewards_history = deque(maxlen=100)
    # Prepare everything.

    agent.reset()
    obs = env.reset()
    done = False
    episode_reward = 0.
    episode_step = 0
    episodes = 0
    t = 0

    epoch = 0
    start_time = time.time()

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_start_time = time.time()
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q = agent.pi(
                    obs, apply_noise=True,
                    compute_Q=True)  # select an action from the policy
                assert action.shape == env.action_space.shape

                # Execute next action.
                assert max_action.shape == action.shape
                new_obs, r, done, info = env.step(max_action * action)  # environment step
                t += 1
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs

                if done:
                    # Episode done.
                    epoch_episode_rewards.append(episode_reward)
                    episode_rewards_history.append(episode_reward)
                    epoch_episode_steps.append(episode_step)
                    episode_reward = 0.
                    episode_step = 0
                    epoch_episodes += 1
                    episodes += 1

                    agent.reset()
                    obs = env.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            for t_train in range(nb_train_steps):
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        combined_stats = dict()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])
        logger.dump_tabular()
        logger.info('')
示例#30
0
def eval(data, model, meta_optimizer):

    model.eval()
    criterion = nn.NLLLoss().cuda()
    num_sents = 0
    num_words = 0
    total_nll_autoreg = 0.
    total_nll_vae = 0.
    total_kl_vae = 0.
    total_nll_svi = 0.
    total_kl_svi = 0.
    best_svi_loss = 0.
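    # Accumulate total NLL/KL over the split; per-sentence and per-word normalizations below give the reported losses and PPL bounds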
    for i in range(len(data)):
        sents, length, batch_size = data[i]
        num_words += batch_size * length
        num_sents += batch_size
        if args.gpu >= 0:
            sents = sents.cuda()
        if args.model == 'autoreg':
            preds = model._dec_forward(sents, None, True)
            nll_autoreg = sum([
                criterion(preds[:, l], sents[:, l + 1]) for l in range(length)
            ])
            total_nll_autoreg += nll_autoreg.data[0] * batch_size
        elif args.model == 'svi':
            mean_svi = Variable(
                0.1 * torch.randn(batch_size, args.latent_dim).cuda(),
                requires_grad=True)
            logvar_svi = Variable(
                0.1 * torch.randn(batch_size, args.latent_dim).cuda(),
                requires_grad=True)
            var_params_svi = meta_optimizer.forward([mean_svi, logvar_svi],
                                                    sents)
            mean_svi_final, logvar_svi_final = var_params_svi
            z_samples = model._reparameterize(mean_svi_final.detach(),
                                              logvar_svi_final.detach())
            preds = model._dec_forward(sents, z_samples)
            nll_svi = sum([
                criterion(preds[:, l], sents[:, l + 1]) for l in range(length)
            ])
            total_nll_svi += nll_svi.data[0] * batch_size
            kl_svi = utils.kl_loss_diag(mean_svi_final, logvar_svi_final)
            total_kl_svi += kl_svi.data[0] * batch_size
            mean, logvar = mean_svi_final, logvar_svi_final
        else:
            mean, logvar = model._enc_forward(sents)
            z_samples = model._reparameterize(mean, logvar)
            preds = model._dec_forward(sents, z_samples)
            nll_vae = sum([
                criterion(preds[:, l], sents[:, l + 1]) for l in range(length)
            ])
            total_nll_vae += nll_vae.data[0] * batch_size
            kl_vae = utils.kl_loss_diag(mean, logvar)
            total_kl_vae += kl_vae.data[0] * batch_size
            if args.model == 'savae':
                mean_svi = Variable(mean.data, requires_grad=True)
                logvar_svi = Variable(logvar.data, requires_grad=True)
                var_params_svi = meta_optimizer.forward([mean_svi, logvar_svi],
                                                        sents)
                mean_svi_final, logvar_svi_final = var_params_svi
                z_samples = model._reparameterize(mean_svi_final,
                                                  logvar_svi_final)
                preds = model._dec_forward(sents, z_samples)
                nll_svi = sum([
                    criterion(preds[:, l], sents[:, l + 1])
                    for l in range(length)
                ])
                total_nll_svi += nll_svi.data[0] * batch_size
                kl_svi = utils.kl_loss_diag(mean_svi_final, logvar_svi_final)
                total_kl_svi += kl_svi.data[0] * batch_size
                mean, logvar = mean_svi_final, logvar_svi_final

    nll_autoreg = total_nll_autoreg / num_sents
    ppl_autoreg = np.exp(total_nll_autoreg / num_words)
    nll_vae = (total_nll_vae + total_kl_vae) / num_sents
    rec_vae = total_nll_vae / num_sents
    kl_vae = total_kl_vae / num_sents
    ppl_bound_vae = np.exp((total_nll_vae + total_kl_vae) / num_words)
    nll_svi = (total_nll_svi + total_kl_svi) / num_sents
    rec_svi = total_nll_svi / num_sents
    kl_svi = total_kl_svi / num_sents
    ppl_bound_svi = np.exp((total_nll_svi + total_kl_svi) / num_words)

    logger.record_tabular('AR NLL', nll_autoreg)
    logger.record_tabular('AR PPL', ppl_autoreg)
    logger.record_tabular('VAE NLL', nll_vae)
    logger.record_tabular('VAE REC', rec_vae)
    logger.record_tabular('VAE KL', kl_vae)
    logger.record_tabular('VAE PPL', ppl_bound_vae)
    logger.record_tabular('SVI NLL', nll_svi)
    logger.record_tabular('SVI REC', rec_svi)
    logger.record_tabular('SVI KL', kl_svi)
    logger.record_tabular('SVI PPL', ppl_bound_svi)
    logger.dump_tabular()
    logger.info(
        'AR NLL: %.4f, AR PPL: %.4f, VAE NLL: %.4f, VAE REC: %.4f, VAE KL: %.4f, VAE PPL: %.4f, SVI NLL: %.4f, SVI REC: %.4f, SVI KL: %.4f, SVI PPL: %.4f'
        % (nll_autoreg, ppl_autoreg, nll_vae, rec_vae, kl_vae, ppl_bound_vae,
           nll_svi, rec_svi, kl_svi, ppl_bound_svi))
    model.train()
    if args.model == 'autoreg':
        return ppl_autoreg
    elif args.model == 'vae':
        return ppl_bound_vae
    elif args.model == 'savae' or args.model == 'svi':
        return ppl_bound_svi
示例#31
0
def learn(
        env,
        model_path,
        data_path,
        policy_fn,
        *,
        horizon=150,  # timesteps per actor per update
        rolloutSize=50,
        clip_param=0.2,
        entcoeff=0.02,  # clipping parameter epsilon, entropy coeff
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=32,  # optimization hypers
        gamma=0.99,
        lam=0.95,  # advantage estimation
        max_iters=0,  # time constraint
        adam_epsilon=1e-4,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        retrain=False):

    # Setup losses and policy
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=5)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=5)  # rolling buffer for episode rewards

    p = []  # for saving the rollouts

    if retrain == True:
        print("Retraining the policy from saved path")
        time.sleep(2)
        U.load_state(model_path)
    max_timesteps = int(horizon * rolloutSize * max_iters)

    while True:
        if max_iters and iters_so_far >= max_iters:
            break
        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)
        print("Collecting samples for policy optimization !! ")
        if iters_so_far > 70:
            render = True
        else:
            render = False
        rollouts = sample_trajectory(pi,
                                     env,
                                     horizon=horizon,
                                     rolloutSize=rolloutSize,
                                     stochastic=True,
                                     render=render)
        # Save rollouts
        data = {'rollouts': rollouts}
        p.append(data)
        del data
        data_file_name = data_path + 'rollout_data.pkl'
        pickle.dump(p, open(data_file_name, "wb"))

        add_vtarg_and_adv(rollouts, gamma, lam)

        ob, ac, atarg, tdlamret = rollouts["ob"], rollouts["ac"], rollouts[
            "adv"], rollouts["tdlamret"]
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    deterministic=pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)

        lrlocal = (rollouts["ep_lens"], rollouts["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("Success", rollouts["success"])
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    return pi
示例#32
0
def main(args):
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_data = Dataset(args.train_file)
    val_data = Dataset(args.val_file)
    test_data = Dataset(args.test_file)
    train_sents = train_data.batch_size.sum()
    vocab_size = int(train_data.vocab_size)
    logger.info('Train data: %d batches' % len(train_data))
    logger.info('Val data: %d batches' % len(val_data))
    logger.info('Test data: %d batches' % len(test_data))
    logger.info('Word vocab size: %d' % vocab_size)

    checkpoint_dir = args.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    suffix = "%s_%s.pt" % (args.model, 'cyc')
    checkpoint_path = os.path.join(checkpoint_dir, suffix)

    if args.slurm == 0:
        cuda.set_device(args.gpu)
    if args.train_from == '':
        model = RNNVAE(vocab_size=vocab_size,
                       enc_word_dim=args.enc_word_dim,
                       enc_h_dim=args.enc_h_dim,
                       enc_num_layers=args.enc_num_layers,
                       dec_word_dim=args.dec_word_dim,
                       dec_h_dim=args.dec_h_dim,
                       dec_num_layers=args.dec_num_layers,
                       dec_dropout=args.dec_dropout,
                       latent_dim=args.latent_dim,
                       mode=args.model)
        for param in model.parameters():
            param.data.uniform_(-0.1, 0.1)
    else:
        logger.info('loading model from ' + args.train_from)
        checkpoint = torch.load(args.train_from)
        model = checkpoint['model']

    logger.info("model architecture")
    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    if args.warmup == 0:
        args.beta = 1.
    else:
        args.beta = 0.1

    criterion = nn.NLLLoss()
    model.cuda()
    criterion.cuda()
    model.train()

    def variational_loss(input, sents, model, z=None):
        mean, logvar = input
        z_samples = model._reparameterize(mean, logvar, z)
        preds = model._dec_forward(sents, z_samples)
        nll = sum([
            criterion(preds[:, l], sents[:, l + 1])
            for l in range(preds.size(1))
        ])
        kl = utils.kl_loss_diag(mean, logvar)
        return nll + args.beta * kl

    update_params = list(model.dec.parameters())
    meta_optimizer = OptimN2N(variational_loss,
                              model,
                              update_params,
                              eps=args.eps,
                              lr=[args.svi_lr1, args.svi_lr2],
                              iters=args.svi_steps,
                              momentum=args.momentum,
                              acc_param_grads=args.train_n2n == 1,
                              max_grad_norm=args.svi_max_grad_norm)
    if args.test == 1:
        args.beta = 1
        test_data = Dataset(args.test_file)
        eval(test_data, model, meta_optimizer)
        exit()

    t = 0
    best_val_nll = 1e5
    best_epoch = 0
    val_stats = []
    epoch = 0
    while epoch < args.num_epochs:
        start_time = time.time()
        epoch += 1
        logger.info('Starting epoch %d' % epoch)
        train_nll_vae = 0.
        train_nll_autoreg = 0.
        train_kl_vae = 0.
        train_nll_svi = 0.
        train_kl_svi = 0.
        train_kl_init_final = 0.
        num_sents = 0
        num_words = 0
        b = 0

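        # Cyclical cosine learning-rate schedule with period args.cycle; the KL-annealing beta restarts at the start of each cycle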
        tmp = float((epoch - 1) % args.cycle) / args.cycle
        cur_lr = args.lr * 0.5 * (1 + np.cos(tmp * np.pi))
        for param_group in optimizer.param_groups:
            param_group['lr'] = cur_lr

        if (epoch - 1) % args.cycle == 0:
            args.beta = 0.1
            logger.info('KL annealing restart')

        for i in np.random.permutation(len(train_data)):
            if args.warmup > 0:
                args.beta = min(
                    1, args.beta + 1. / (args.warmup * len(train_data)))

            sents, length, batch_size = train_data[i]
            if args.gpu >= 0:
                sents = sents.cuda()
            b += 1

            optimizer.zero_grad()
            if args.model == 'autoreg':
                preds = model._dec_forward(sents, None, True)
                nll_autoreg = sum([
                    criterion(preds[:, l], sents[:, l + 1])
                    for l in range(length)
                ])
                train_nll_autoreg += nll_autoreg.data[0] * batch_size
                nll_autoreg.backward()
            elif args.model == 'svi':
                mean_svi = Variable(
                    0.1 * torch.zeros(batch_size, args.latent_dim).cuda(),
                    requires_grad=True)
                logvar_svi = Variable(
                    0.1 * torch.zeros(batch_size, args.latent_dim).cuda(),
                    requires_grad=True)
                var_params_svi = meta_optimizer.forward(
                    [mean_svi, logvar_svi], sents, b % args.print_every == 0)
                mean_svi_final, logvar_svi_final = var_params_svi
                z_samples = model._reparameterize(mean_svi_final.detach(),
                                                  logvar_svi_final.detach())
                preds = model._dec_forward(sents, z_samples)
                nll_svi = sum([
                    criterion(preds[:, l], sents[:, l + 1])
                    for l in range(length)
                ])
                train_nll_svi += nll_svi.data[0] * batch_size
                kl_svi = utils.kl_loss_diag(mean_svi_final, logvar_svi_final)
                train_kl_svi += kl_svi.data[0] * batch_size
                var_loss = nll_svi + args.beta * kl_svi
                var_loss.backward(retain_graph=True)
            else:
                mean, logvar = model._enc_forward(sents)
                z_samples = model._reparameterize(mean, logvar)
                preds = model._dec_forward(sents, z_samples)
                nll_vae = sum([
                    criterion(preds[:, l], sents[:, l + 1])
                    for l in range(length)
                ])
                train_nll_vae += nll_vae.data[0] * batch_size
                kl_vae = utils.kl_loss_diag(mean, logvar)
                train_kl_vae += kl_vae.data[0] * batch_size
                if args.model == 'vae':
                    vae_loss = nll_vae + args.beta * kl_vae
                    vae_loss.backward(retain_graph=True)
                if args.model == 'savae':
                    var_params = torch.cat([mean, logvar], 1)
                    mean_svi = Variable(mean.data, requires_grad=True)
                    logvar_svi = Variable(logvar.data, requires_grad=True)
                    var_params_svi = meta_optimizer.forward(
                        [mean_svi, logvar_svi], sents,
                        b % args.print_every == 0)
                    mean_svi_final, logvar_svi_final = var_params_svi
                    z_samples = model._reparameterize(mean_svi_final,
                                                      logvar_svi_final)
                    preds = model._dec_forward(sents, z_samples)
                    nll_svi = sum([
                        criterion(preds[:, l], sents[:, l + 1])
                        for l in range(length)
                    ])
                    train_nll_svi += nll_svi.data[0] * batch_size
                    kl_svi = utils.kl_loss_diag(mean_svi_final,
                                                logvar_svi_final)
                    train_kl_svi += kl_svi.data[0] * batch_size
                    var_loss = nll_svi + args.beta * kl_svi
                    var_loss.backward(retain_graph=True)
                    if args.train_n2n == 0:
                        if args.train_kl == 1:
                            mean_final = mean_svi_final.detach()
                            logvar_final = logvar_svi_final.detach()
                            kl_init_final = utils.kl_loss(
                                mean, logvar, mean_final, logvar_final)
                            train_kl_init_final += kl_init_final.data[
                                0] * batch_size
                            kl_init_final.backward(retain_graph=True)
                        else:
                            vae_loss = nll_vae + args.beta * kl_vae
                            var_param_grads = torch.autograd.grad(
                                vae_loss, [mean, logvar], retain_graph=True)
                            var_param_grads = torch.cat(var_param_grads, 1)
                            var_params.backward(var_param_grads,
                                                retain_graph=True)
                    else:
                        var_param_grads = meta_optimizer.backward(
                            [mean_svi_final.grad, logvar_svi_final.grad],
                            b % args.print_every == 0)
                        var_param_grads = torch.cat(var_param_grads, 1)
                        var_params.backward(var_param_grads)
            if args.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm(model.parameters(),
                                              args.max_grad_norm)
            optimizer.step()
            num_sents += batch_size
            num_words += batch_size * length

            if b % args.print_every == 0:
                param_norm = sum([p.norm()**2
                                  for p in model.parameters()]).data[0]**0.5
                logger.info(
                    'Iters: %d, Epoch: %d, Batch: %d/%d, LR: %.4f, TrainARNLL: %.4f, TrainARPPL: %.2f, TrainVAE_NLL: %.4f, TrainVAE_REC: %.4f, TrainVAE_KL: %.4f, TrainVAE_PPL: %.2f, TrainSVI_NLL: %.2f, TrainSVI_REC: %.2f, TrainSVI_KL: %.4f, TrainSVI_PPL: %.2f, KLInitFinal: %.2f, |Param|: %.4f, BestValPerf: %.2f, BestEpoch: %d, Beta: %.4f, Throughput: %.2f examples/sec'
                    % (t, epoch, b + 1, len(train_data), cur_lr,
                       train_nll_autoreg / num_sents,
                       np.exp(train_nll_autoreg / num_words),
                       (train_nll_vae + train_kl_vae) / num_sents,
                       train_nll_vae / num_sents, train_kl_vae / num_sents,
                       np.exp((train_nll_vae + train_kl_vae) / num_words),
                       (train_nll_svi + train_kl_svi) / num_sents,
                       train_nll_svi / num_sents, train_kl_svi / num_sents,
                       np.exp((train_nll_svi + train_kl_svi) / num_words),
                       train_kl_init_final / num_sents, param_norm,
                       best_val_nll, best_epoch, args.beta, num_sents /
                       (time.time() - start_time)))

        epoch_train_time = time.time() - start_time
        logger.info('Time Elapsed: %.1fs' % epoch_train_time)

        logger.info('--------------------------------')
        logger.info('Checking validation perf...')
        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('Mode', 'Val')
        logger.record_tabular('LR', cur_lr)
        logger.record_tabular('Epoch Train Time', epoch_train_time)
        val_nll = eval(val_data, model, meta_optimizer)
        val_stats.append(val_nll)

        logger.info('--------------------------------')
        logger.info('Checking test perf...')
        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('Mode', 'Test')
        logger.record_tabular('LR', cur_lr)
        logger.record_tabular('Epoch Train Time', epoch_train_time)
        test_nll = eval(test_data, model, meta_optimizer)

        if val_nll < best_val_nll:
            best_val_nll = val_nll
            best_epoch = epoch
            model.cpu()
            checkpoint = {
                'args': args.__dict__,
                'model': model,
                'val_stats': val_stats
            }
            logger.info('Save checkpoint to %s' % checkpoint_path)
            torch.save(checkpoint, checkpoint_path)
            model.cuda()
        else:
            if epoch >= args.min_epochs:
                args.decay = 1