def main(env, scene_path):
    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)
    save_path = os.path.join(args.save_dir, args.algo)

    eval_x = []
    eval_y = []

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    initial_policies = torch.load(os.path.join(args.load_dir, args.algo,
                                               args.initial_policy + ".pt")) \
        if args.initial_policy else None

    if args.reuse_residual:
        residual, ob_rms, initial_policies = initial_policies
    else:
        residual = None
        ob_rms = None

    pose_estimator = torch.load(os.path.join(args.load_dir, "pe",
                                             args.pose_estimator + ".pt")) \
        if args.pose_estimator else None

    envs = make_vec_envs(env,
                         scene_path,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         args.log_dir,
                         device,
                         False,
                         initial_policies,
                         pose_estimator=pose_estimator,
                         init_control=not args.dense_ip)
    if args.reuse_residual:
        vec_norm = get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms

    base_kwargs = {'recurrent': args.recurrent_policy}
    base = residual.base if args.reuse_residual else None
    dist = residual.dist if args.reuse_residual else None
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs=base_kwargs,
                          zero_last_layer=True,
                          base=base,
                          dist=dist)
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm,
                         burn_in=initial_policies is not None
                         and not args.reuse_residual)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=64)

    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    total_num_steps = 0
    j = 0
    max_succ = -1
    max_mean_rew = -math.inf
    mean_ep_rew = -math.inf
    evals_without_improv = 0

    start = time.time()
    start_update = start
    while (not use_metric
           and j < num_updates) or (use_metric
                                    and max_succ < args.trg_succ_rate):
        if args.eval_interval is not None and j % args.eval_interval == 0:
            print("Evaluating current policy...")
            i = 0
            total_successes = 0
            max_trials = 50
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)
            while i + args.num_processes <= max_trials:

                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                obs, _, dones, infos = envs.step(action)

                if np.all(dones):  # Rigid - assumes episodes are fixed length
                    rews = []
                    for info in infos:
                        rews.append(info['rew_success'])
                    i += args.num_processes
                    rew = sum([int(rew > 0) for rew in rews])
                    total_successes += rew

            p_succ = (100 * total_successes / i)
            eval_x += [total_num_steps]
            eval_y += [p_succ]

            end = time.time()
            print(
                f"Evaluation: {total_successes} successful out of {i} episodes - "
                f"{p_succ:.2f}% successful. Eval length: {end - start_update}")
            torch.save([eval_x, eval_y],
                       os.path.join(args.save_as + "_eval.pt"))
            start_update = end

            if p_succ > max_succ:
                max_succ = p_succ
                max_mean_rew = mean_ep_rew
                evals_without_improv = 0
            elif mean_ep_rew > max_mean_rew:
                print("Unimproved success rate, higher reward")
                max_mean_rew = mean_ep_rew
                evals_without_improv = 0
            else:
                evals_without_improv += 1

            if evals_without_improv == 10 or max_succ >= args.trg_succ_rate:
                save_model = actor_critic
                if args.cuda:
                    save_model = copy.deepcopy(actor_critic).cpu()

                save_model = [
                    save_model,
                    getattr(get_vec_normalize(envs), 'ob_rms', None),
                    initial_policies
                ]
                extra = "_final" if evals_without_improv == 5 else ""
                torch.save(
                    save_model,
                    os.path.join(save_path, args.save_as + f"{extra}.pt"))
                break

        # save for every interval-th episode or for the last epoch
        if ((not use_metric and
             (j % args.save_interval == 0 or j == num_updates - 1)) or
            (use_metric
             and evals_without_improv == 0)) and args.save_dir != "":
            os.makedirs(save_path, exist_ok=True)

            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            if pose_estimator is not None:
                save_model = [save_model, pose_estimator, initial_policies]
            else:
                save_model = [
                    save_model,
                    getattr(get_vec_normalize(envs), 'ob_rms', None),
                    initial_policies
                ]

            torch.save(save_model, os.path.join(save_path,
                                                args.save_as + ".pt"))
            # torch.save(save_model, os.path.join(save_path, args.save_as + f"{j * args.num_processes * args.num_steps}.pt"))

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            mean_ep_rew = np.mean(episode_rewards)
            if mean_ep_rew > max_mean_rew:
                print("Improved max mean reward")
                max_mean_rew = mean_ep_rew
                evals_without_improv = 0
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), mean_ep_rew,
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            print("Update length: ", end - start_update)
            start_update = end

        if args.vis and (j % args.vis_interval == 0 or
                         (not use_metric and j == num_updates - 1)):
            try:
                # Sometimes monitor doesn't properly flush the outputs
                visdom_plot(args.log_dir, args.save_as, args.algo,
                            total_num_steps)
            except IOError:
                pass

        j += 1

    if use_metric:
        if max_succ >= args.trg_succ_rate:
            print(
                f"Achieved greater than {args.trg_succ_rate}% success, advancing curriculum."
            )
        else:
            print(
                f"Policy converged with max success rate < {args.trg_succ_rate}%"
            )
    # Copy logs to permanent location so new graphs can be drawn.
    copy_tree(args.log_dir, os.path.join('logs', args.save_as))
    envs.close()
    return total_num_steps
Пример #2
0
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Obser reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_env_steps)
            except IOError:
                pass
Пример #3
0
def onpolicy_main():
    print("onpolicy main")

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    summary_name = args.log_dir + '{0}_{1}'
    writer = SummaryWriter(summary_name.format(args.env_name, args.save_name))

    # Make vector env
    envs = make_vec_envs(
        args.env_name,
        args.seed,
        args.num_processes,
        args.gamma,
        args.log_dir,
        device,
        False,
        env_kwargs=env_kwargs,
    )

    # agly ways to access to the environment attirubutes
    if args.env_name.find('doorenv') > -1:
        if args.num_processes > 1:
            visionnet_input = envs.venv.venv.visionnet_input
            nn = envs.venv.venv.nn
            env_name = envs.venv.venv.xml_path
        else:
            visionnet_input = envs.venv.venv.envs[
                0].env.env.env.visionnet_input
            nn = envs.venv.venv.envs[0].env.env.env.nn
            env_name = envs.venv.venv.envs[0].env.env.env.xml_path
        dummy_obs = np.zeros(nn * 2 + 3)
    else:
        dummy_obs = envs.observation_space
        visionnet_input = None
        nn = None

    if pretrained_policy_load:
        print("loading", pretrained_policy_load)
        actor_critic, ob_rms = torch.load(pretrained_policy_load)
    else:
        actor_critic = Policy(dummy_obs.shape,
                              envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})

    if visionnet_input:
        visionmodel = load_visionmodel(env_name, args.visionmodel_path,
                                       VisionModelXYZ())
        actor_critic.visionmodel = visionmodel.eval()
    actor_critic.nn = nn
    actor_critic.to(device)

    #disable normalizer
    vec_norm = get_vec_normalize(envs)
    vec_norm.eval()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              dummy_obs.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    full_obs = envs.reset()
    initial_state = full_obs[:, :envs.action_space.shape[0]]

    if args.env_name.find('doorenv') > -1 and visionnet_input:
        obs = actor_critic.obs2inputs(full_obs, 0)
    else:
        if knob_noisy:
            obs = add_noise(full_obs, 0)
        else:
            obs = full_obs

    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         args.lr)

        pos_control = False
        total_switches = 0
        prev_selection = ""
        for step in range(args.num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
                next_action = action

            if pos_control:
                frame_skip = 2
                if step % (512 / frame_skip - 1) == 0:
                    current_state = initial_state
                next_action = current_state + next_action
                for kk in range(frame_skip):
                    full_obs, reward, done, infos = envs.step(next_action)

                current_state = full_obs[:, :envs.action_space.shape[0]]
            else:
                full_obs, reward, done, infos = envs.step(next_action)

            # convert img to obs if door_env and using visionnet
            if args.env_name.find('doorenv') > -1 and visionnet_input:
                obs = actor_critic.obs2inputs(full_obs, j)
            else:
                if knob_noisy:
                    obs = add_noise(full_obs, j)
                else:
                    obs = full_obs

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        writer.add_scalar("Value loss", value_loss, j)
        writer.add_scalar("action loss", action_loss, j)
        writer.add_scalar("dist entropy loss", dist_entropy, j)
        writer.add_scalar("Episode rewards", np.mean(episode_rewards), j)

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ],
                       os.path.join(
                           save_path, args.env_name +
                           "_{}.{}.pt".format(args.save_name, j)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

        DR = True  #Domain Randomization
        ################## for multiprocess world change ######################
        if DR:
            print("changing world")

            envs.close_extras()
            envs.close()
            del envs

            envs = make_vec_envs(
                args.env_name,
                args.seed,
                args.num_processes,
                args.gamma,
                args.log_dir,
                device,
                False,
                env_kwargs=env_kwargs,
            )

            full_obs = envs.reset()
            if args.env_name.find('doorenv') > -1 and visionnet_input:
                obs = actor_critic.obs2inputs(full_obs, j)
            else:
                obs = full_obs
Пример #4
0
    def train_model(self):
        episode_rewards = deque(maxlen=10)
        current_episode_rewards = np.zeros(self.shell_args.num_processes)
        episode_lengths = deque(maxlen=10)
        current_episode_lengths = np.zeros(self.shell_args.num_processes)
        current_rewards = np.zeros(self.shell_args.num_processes)

        total_num_steps = self.start_iter
        fps_timer = [time.time(), total_num_steps]
        timers = np.zeros(3)
        egomotion_loss = 0

        video_frames = []
        num_episodes = 0
        # self.evaluate_model()

        obs = self.envs.reset()
        if self.compute_surface_normals:
            obs["surface_normals"] = pt_util.depth_to_surface_normals(
                obs["depth"].to(self.device))
        obs["prev_action_one_hot"] = obs[
            "prev_action_one_hot"][:, ACTION_SPACE].to(torch.float32)
        if self.shell_args.algo == "supervised":
            obs["best_next_action"] = pt_util.from_numpy(
                obs["best_next_action"][:, ACTION_SPACE])
        self.rollouts.copy_obs(obs, 0)
        distances = pt_util.to_numpy_array(obs["goal_geodesic_distance"])
        self.train_stats["start_geodesic_distance"][:] = distances
        previous_visual_features = None
        egomotion_pred = None
        prev_action = None
        prev_action_probs = None
        num_updates = (int(self.shell_args.num_env_steps) //
                       self.shell_args.num_forward_rollout_steps
                       ) // self.shell_args.num_processes

        try:
            for iter_count in range(num_updates):
                if self.shell_args.tensorboard:
                    if iter_count % 500 == 0:
                        print("Logging conv summaries")
                        self.logger.network_conv_summary(
                            self.agent, total_num_steps)
                    elif iter_count % 100 == 0:
                        print("Logging variable summaries")
                        self.logger.network_variable_summary(
                            self.agent, total_num_steps)

                if self.shell_args.use_linear_lr_decay:
                    # decrease learning rate linearly
                    update_linear_schedule(self.optimizer.optimizer,
                                           iter_count, num_updates,
                                           self.shell_args.lr)

                if self.shell_args.algo == "ppo" and self.shell_args.use_linear_clip_decay:
                    self.optimizer.clip_param = self.shell_args.clip_param * (
                        1 - iter_count / float(num_updates))

                if hasattr(self.agent.base, "enable_decoder"):
                    if self.shell_args.record_video:
                        self.agent.base.enable_decoder()
                    else:
                        self.agent.base.disable_decoder()

                for step in range(self.shell_args.num_forward_rollout_steps):
                    with torch.no_grad():
                        start_t = time.time()
                        value, action, action_log_prob, recurrent_hidden_states = self.agent.act(
                            {
                                "images":
                                self.rollouts.obs[step],
                                "target_vector":
                                self.rollouts.additional_observations_dict[
                                    "pointgoal"][step],
                                "prev_action_one_hot":
                                self.rollouts.additional_observations_dict[
                                    "prev_action_one_hot"][step],
                            },
                            self.rollouts.recurrent_hidden_states[step],
                            self.rollouts.masks[step],
                        )
                        action_cpu = pt_util.to_numpy_array(action.squeeze(1))
                        translated_action_space = ACTION_SPACE[action_cpu]
                        if not self.shell_args.end_to_end:
                            self.rollouts.additional_observations_dict[
                                "visual_encoder_features"][
                                    self.rollouts.step].copy_(
                                        self.agent.base.visual_encoder_features
                                    )

                        if self.shell_args.use_motion_loss:
                            if self.shell_args.record_video:
                                if previous_visual_features is not None:
                                    egomotion_pred = self.agent.base.predict_egomotion(
                                        self.agent.base.visual_features,
                                        previous_visual_features)
                            previous_visual_features = self.agent.base.visual_features.detach(
                            )

                        timers[1] += time.time() - start_t

                        if self.shell_args.record_video:
                            # Copy so we don't mess with obs itself
                            draw_obs = OrderedDict()
                            for key, val in obs.items():
                                draw_obs[key] = pt_util.to_numpy_array(
                                    val).copy()
                            best_next_action = draw_obs.pop(
                                "best_next_action", None)

                            if prev_action is not None:
                                draw_obs[
                                    "action_taken"] = pt_util.to_numpy_array(
                                        self.agent.last_dist.probs).copy()
                                draw_obs["action_taken"][:] = 0
                                draw_obs["action_taken"][
                                    np.arange(self.shell_args.num_processes),
                                    prev_action] = 1
                                draw_obs[
                                    "action_taken_name"] = SIM_ACTION_TO_NAME[
                                        ACTION_SPACE_TO_SIM_ACTION[
                                            ACTION_SPACE[
                                                prev_action.squeeze()]]]
                                draw_obs[
                                    "action_prob"] = pt_util.to_numpy_array(
                                        prev_action_probs).copy()
                            else:
                                draw_obs["action_taken"] = None
                                draw_obs[
                                    "action_taken_name"] = SIM_ACTION_TO_NAME[
                                        SimulatorActions.STOP]
                                draw_obs["action_prob"] = None
                            prev_action = action_cpu
                            prev_action_probs = self.agent.last_dist.probs.detach(
                            )
                            if (hasattr(self.agent.base, "decoder_outputs")
                                    and self.agent.base.decoder_outputs
                                    is not None):
                                min_channel = 0
                                for key, num_channels in self.agent.base.decoder_output_info:
                                    outputs = self.agent.base.decoder_outputs[:,
                                                                              min_channel:
                                                                              min_channel
                                                                              +
                                                                              num_channels,
                                                                              ...]
                                    draw_obs["output_" +
                                             key] = pt_util.to_numpy_array(
                                                 outputs).copy()
                                    min_channel += num_channels
                            draw_obs["rewards"] = current_rewards.copy()
                            draw_obs["step"] = current_episode_lengths.copy()
                            draw_obs["method"] = self.shell_args.method_name
                            if best_next_action is not None:
                                draw_obs["best_next_action"] = best_next_action
                            if self.shell_args.use_motion_loss:
                                if egomotion_pred is not None:
                                    draw_obs[
                                        "egomotion_pred"] = pt_util.to_numpy_array(
                                            F.softmax(egomotion_pred,
                                                      dim=1)).copy()
                                else:
                                    draw_obs["egomotion_pred"] = None
                            images, titles, normalize = draw_outputs.obs_to_images(
                                draw_obs)
                            if self.shell_args.algo == "supervised":
                                im_inds = [0, 2, 3, 1, 9, 6, 7, 8, 5, 4]
                            else:
                                im_inds = [0, 2, 3, 1, 6, 7, 8, 5]
                            height, width = images[0].shape[:2]
                            subplot_image = drawing.subplot(
                                images,
                                2,
                                5,
                                titles=titles,
                                normalize=normalize,
                                order=im_inds,
                                output_width=max(width, 320),
                                output_height=max(height, 320),
                            )
                            video_frames.append(subplot_image)

                        # save dists from previous step or else on reset they will be overwritten
                        distances = pt_util.to_numpy_array(
                            obs["goal_geodesic_distance"])

                        start_t = time.time()
                        obs, rewards, dones, infos = self.envs.step(
                            translated_action_space)
                        timers[0] += time.time() - start_t
                        obs["reward"] = rewards
                        if self.shell_args.algo == "supervised":
                            obs["best_next_action"] = pt_util.from_numpy(
                                obs["best_next_action"][:, ACTION_SPACE]).to(
                                    torch.float32)
                        obs["prev_action_one_hot"] = obs[
                            "prev_action_one_hot"][:, ACTION_SPACE].to(
                                torch.float32)
                        rewards *= REWARD_SCALAR
                        rewards = np.clip(rewards, -10, 10)

                        if self.shell_args.record_video and not dones[0]:
                            obs["top_down_map"] = infos[0]["top_down_map"]

                        if self.compute_surface_normals:
                            obs["surface_normals"] = pt_util.depth_to_surface_normals(
                                obs["depth"].to(self.device))

                        current_rewards = pt_util.to_numpy_array(rewards)
                        current_episode_rewards += pt_util.to_numpy_array(
                            rewards).squeeze()
                        current_episode_lengths += 1
                        for ii, done_e in enumerate(dones):
                            if done_e:
                                num_episodes += 1
                                if self.shell_args.record_video:
                                    final_rgb = draw_obs["rgb"].transpose(
                                        0, 2, 3, 1).squeeze(0)
                                    if self.shell_args.task == "pointnav":
                                        if infos[ii]["spl"] > 0:
                                            draw_obs[
                                                "action_taken_name"] = "Stop. Success"
                                            draw_obs["reward"] = [
                                                self.configs[0].TASK.
                                                SUCCESS_REWARD
                                            ]
                                            final_rgb[:] = final_rgb * np.float32(
                                                0.5) + np.tile(
                                                    np.array([0, 128, 0],
                                                             dtype=np.uint8),
                                                    (final_rgb.shape[0],
                                                     final_rgb.shape[1], 1),
                                                )
                                        else:
                                            draw_obs[
                                                "action_taken_name"] = "Timeout. Failed"
                                            final_rgb[:] = final_rgb * np.float32(
                                                0.5) + np.tile(
                                                    np.array([128, 0, 0],
                                                             dtype=np.uint8),
                                                    (final_rgb.shape[0],
                                                     final_rgb.shape[1], 1),
                                                )
                                    elif self.shell_args.task == "exploration" or self.shell_args.task == "flee":
                                        draw_obs[
                                            "action_taken_name"] = "End of episode."
                                    final_rgb = final_rgb[np.newaxis,
                                                          ...].transpose(
                                                              0, 3, 1, 2)
                                    draw_obs["rgb"] = final_rgb

                                    images, titles, normalize = draw_outputs.obs_to_images(
                                        draw_obs)
                                    im_inds = [0, 2, 3, 1, 6, 7, 8, 5]
                                    height, width = images[0].shape[:2]
                                    subplot_image = drawing.subplot(
                                        images,
                                        2,
                                        5,
                                        titles=titles,
                                        normalize=normalize,
                                        order=im_inds,
                                        output_width=max(width, 320),
                                        output_height=max(height, 320),
                                    )
                                    video_frames.extend(
                                        [subplot_image] *
                                        (self.configs[0].ENVIRONMENT.
                                         MAX_EPISODE_STEPS + 30 -
                                         len(video_frames)))

                                    if "top_down_map" in infos[0]:
                                        video_dir = os.path.join(
                                            self.shell_args.log_prefix,
                                            "videos")
                                        if not os.path.exists(video_dir):
                                            os.makedirs(video_dir)
                                        im_path = os.path.join(
                                            self.shell_args.log_prefix,
                                            "videos", "total_steps_%d.png" %
                                            total_num_steps)
                                        from habitat.utils.visualizations import maps
                                        import imageio

                                        top_down_map = maps.colorize_topdown_map(
                                            infos[0]["top_down_map"]["map"])
                                        imageio.imsave(im_path, top_down_map)

                                    images_to_video(
                                        video_frames,
                                        os.path.join(
                                            self.shell_args.log_prefix,
                                            "videos"),
                                        "total_steps_%d" % total_num_steps,
                                    )
                                    video_frames = []

                                if self.shell_args.task == "pointnav":
                                    print(
                                        "FINISHED EPISODE %d Length %d Reward %.3f SPL %.4f"
                                        % (
                                            num_episodes,
                                            current_episode_lengths[ii],
                                            current_episode_rewards[ii],
                                            infos[ii]["spl"],
                                        ))
                                    self.train_stats["spl"][ii] = infos[ii][
                                        "spl"]
                                    self.train_stats["success"][
                                        ii] = self.train_stats["spl"][ii] > 0
                                    self.train_stats["end_geodesic_distance"][
                                        ii] = (distances[ii] - self.configs[0].
                                               SIMULATOR.FORWARD_STEP_SIZE)
                                    self.train_stats[
                                        "delta_geodesic_distance"][ii] = (
                                            self.train_stats[
                                                "start_geodesic_distance"][ii]
                                            - self.train_stats[
                                                "end_geodesic_distance"][ii])
                                    self.train_stats["num_steps"][
                                        ii] = current_episode_lengths[ii]
                                elif self.shell_args.task == "exploration":
                                    print(
                                        "FINISHED EPISODE %d Reward %.3f States Visited %d"
                                        % (num_episodes,
                                           current_episode_rewards[ii],
                                           infos[ii]["visited_states"]))
                                    self.train_stats["visited_states"][
                                        ii] = infos[ii]["visited_states"]
                                elif self.shell_args.task == "flee":
                                    print(
                                        "FINISHED EPISODE %d Reward %.3f Distance from start %.4f"
                                        % (num_episodes,
                                           current_episode_rewards[ii],
                                           infos[ii]["distance_from_start"]))
                                    self.train_stats["distance_from_start"][
                                        ii] = infos[ii]["distance_from_start"]

                                self.train_stats["num_episodes"][ii] += 1
                                self.train_stats["reward"][
                                    ii] = current_episode_rewards[ii]

                                if self.shell_args.tensorboard:
                                    log_dict = {
                                        "single_episode/reward":
                                        self.train_stats["reward"][ii]
                                    }
                                    if self.shell_args.task == "pointnav":
                                        log_dict.update({
                                            "single_episode/num_steps":
                                            self.train_stats["num_steps"][ii],
                                            "single_episode/spl":
                                            self.train_stats["spl"][ii],
                                            "single_episode/success":
                                            self.train_stats["success"][ii],
                                            "single_episode/start_geodesic_distance":
                                            self.train_stats[
                                                "start_geodesic_distance"][ii],
                                            "single_episode/end_geodesic_distance":
                                            self.train_stats[
                                                "end_geodesic_distance"][ii],
                                            "single_episode/delta_geodesic_distance":
                                            self.train_stats[
                                                "delta_geodesic_distance"][ii],
                                        })
                                    elif self.shell_args.task == "exploration":
                                        log_dict[
                                            "single_episode/visited_states"] = self.train_stats[
                                                "visited_states"][ii]
                                    elif self.shell_args.task == "flee":
                                        log_dict[
                                            "single_episode/distance_from_start"] = self.train_stats[
                                                "distance_from_start"][ii]
                                    self.logger.dict_log(
                                        log_dict,
                                        step=(total_num_steps +
                                              self.shell_args.num_processes *
                                              step + ii))

                                episode_rewards.append(
                                    current_episode_rewards[ii])
                                current_episode_rewards[ii] = 0
                                episode_lengths.append(
                                    current_episode_lengths[ii])
                                current_episode_lengths[ii] = 0
                                self.train_stats["start_geodesic_distance"][
                                    ii] = obs["goal_geodesic_distance"][ii]

                        # If done then clean the history of observations.
                        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                   for done_ in dones])
                        bad_masks = torch.FloatTensor(
                            [[0.0]
                             if "bad_transition" in info.keys() else [1.0]
                             for info in infos])

                        self.rollouts.insert(obs, recurrent_hidden_states,
                                             action, action_log_prob, value,
                                             rewards, masks, bad_masks)

                with torch.no_grad():
                    start_t = time.time()
                    next_value = self.agent.get_value(
                        {
                            "images":
                            self.rollouts.obs[-1],
                            "target_vector":
                            self.rollouts.
                            additional_observations_dict["pointgoal"][-1],
                            "prev_action_one_hot":
                            self.rollouts.additional_observations_dict[
                                "prev_action_one_hot"][-1],
                        },
                        self.rollouts.recurrent_hidden_states[-1],
                        self.rollouts.masks[-1],
                    ).detach()
                    timers[1] += time.time() - start_t

                self.rollouts.compute_returns(next_value,
                                              self.shell_args.use_gae,
                                              self.shell_args.gamma,
                                              self.shell_args.tau)

                if not self.shell_args.no_weight_update:
                    start_t = time.time()
                    if self.shell_args.algo == "supervised":
                        (
                            total_loss,
                            action_loss,
                            visual_loss_total,
                            visual_loss_dict,
                            egomotion_loss,
                            forward_model_loss,
                        ) = self.optimizer.update(self.rollouts,
                                                  self.shell_args)
                    else:
                        (
                            total_loss,
                            value_loss,
                            action_loss,
                            dist_entropy,
                            visual_loss_total,
                            visual_loss_dict,
                            egomotion_loss,
                            forward_model_loss,
                        ) = self.optimizer.update(self.rollouts,
                                                  self.shell_args)

                    timers[2] += time.time() - start_t

                self.rollouts.after_update()

                # save for every interval-th episode or for the last epoch
                if iter_count % self.shell_args.save_interval == 0 or iter_count == num_updates - 1:
                    self.save_checkpoint(5, total_num_steps)

                total_num_steps += self.shell_args.num_processes * self.shell_args.num_forward_rollout_steps

                if not self.shell_args.no_weight_update and iter_count % self.shell_args.log_interval == 0:
                    log_dict = {}
                    if len(episode_rewards) > 1:
                        end = time.time()
                        nsteps = total_num_steps - fps_timer[1]
                        fps = int((total_num_steps - fps_timer[1]) /
                                  (end - fps_timer[0]))
                        timers /= nsteps
                        env_spf = timers[0]
                        forward_spf = timers[1]
                        backward_spf = timers[2]
                        print((
                            "{} Updates {}, num timesteps {}, FPS {}, Env FPS "
                            "{}, \n Last {} training episodes: mean/median reward "
                            "{:.3f}/{:.3f}, min/max reward {:.3f}/{:.3f}\n"
                        ).format(
                            datetime.datetime.now(),
                            iter_count,
                            total_num_steps,
                            fps,
                            int(1.0 / env_spf),
                            len(episode_rewards),
                            np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards),
                            np.max(episode_rewards),
                        ))

                        if self.shell_args.tensorboard:
                            log_dict.update({
                                "stats/full_spf":
                                1.0 / (fps + 1e-10),
                                "stats/env_spf":
                                env_spf,
                                "stats/forward_spf":
                                forward_spf,
                                "stats/backward_spf":
                                backward_spf,
                                "stats/full_fps":
                                fps,
                                "stats/env_fps":
                                1.0 / (env_spf + 1e-10),
                                "stats/forward_fps":
                                1.0 / (forward_spf + 1e-10),
                                "stats/backward_fps":
                                1.0 / (backward_spf + 1e-10),
                                "episode/mean_rewards":
                                np.mean(episode_rewards),
                                "episode/median_rewards":
                                np.median(episode_rewards),
                                "episode/min_rewards":
                                np.min(episode_rewards),
                                "episode/max_rewards":
                                np.max(episode_rewards),
                                "episode/mean_lengths":
                                np.mean(episode_lengths),
                                "episode/median_lengths":
                                np.median(episode_lengths),
                                "episode/min_lengths":
                                np.min(episode_lengths),
                                "episode/max_lengths":
                                np.max(episode_lengths),
                            })
                        fps_timer[0] = time.time()
                        fps_timer[1] = total_num_steps
                        timers[:] = 0
                    if self.shell_args.tensorboard:
                        log_dict.update({
                            "loss/action":
                            action_loss,
                            "loss/0_total":
                            total_loss,
                            "loss/visual/0_total":
                            visual_loss_total,
                            "loss/exploration/egomotion":
                            egomotion_loss,
                            "loss/exploration/forward_model":
                            forward_model_loss,
                        })
                        if self.shell_args.algo != "supervised":
                            log_dict.update({
                                "loss/entropy": dist_entropy,
                                "loss/value": value_loss
                            })
                        for key, val in visual_loss_dict.items():
                            log_dict["loss/visual/" + key] = val
                        self.logger.dict_log(log_dict, step=total_num_steps)

                if self.shell_args.eval_interval is not None and total_num_steps % self.shell_args.eval_interval < (
                        self.shell_args.num_processes *
                        self.shell_args.num_forward_rollout_steps):
                    self.save_checkpoint(-1, total_num_steps)
                    self.set_log_iter(total_num_steps)
                    self.evaluate_model()
                    # reset the env datasets
                    self.envs.unwrapped.call(
                        ["switch_dataset"] * self.shell_args.num_processes,
                        [("train", )] * self.shell_args.num_processes)
                    obs = self.envs.reset()
                    if self.compute_surface_normals:
                        obs["surface_normals"] = pt_util.depth_to_surface_normals(
                            obs["depth"].to(self.device))
                    obs["prev_action_one_hot"] = obs[
                        "prev_action_one_hot"][:,
                                               ACTION_SPACE].to(torch.float32)
                    if self.shell_args.algo == "supervised":
                        obs["best_next_action"] = pt_util.from_numpy(
                            obs["best_next_action"][:, ACTION_SPACE])
                    self.rollouts.copy_obs(obs, 0)
                    distances = pt_util.to_numpy_array(
                        obs["goal_geodesic_distance"])
                    self.train_stats["start_geodesic_distance"][:] = distances
                    previous_visual_features = None
                    egomotion_pred = None
                    prev_action = None
                    prev_action_probs = None
        except:
            # Catch all exceptions so a final save can be performed
            import traceback

            traceback.print_exc()
        finally:
            self.save_checkpoint(-1, total_num_steps)
Пример #5
0
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         True)

    frame_skip = 4  # frame skip
    if args.tb_dir[-1] != '/':
        args.tb_dir = args.tb_dir + '/'
    logger = Logger(args.tb_dir)
    logger.write_settings(args)
    if args.use_tdm:

        # beta scheduler
        if args.beta_schedule == 'const':
            beta_func = lambda x: float(args.beta_int)
        elif args.beta_schedule == 'sqrt':
            beta_func = lambda x: 1. / np.sqrt(x + 2)
        elif args.beta_schedule == 'log':
            beta_func = lambda x: 1. / np.log(x + 2)
        elif args.beta_schedule == 'linear':
            beta_func = lambda x: 1. / (x + 2)

        # bonus function variations
        if args.bonus_func == 'linear':
            bonus_func = lambda x: x + 1
        elif args.bonus_func == 'square':
            bonus_func = lambda x: (x + 1)**2
        elif args.bonus_func == 'sqrt':
            bonus_func = lambda x: (x + 1)**(1 / 2)
        elif args.bonus_func == 'log':
            bonus_func = lambda x: np.log(x + 1)

        # temporal difference module
        tdm = TemporalDifferenceModule(
            inputSize=2 * int(envs.observation_space.shape[0]),
            outputSize=args.time_intervals,
            num_fc_layers=int(args.num_layers),
            depth_fc_layers=int(args.fc_width),
            lr=float(args.opt_lr),
            buffer_max_length=args.buffer_max_length,
            buffer_RL_ratio=args.buffer_RL_ratio,
            frame_skip=frame_skip,
            tdm_epoch=args.tdm_epoch,
            tdm_batchsize=args.tdm_batchsize,
            logger=logger,
            bonus_func=bonus_func).to(device)

        #collect random trajectories
        sample_collector = CollectSamples(envs,
                                          args.num_processes,
                                          initial=True)
        tdm.buffer_rand = sample_collector.collect_trajectories(
            args.num_rollouts, args.steps_per_rollout)

        # initial training
        tdm.update()

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    episode_rewards = deque(maxlen=10)
    start = time.time()
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        # acting
        for step in range(args.num_steps):

            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Obser reward and next obs
            # envs.render()

            obs_old = obs.clone()
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            #compute intrinsic bonus
            if args.use_tdm:
                tdm.symm_eval = True if step == args.num_steps - 1 else False
                reward_int = tdm.compute_bonus(obs_old, obs).float()
                reward += beta_func(
                    step + j * args.num_steps) * reward_int.cpu().unsqueeze(1)

                if (j % args.log_interval == 0) and (step
                                                     == args.num_steps - 1):
                    logger.add_reward_intrinsic(reward_int,
                                                (j + 1) * args.num_steps *
                                                args.num_processes)

            #saving to buffer.
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        # saving to buffer and periodic updating parameters
        if (args.use_tdm):
            tdm.buffer_RL_temp.append((rollouts.obs, rollouts.masks))
            if (j % args.num_steps == 0 and j > 0):
                tdm.update()

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        # no
        # save every 1-million steps
        if (((j + 1) * args.num_steps * args.num_processes) % 1e6 == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            if j == num_updates - 1:
                save_here = os.path.join(
                    save_path, args.env_name + "_step_{}M.pt".format(
                        (j + 1) * args.num_steps * args.num_processes // 1e6))
            else:
                save_here = os.path.join(save_path,
                                         args.env_name + "_final.pt")
            torch.save(save_model, save_here)  # saved policy.

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        # printing outputs
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            logger.add_reward(episode_rewards,
                              (j + 1) * args.num_steps * args.num_processes)

        #
        # if j % args.tb_interval == 0:
        #     # mean/std or median/1stqt?
        #     logger.add_tdm_loss(loss, self.epoch_count*i)

        # evaluation process
        # if (args.eval_interval is not None
        #         and len(episode_rewards) > 1
        #         and j % args.eval_interval == 0):
        #     eval_envs = make_vec_envs(
        #         args.env_name, args.seed + args.num_processes, args.num_processes,
        #         args.gamma, eval_log_dir, args.add_timestep, device, True)
        #
        #     vec_norm = get_vec_normalize(eval_envs)
        #     if vec_norm is not None:
        #         vec_norm.eval()
        #         vec_norm.ob_rms = get_vec_normalize(envs).ob_rms
        #
        #     eval_episode_rewards = []
        #
        #     obs = eval_envs.reset()
        #     eval_recurrent_hidden_states = torch.zeros(args.num_processes,
        #                     actor_critic.recurrent_hidden_state_size, device=device)
        #     eval_masks = torch.zeros(args.num_processes, 1, device=device)
        #
        #     while len(eval_episode_rewards) < 10:
        #         with torch.no_grad():
        #             _, action, _, eval_recurrent_hidden_states = actor_critic.act(
        #                 obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)
        #
        #         # Obser reward and next obs
        #         # envs.render()
        #         obs, reward, done, infos = eval_envs.step(action)
        #
        #         eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
        #                                         for done_ in done])
        #         for info in infos:
        #             if 'episode' in info.keys():
        #                 eval_episode_rewards.append(info['episode']['r'])
        #
        #     eval_envs.close()
        #
        #     print(" Evaluation using {} episodes: mean reward {:.5f}\n".
        #         format(len(eval_episode_rewards),
        #                np.mean(eval_episode_rewards)))

        # # plotting
        # if args.vis and j % args.vis_interval == 0:
        #     try:
        #         # Sometimes monitor doesn't properly flush the outputs
        #         win = visdom_plot(viz, win, args.log_dir, args.env_name,
        #                           args.algo, args.num_env_steps)
        #     except IOError:
        #         pass
    #if done save:::::::::::
    logger.save()
Пример #6
0
    avg_win_rate = deque(maxlen=10)
    avg_cubes_placed_total = deque(maxlen=10)
    avg_player_dist_to_ref = deque(maxlen=10)
    avg_opponent_dist_to_ref = deque(maxlen=10)
    opponnet_policies = deque(maxlen=10)

    cached_stats = None

start = time.time()
num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes
for j in range(num_updates):

    if args.use_linear_lr_decay:
        # decrease learning rate linearly
        utils.update_linear_schedule(
            agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr
        )

    for step in range(args.num_steps):
        # Sample actions
        with torch.no_grad():
            value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                rollouts.obs[step],
                rollouts.recurrent_hidden_states[step],
                rollouts.masks[step],
                deterministic=args.det,
            )

        # Observe reward and next obs
        obs, reward, done, infos = envs.step(action)
        for idx, info in enumerate(infos):
Пример #7
0
def main():
    if not os.path.exists("./plots"):
        os.makedirs("./plots")

    gbench = read_gbench('./data/gbench.txt')

    args = my_get_args()
    print(args)

    config = dict(sigma=args.sim_sigma,
                  momentum=args.sim_momentum,
                  pump_bins=args.sim_bins,
                  lag=1000 // args.num_steps,
                  rshift=args.sim_rshift,
                  pump_scale=args.sim_scale,
                  reward_kind=args.sim_reward,
                  continuous=args.sim_continuous,
                  span=args.sim_span,
                  percentile=args.sim_percentile,
                  last_runs=args.sim_perc_len,
                  add_linear=not args.sim_no_linear,
                  start_pump=args.sim_start,
                  static_features=not args.sim_no_static,
                  extra_features=not args.sim_no_extra,
                  curiosity_num=args.curiosity)

    base_kwargs = {
        'hidden_size': args.hidden_size,
        'film_size': 800 * (not args.sim_no_static)
    }
    if args.relu:
        base_kwargs['activation'] = 'relu'
    base = FILMBase  #FILMBase

    if args.gset > 0:
        test_graphs = [args.gset]
    else:
        test_graphs = [1, 2, 3, 4, 5]

    #---------------------------------------------------------

    assert args.algo in ['a2c', 'ppo', 'acktr']
    if args.recurrent_policy:
        assert args.algo in ['a2c', 'ppo'
                             ], 'Recurrent policy is not implemented for ACKTR'

    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print('Num updates: ', num_updates)

    if args.dry_run:
        return

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    logdata = defaultdict(list)

    if args.gset > 0:
        envs = []
        for g in test_graphs:
            g_ = read_gset('./data/G{}.txt'.format(g), negate=True)
            s = SIMCIM(g_,
                       device=device,
                       batch_size=args.num_processes,
                       **config)
            s.runpump()
            envs.append(s)
        envs = SIMCollection(envs, [gbench[g] for g in test_graphs])
        logdata['bls_bench'] = [gbench[g] for g in test_graphs]
    else:
        envs = SIMGeneratorRandom(800,
                                  0.06,
                                  args.num_processes,
                                  config,
                                  keep=args.sim_keep,
                                  n_sims=args.sim_nsim,
                                  device=device)

    if args.snapshot is None:
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              base=base,
                              base_kwargs=base_kwargs)
    else:
        actor_critic, _ = torch.load(
            os.path.join(args.save_dir, args.algo, args.snapshot + ".pt"))

    actor_critic.to(device)
    print(actor_critic)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()

    print(rollouts.obs.shape, obs.shape)

    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    eval_envs = []
    for g in test_graphs:
        g_ = read_gset('./data/G{}.txt'.format(g), negate=True)
        s = SIMCIM(g_,
                   device=device,
                   batch_size=args.num_val_processes,
                   **config)
        s.runpump()
        eval_envs.append(s)
    eval_envs = SIMCollection(eval_envs, [gbench[g] for g in test_graphs])
    ref_cuts = [s.lastcuts for s in eval_envs.envs]
    logdata['ref_cuts'] = [e.tolist() for e in ref_cuts]

    stoch_cuts = None

    start = time.time()
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        # ROLLOUT DATA
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)

            if 'episode' in infos[0].keys():
                rw = np.mean([e['episode']['r'] for e in infos])
                logdata['episode_rewards'].append(rw.item())
                if args.gset > 0:
                    cuts = [e.lastcuts for e in envs.envs]
                    logdata['train_median'].append(
                        [np.median(e).item() for e in cuts])
                    logdata['train_max'].append(
                        [np.max(e).item() for e in cuts])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        #UPDATE AGENT
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, _ = agent.update(rollouts)
        logdata['alosses'].append(action_loss)
        logdata['vlosses'].append(value_loss)

        logdata['train_percentiles'].append(envs.perc.tolist())

        rollouts.after_update()

        #CHECKPOINTS
        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(
                save_model,
                os.path.join(save_path, args.env_name + '-' + str(j) + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        #LOGGING
        if j % args.log_interval == 0 and len(logdata['episode_rewards']) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: \
                mean/median reward {:.3f}/{:.3f}, min/max reward {:.3f}/{:.3f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(logdata['episode_rewards']),
                        np.mean(logdata['episode_rewards'][-10:]),
                        np.median(logdata['episode_rewards'][-10:]),
                        np.min(logdata['episode_rewards'][-10:]),
                        np.max(logdata['episode_rewards'][-10:])))

        #EVALUATION
        if (args.eval_interval is not None and j % args.eval_interval == 0):
            logdata['spumps'] = []

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_val_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_val_processes, 1, device=device)

            eval_done = False

            while not eval_done:
                p = eval_envs.envs[0].old_p
                logdata['spumps'].append(p[:10].cpu().numpy().tolist())

                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=False)

                # Obser reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_done = np.all(done)

                eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                           for done_ in done],
                                          dtype=torch.float32,
                                          device=device)

            stoch_cuts = [e.lastcuts for e in eval_envs.envs]
            logdata['stoch_cuts'] = [e.tolist() for e in stoch_cuts]
            logdata['eval_median'].append(
                [np.median(e).item() for e in stoch_cuts])
            logdata['eval_max'].append([np.max(e).item() for e in stoch_cuts])

            logdata['test_percentiles'].append(eval_envs.perc.tolist())

            rw = np.mean([e['episode']['r'] for e in infos])
            logdata['eval_episode_rewards'].append(rw.item())

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(logdata['eval_episode_rewards']),
                np.mean(logdata['eval_episode_rewards'])))

        if j % args.log_interval == 0:
            fn = os.path.join(save_path, args.env_name + ".res")
            with open(fn, 'w') as f:
                json.dump(logdata, f, sort_keys=True, indent=2)

        #VISUALIZATION
        if j % args.vis_interval == 0:
            #if False:
            plt.figure(figsize=(15, 10))

            plt.subplot(231)
            plt.title('Rewards')
            plt.xlabel('SIM runs')
            plt.plot(logdata['episode_rewards'], c='r', label='mean train')
            plt.plot(np.linspace(0, len(logdata['episode_rewards']),
                                 len(logdata['eval_episode_rewards'])),
                     logdata['eval_episode_rewards'],
                     'b',
                     label='mean eval')
            plt.legend()

            plt.subplot(232)
            plt.plot(logdata['alosses'])
            plt.title('Policy loss')

            plt.subplot(233)
            plt.plot(logdata['vlosses'])
            plt.title('Value loss')

            plt.subplot(234)
            plt.title('Pumps')
            plt.xlabel('SIM iterations / 10')
            plt.plot(np.array(logdata['spumps']))
            plt.ylim(-0.05, 1.1)

            plt.subplot(235)
            plt.plot(logdata['train_percentiles'])
            plt.title('Train average percentile')

            plt.subplot(236)
            plt.title('Test percentiles')
            plt.plot(logdata['test_percentiles'])
            plt.legend([str(e) for e in test_graphs])

            plt.tight_layout()
            plt.savefig('./plots/agent_' + args.env_name + '.pdf')
            plt.clf()
            plt.close()
            gc.collect()
            #plt.show()

            if stoch_cuts is not None:
                fig, axs = plt.subplots(len(ref_cuts),
                                        1,
                                        sharex=False,
                                        tight_layout=True)
                if len(ref_cuts) == 1:
                    axs = [axs]
                for gi in range(len(ref_cuts)):
                    mn = min(ref_cuts[gi])
                    axs[gi].hist(ref_cuts[gi], bins=100, alpha=0.7)
                    dc = stoch_cuts[gi][stoch_cuts[gi] >= mn]
                    if dc.size > 0:
                        axs[gi].hist(dc, bins=100, alpha=0.7)
                plt.savefig('./plots/cuts_' + args.env_name + '.pdf')
                plt.clf()
                plt.close()
                gc.collect()
Пример #8
0
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:" + str(args.cuda_id) if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        expert_dataset = gail.ExpertDataset(file_name,
                                            num_trajectories=4,
                                            subsample_frequency=20)
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    ########## file related
    filename = args.env_name + "_" + args.algo + "_n" + str(args.max_episodes)
    if args.attack:
        filename += "_" + args.type + "_" + args.aim
        filename += "_s" + str(args.stepsize) + "_m" + str(
            args.maxiter) + "_r" + str(args.radius) + "_f" + str(args.frac)
    if args.run >= 0:
        filename += "_run" + str(args.run)

    logger = get_log(args.logdir + filename + "_" + current_time)
    logger.info(args)

    rew_file = open(args.resdir + filename + ".txt", "w")

    if args.compute:
        radius_file = open(
            args.resdir + filename + "_radius" + "_s" + str(args.stepsize) +
            "_m" + str(args.maxiter) + "_th" + str(args.dist_thres) + ".txt",
            "w")
    if args.type == "targ" or args.type == "fgsm":
        targ_file = open(args.resdir + filename + "_targ.txt", "w")

    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    if args.type == "wb":
        attack_net = WbAttacker(agent,
                                envs,
                                int(args.frac * num_updates),
                                num_updates,
                                args,
                                device=device)
    if args.type == "bb":
        attack_net = BbAttacker(agent,
                                envs,
                                int(args.frac * num_updates),
                                num_updates,
                                args,
                                device=device)
    elif args.type == "rand":
        attack_net = RandAttacker(envs,
                                  radius=args.radius,
                                  frac=args.frac,
                                  maxat=int(args.frac * num_updates),
                                  device=device)
    elif args.type == "semirand":
        attack_net = WbAttacker(agent,
                                envs,
                                int(args.frac * num_updates),
                                num_updates,
                                args,
                                device,
                                rand_select=True)
    elif args.type == "targ":
        if isinstance(envs.action_space, Discrete):
            action_dim = envs.action_space.n
            target_policy = action_dim - 1
        elif isinstance(envs.action_space, Box):
            action_dim = envs.action_space.shape[0]
            target_policy = torch.zeros(action_dim)
#            target_policy[-1] = 1
        print("target policy is", target_policy)
        attack_net = TargAttacker(agent,
                                  envs,
                                  int(args.frac * num_updates),
                                  num_updates,
                                  target_policy,
                                  args,
                                  device=device)
    elif args.type == "fgsm":
        if isinstance(envs.action_space, Discrete):
            action_dim = envs.action_space.n
            target_policy = action_dim - 1
        elif isinstance(envs.action_space, Box):
            action_dim = envs.action_space.shape[0]
            target_policy = torch.zeros(action_dim)

        def targ_policy(obs):
            return target_policy

        attack_net = FGSMAttacker(envs,
                                  agent,
                                  targ_policy,
                                  radius=args.radius,
                                  frac=args.frac,
                                  maxat=int(args.frac * num_updates),
                                  device=device)
#    if args.aim == "obs" or aim == "hybrid":
#        obs_space = gym.make(args.env_name).observation_space
#        attack_net.set_obs_range(obs_space.low, obs_space.high)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    episode = 0

    start = time.time()

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            if args.type == "fgsm":
                #                print("before", rollouts.obs[step])
                rollouts.obs[step] = attack_net.attack(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step]).clone()
#                print("after", rollouts.obs[step])
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            if args.type == "targ" or args.type == "fgsm":
                if isinstance(envs.action_space, Discrete):
                    num_target = (
                        action == target_policy).nonzero()[:, 0].size()[0]
                    targ_file.write(
                        str(num_target / args.num_processes) + "\n")
                    print("percentage of target:",
                          num_target / args.num_processes)
                elif isinstance(envs.action_space, Box):
                    target_action = target_policy.repeat(action.size()[0], 1)
                    targ_file.write(
                        str(
                            torch.norm(action - target_action).item() /
                            args.num_processes) + "\n")
                    print("percentage of target:",
                          torch.sum(action).item() / args.num_processes)
            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action.cpu())
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    #                    rew_file.write("episode: {}, total reward: {}\n".format(episode, info['episode']['r']))
                    episode += 1

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        if args.attack and args.type != "fgsm":
            if args.aim == "reward":
                logger.info(rollouts.rewards.flatten())
                rollouts.rewards = attack_net.attack_r_general(
                    rollouts, next_value).clone().detach()
                logger.info("after attack")
                logger.info(rollouts.rewards.flatten())
            elif args.aim == "obs":
                origin = rollouts.obs.clone()
                rollouts.obs = attack_net.attack_s_general(
                    rollouts, next_value).clone().detach()
                logger.info(origin)
                logger.info("after")
                logger.info(rollouts.obs)
            elif args.aim == "action":
                origin = torch.flatten(rollouts.actions).clone()
                rollouts.actions = attack_net.attack_a_general(
                    rollouts, next_value).clone().detach()
                logger.info("attack value")
                logger.info(torch.flatten(rollouts.actions) - origin)
            elif args.aim == "hybrid":
                res_aim, attack = attack_net.attack_hybrid(
                    rollouts, next_value, args.radius_s, args.radius_a,
                    args.radius_r)
                print("attack ", res_aim)
                if res_aim == "obs":
                    origin = rollouts.obs.clone()
                    rollouts.obs = attack.clone().detach()
                    logger.info(origin)
                    logger.info("attack obs")
                    logger.info(rollouts.obs)
                elif res_aim == "action":
                    origin = torch.flatten(rollouts.actions).clone()
                    rollouts.actions = attack.clone().detach()
                    logger.info("attack action")
                    logger.info(torch.flatten(rollouts.actions) - origin)
                elif res_aim == "reward":
                    logger.info(rollouts.rewards.flatten())
                    rollouts.rewards = attack.clone().detach()
                    logger.info("attack reward")
                    logger.info(rollouts.rewards.flatten())
        if args.compute:
            stable_radius = attack_net.compute_radius(rollouts, next_value)
            print("stable radius:", stable_radius)
            radius_file.write("update: {}, radius: {}\n".format(
                j, np.round(stable_radius, decimals=3)))
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        if args.attack and args.type == "bb":
            attack_net.learning(rollouts)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) >= 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            rew_file.write("updates: {}, mean reward: {}\n".format(
                j, np.mean(episode_rewards)))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)


#        if episode > args.max_episodes:
#            print("reach episodes limit")
#            break

    if args.attack:
        logger.info("total attacks: {}\n".format(attack_net.attack_num))
        print("total attacks: {}\n".format(attack_net.attack_num))

    rew_file.close()
    if args.compute:
        radius_file.close()
    if args.type == "targ" or args.type == "fgsm":
        targ_file.close()
Пример #9
0
def learn(env, max_timesteps, timesteps_per_batch, clip_param):
    ppo_epoch = 5
    num_step = timesteps_per_batch
    save_interval = 100
    seed = 1000
    batch_size = 64

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    log_dir = os.path.expanduser('/tmp/gym/')
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda")

    envs = make_vec_envs(env, seed, 8, 0.95, log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': False})
    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     clip_param,
                     ppo_epoch,
                     batch_size,
                     0.5,
                     0.01,
                     lr=0.00025,
                     eps=1e-05,
                     max_grad_norm=0.5)

    rollouts = RolloutStorage(num_step, 8, envs.observation_space.shape,
                              envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(torch.tensor(obs))
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(max_timesteps) // num_step // 8
    for j in range(num_updates):

        # decrease learning rate linearly
        utils.update_linear_schedule(agent.optimizer, j, num_updates, 0.00025)

        for step in range(num_step):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, True, 0.99, 0.95, False)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % save_interval == 0
                or j == num_updates - 1) and "./trained_models/" != "":
            save_path = os.path.join("./trained_models/", 'ppo')
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, 'UniversalPolicy' + ".pt"))

        if j % 1 == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * 8 * num_step
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
    '''
Пример #10
0
def main():
    args = get_args()

    # set seeds and devices
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    log_dir = utils.default_log_init(args.log_dir, args.env_name)
    save_dir = utils.default_save_init(log_dir, args.save_dir)
    args_file = utils.default_args_init(log_dir, args)

    threads_dir = log_dir + "threads/"
    os.makedirs(threads_dir)
    logger.configure(log_dir)

    print(log_dir)

    eval_log_dir = log_dir + "_eval"
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, threads_dir, device, False)
    action_sample = envs.action_space.sample()

    def init_alg():
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              base_kwargs={
                                  'recurrent': args.recurrent_policy
                              }).to(device)
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm,
                         weight_decay=args.weight_decay)
        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  actor_critic.recurrent_hidden_state_size)
        return actor_critic, agent, rollouts

    actor_critic, agent, rollouts = init_alg()
    print(actor_critic)

    print(actor_critic.num_params)

    # init observations and rollouts
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # init train loggers(not useful for the actual training, but for analysis)
    episode_rewards = deque(maxlen=args.average_over)
    start_time = time.time()
    abs_start = start_time
    min_rewards = []
    max_rewards = []
    mean_rewards = []
    median_rewards = []
    nr_episodes = []
    times = []
    num_total_steps = []
    log_dict = {
        "min_rewards": min_rewards,
        "max_rewards": max_rewards,
        "mean_rewards": mean_rewards,
        "median_rewards": median_rewards,
        "nr_episodes": nr_episodes,
        "times": times,
        "num_total_steps": num_total_steps
    }

    # init convergence checks and other useful variables
    best_avg = -1e6
    best_med = -1e6
    since_improve = 0
    solved = 0
    epochs = int(args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(1, epochs + 1):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, epochs,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Obser reward and next obs
            if type(action_sample) is int:
                obs, reward, done, infos = envs.step(action.squeeze())
            else:
                obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end_time = time.time()
            s_total = end_time - abs_start
            print(
                "Updates(epochs) {}, num timesteps {}, elapsed {:01}:{:02}:{:02.2f} epoch seconds {} \n Last {} training episodes: "
                "mean/median reward {:.1f}/{:.1f},min/max reward {:.1f}/{:.1f}\n "
                .format(j, total_num_steps, int(s_total // 3600),
                        int(s_total % 3600 // 60),
                        s_total % 60, end_time - start_time,
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss),
                flush=True)
            min_rewards.append(np.min(episode_rewards))
            max_rewards.append(np.max(episode_rewards))
            mean_rewards.append(np.mean(episode_rewards))
            median_rewards.append(np.median(episode_rewards))
            nr_episodes.append(total_num_steps)
            times.append(end_time - start_time)
            num_total_steps.append(total_num_steps)
            start_time = end_time

        if (j % args.save_interval == 0 or j == epochs - 1) and save_dir != "":
            save_path = "{}it{}_val{:.1f}.pth".format(save_dir, j,
                                                      np.mean(episode_rewards))
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], save_path)
            print("-------Saved at path {}-------\n".format(save_path))
            with open(save_dir + "it_{}_log.json".format(j), "w") as file:
                json.dump(log_dict, file)

        if args.convergence_its != 0:
            worse = True
            if best_avg < np.mean(episode_rewards):
                best_avg = np.mean(episode_rewards)
                since_improve = 0
                worse = False
            if best_med < np.median(episode_rewards):
                best_med = np.median(episode_rewards)
                since_improve = 0
                worse = False

            if worse:
                since_improve += 1
                if since_improve > args.convergence_its:
                    print(
                        "No improvements in {} iterations, best average is {}, best median is {}, stopping training"
                        .format(since_improve, best_avg, best_med))
                    save_path = "{}it{}_val{:.1f}_c.pth".format(
                        save_dir, j, np.mean(episode_rewards))
                    print("Saved final model at {}".format(save_path))
                    torch.save([
                        actor_critic,
                        getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
                    ], save_path)
                    return

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
def main(base=IAMBase, num_frame_stack=None):

    seed = 1
    env_name = "Warehouse-v0"
    num_processes = 32
    log_dir = './logs/'
    eval_interval = None
    log_interval = 10
    use_linear_lr_decay = False
    use_proper_time_limits = False
    save_dir = './trained_models/'
    use_cuda = True

    # PPO
    gamma = 0.99  # reward discount factor
    clip_param = 0.1  #0.2
    ppo_epoch = 3  #4
    num_mini_batch = 32
    value_loss_coef = 1  #0.5
    entropy_coef = 0.01
    lr = 2.5e-4  #7e-4
    eps = 1e-5
    max_grad_norm = float('inf')
    use_gae = True
    gae_lambda = 0.95
    num_steps = 8  #5

    # Store
    num_env_steps = 4e6
    save_interval = 100

    # IAM
    dset = [
        0, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
        66, 67, 68, 69, 70, 71, 72
    ]

    #gym.envs.register(env_name, entry_point="environments.warehouse.warehouse:Warehouse",
    #                    kwargs={'seed': seed, 'parameters': {"num_frames": 1}})

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    log_dir = os.path.expanduser(log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if use_cuda else "cpu")

    envs = make_vec_envs(env_name,
                         seed,
                         num_processes,
                         gamma,
                         log_dir,
                         device,
                         False,
                         num_frame_stack=num_frame_stack)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base=base,
                          base_kwargs=({
                              'dset': dset
                          } if base == IAMBase else {}))
    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     clip_param,
                     ppo_epoch,
                     num_mini_batch,
                     value_loss_coef,
                     entropy_coef,
                     lr=lr,
                     eps=eps,
                     max_grad_norm=max_grad_norm)

    rollouts = RolloutStorage(num_steps, num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(num_env_steps) // num_steps // num_processes
    for j in range(num_updates):

        if use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates, lr)

        for step in range(num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, use_gae, gamma, gae_lambda,
                                 use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % save_interval == 0 or j == num_updates - 1) and save_dir != "":
            save_path = os.path.join(save_dir, 'PPO')
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'obs_rms', None)
            ], os.path.join(save_path, env_name + ".pt"))

        if j % log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * num_processes * num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (eval_interval is not None and len(episode_rewards) > 1
                and j % eval_interval == 0):
            obs_rms = utils.get_vec_normalize(envs).obs_rms
            evaluate(actor_critic, obs_rms, env_name, seed, num_processes,
                     eval_log_dir, device)
Пример #12
0
def train_loop(agent,
               envs,
               env_name,
               num_updates,
               num_steps,
               curiosity_module=None,
               save_interval=None,
               eval_interval=None,
               log_interval=None,
               time_limit=1000,
               curiosity_rew_after=0,
               curiosity_rew_before=None,
               use_linear_lr_decay=True,
               lr_decay_horizon=None,
               callbacks=None):
    # Create rollout storage
    num_processes = envs.num_envs
    rollouts = RolloutStorage(num_steps, num_processes,
                              envs.observation_space.shape, envs.action_space,
                              agent.actor_critic.recurrent_hidden_state_size)
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    device = next(agent.actor_critic.parameters()).device
    rollouts.to(device)

    # Create curiosity statistics saver
    if curiosity_module is not None:
        curiosity_stats = CuriosityStatistics(num_processes=num_processes,
                                              time_limit=time_limit)
    else:
        curiosity_stats = None
    # Create train statistics saver
    stats = TrainStatistics(log_interval,
                            with_curiosity=(curiosity_module is not None))

    # Possibility to use curiosity only for a few epochs
    if curiosity_rew_before is None:
        curiosity_rew_before = num_updates

    # Train loop
    start = time.time()
    for j in range(num_updates):
        if use_linear_lr_decay:
            if lr_decay_horizon is None:
                lr_decay_horizon = num_updates
            utils.update_linear_schedule(agent.optimizer, j, lr_decay_horizon,
                                         agent.optimizer.defaults['lr'])

        curiosity_loss = 0
        for step in range(num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = agent.actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe extrinsic reward and next obs
            obs, reward, done, infos = envs.step(action)

            # Compute intrinsic rewards
            if curiosity_module is not None:
                time_limit_mask = np.array([
                    0 if 'bad_transition' in info.keys() else 1
                    for info in infos
                ])
                if use_proper_time_limits:
                    curiosity_done = done * time_limit_mask
                else:
                    curiosity_done = done
                curiosity_reward = curiosity_module.get_reward(
                    rollouts.obs[step], action, obs, curiosity_done)
                curiosity_loss += curiosity_module.update(
                    rollouts.obs[step], action, obs, curiosity_done)

            # Update current reward statistics
            stats.update_extrinsic_reward(infos)
            if curiosity_module is not None:
                curiosity_stats.update(curiosity_reward.cpu().numpy().ravel(),
                                       done)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])

            if (curiosity_module is not None) and (j >= curiosity_rew_after and
                                                   j <= curiosity_rew_before):
                reward = reward + curiosity_reward
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = agent.actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, use_gae, gamma, gae_lambda,
                                 use_proper_time_limits)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        if curiosity_module is not None:
            curiosity_loss /= num_steps
        else:
            curiosity_loss = None
        stats.update_losses(value_loss, action_loss, dist_entropy,
                            curiosity_loss)

        # Save for every interval-th episode or for the last epoch
        if (j % save_interval == 0 or j == num_updates - 1) and save_dir != "":
            save_path = os.path.join(save_dir, "ppo")
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                agent.actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, env_name + ".pt"))

        if j % eval_interval == 0:
            try:
                ob_rms = utils.get_vec_normalize(envs).ob_rms
            except:
                ob_rms = None
            stats.eval_episode_rewards.append(
                evaluate(agent.actor_critic, env_name, device, ob_rms))

        if j % log_interval == 0 and len(stats.episode_rewards) > 1:
            total_num_steps = (j + 1) * num_processes * num_steps
            end = time.time()
            stats.update_log(total_num_steps, curiosity_stats)
            if callbacks is not None:
                for callback in callbacks:
                    callback(stats,
                             agent,
                             n_updates=j,
                             total_n_steps=total_num_steps,
                             fps=int(total_num_steps / (end - start)))

    return stats
Пример #13
0
    def run(self):
        args = self.args
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        print("CUDA is available: ", torch.cuda.is_available())
        if args.cuda:
            print("CUDA enabled")
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True
        else:
            if args.cuda_deterministic:
                print("Warning CUDA is requested but is not available")
            else:
                print("CUDA disabled")

        log_dir = os.path.expanduser(args.log_dir)
        eval_log_dir = log_dir + "_eval"
        utils.cleanup_log_dir(log_dir)
        utils.cleanup_log_dir(eval_log_dir)
        print("get_num_thread", torch.get_num_threads())

        device = torch.device("cuda:0" if args.cuda else "cpu")

        envs = make_vec_envs(args.env_name, self.config_parameters, args.seed,
                             args.num_processes, args.gamma, args.log_dir,
                             device, False)

        actor_critic = create_IAM_model(envs, args, self.config_parameters)
        actor_critic.to(device)

        if args.algo == 'a2c':
            agent = algo.A2C_ACKTR(actor_critic,
                                   args.value_loss_coef,
                                   args.entropy_coef,
                                   lr=args.lr,
                                   eps=args.eps,
                                   alpha=args.alpha,
                                   max_grad_norm=args.max_grad_norm)
        # This algorithm should be used for the reproduction project.
        elif args.algo == 'ppo':
            agent = algo.PPO(actor_critic,
                             args.clip_param,
                             args.ppo_epoch,
                             args.num_mini_batch,
                             args.value_loss_coef,
                             args.entropy_coef,
                             lr=args.lr,
                             eps=args.eps,
                             max_grad_norm=args.max_grad_norm)
        elif args.algo == 'acktr':
            agent = algo.A2C_ACKTR(actor_critic,
                                   args.value_loss_coef,
                                   args.entropy_coef,
                                   acktr=True)

        if args.gail:
            assert len(envs.observation_space.shape) == 1
            discr = gail.Discriminator(
                envs.observation_space.shape[0] + envs.action_space.shape[0],
                100, device)
            file_name = os.path.join(
                args.gail_experts_dir,
                "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

            expert_dataset = gail.ExpertDataset(file_name,
                                                num_trajectories=4,
                                                subsample_frequency=20)
            drop_last = len(expert_dataset) > args.gail_batch_size
            gail_train_loader = torch.utils.data.DataLoader(
                dataset=expert_dataset,
                batch_size=args.gail_batch_size,
                shuffle=True,
                drop_last=drop_last)

        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  actor_critic.recurrent_hidden_state_size)

        obs = envs.reset()
        rollouts.obs[0].copy_(obs)
        rollouts.to(device)
        # Always return the average of the last 100 steps. This means the average is sampled.
        episode_rewards = deque(maxlen=100)

        start = time.time()
        num_updates = int(
            args.num_env_steps) // args.num_steps // args.num_processes
        for j in range(num_updates):

            if args.use_linear_lr_decay:
                # decrease learning rate linearly
                utils.update_linear_schedule(
                    agent.optimizer, j, num_updates,
                    agent.optimizer.lr if args.algo == "acktr" else args.lr)

            for step in range(args.num_steps):
                # Sample actions
                with torch.no_grad():
                    value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

                # Obser reward and next obs
                obs, reward, done, infos = envs.step(action)

                for info in infos:
                    if 'episode' in info.keys():
                        episode_rewards.append(info['episode']['r'])

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])
                bad_masks = torch.FloatTensor(
                    [[0.0] if 'bad_transition' in info.keys() else [1.0]
                     for info in infos])
                rollouts.insert(obs, recurrent_hidden_states, action,
                                action_log_prob, value, reward, masks,
                                bad_masks)

            with torch.no_grad():
                next_value = actor_critic.get_value(
                    rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                    rollouts.masks[-1]).detach()

            if args.gail:
                if j >= 10:
                    envs.venv.eval()

                gail_epoch = args.gail_epoch
                if j < 10:
                    gail_epoch = 100  # Warm up
                for _ in range(gail_epoch):
                    discr.update(gail_train_loader, rollouts,
                                 utils.get_vec_normalize(envs)._obfilt)

                for step in range(args.num_steps):
                    rollouts.rewards[step] = discr.predict_reward(
                        rollouts.obs[step], rollouts.actions[step], args.gamma,
                        rollouts.masks[step])

            rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                     args.gae_lambda,
                                     args.use_proper_time_limits)

            value_loss, action_loss, dist_entropy = agent.update(rollouts)

            rollouts.after_update()

            # save for every interval-th episode or for the last epoch
            if (j % args.save_interval == 0
                    or j == num_updates - 1) and args.save_dir != "":
                save_path = os.path.join(args.save_dir, args.algo)
                try:
                    os.makedirs(save_path)
                except OSError:
                    pass

                torch.save([
                    actor_critic,
                    getattr(utils.get_vec_normalize(envs), 'obs_rms', None)
                ], os.path.join(save_path, self.model_file_name))

            if j % args.log_interval == 0 and len(episode_rewards) > 1:
                total_num_steps = (j + 1) * args.num_processes * args.num_steps
                end = time.time()
                elapsed_time = end - start
                data = [
                    j,  # Updates
                    total_num_steps,  # timesteps
                    int(total_num_steps / elapsed_time),  # FPS
                    len(episode_rewards),  # Only useful for print statement
                    np.mean(episode_rewards),  # mean of rewards
                    np.median(episode_rewards),  # median of rewards
                    np.min(episode_rewards),  # min rewards
                    np.max(episode_rewards),  # max rewards
                    dist_entropy,
                    value_loss,
                    action_loss,
                    elapsed_time
                ]
                output = ''.join([str(x) + ',' for x in data])
                self.data_saver.append(output)
                print(
                    f"Updates {data[0]}, num timesteps {data[1]}, FPS {data[2]}, elapsed time {int(data[11])} sec. Last {data[3]} training episodes: mean/median reward {data[4]:.2f}/{data[5]:.2f}, min/max reward {data[6]:.1f}/{data[7]:.1f}",
                    end="\r")

            if (args.eval_interval is not None and len(episode_rewards) > 1
                    and j % args.eval_interval == 0):
                obs_rms = utils.get_vec_normalize(envs).obs_rms
                evaluate(actor_critic, obs_rms, args.env_name, args.seed,
                         args.num_processes, eval_log_dir, device)
Пример #14
0
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    receipts = StorageReceipt()
    make_env = lambda tasks: MiniWoBGraphEnvironment(
        base_url=os.environ.get("BASE_URL", f"file://{MINIWOB_HTML}/"),
        levels=tasks,
        level_tracker=LevelTracker(tasks),
        wait_ms=500,
    )

    task = args.env_name
    if args.env_name == "PongNoFrameskip-v4":
        args.env_name = "clickbutton"
        task = "miniwob/click-button.html"
    if task == "levels":
        tasks = MINIWOB_CHALLENGES
    else:
        tasks = [[task]]
    print("Selected tasks:", tasks)
    NUM_ACTIONS = 1
    envs = make_vec_envs(
        [make_env(tasks[i % len(tasks)]) for i in range(args.num_processes)],
        receipts)

    if os.path.exists("./datadir/autoencoder.pt"):
        dom_autoencoder = torch.load("./datadir/autoencoder.pt")
        dom_encoder = dom_autoencoder.encoder
        for param in dom_encoder.parameters():
            param.requires_grad = False
    else:
        print("No dom encoder")
        dom_encoder = None
    actor_critic = Policy(
        envs.observation_space.shape,
        gym.spaces.Discrete(NUM_ACTIONS),  # envs.action_space,
        base=GNNBase,
        base_kwargs={
            "dom_encoder": dom_encoder,
            "recurrent": args.recurrent_policy
        },
    )
    actor_critic.dist = NodeObjective()
    actor_critic.to(device)

    if args.algo == "a2c":
        agent = algo.A2C_ACKTR(
            actor_critic,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            alpha=args.alpha,
            max_grad_norm=args.max_grad_norm,
        )
    elif args.algo == "ppo":
        agent = algo.PPO(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
        )
    elif args.algo == "acktr":
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(envs.observation_space.shape[0], 100,
                                   device)

        rr = ReplayRepository("/code/miniwob-plusplus-demos/*turk/*")
        ds = rr.get_dataset()
        print("GAIL Replay Dataset", ds)
        gail_train_loader = torch_geometric.data.DataLoader(
            ds, batch_size=args.gail_batch_size, shuffle=True, drop_last=True)

    from tensorboardX import SummaryWriter
    import datetime

    ts_str = datetime.datetime.fromtimestamp(
        time.time()).strftime("%Y-%m-%d_%H-%M-%S")
    tensorboard_writer = SummaryWriter(
        log_dir=os.path.join("/tmp/log", ts_str))

    rollouts = ReceiptRolloutStorage(
        args.num_steps,
        args.num_processes,
        (1, ),  # envs.observation_space.shape,
        envs.action_space,
        actor_critic.recurrent_hidden_state_size,
        receipts,
    )

    # resume from last save
    if args.save_dir != "":
        save_path = os.path.join(args.save_dir, args.algo)
        try:
            os.makedirs(save_path)
        except OSError:
            pass

        model_path = os.path.join(save_path, args.env_name + ".pt")
        if False and os.path.exists(model_path):
            print("Loadng previous model:", model_path)
            actor_critic = torch.load(model_path)
            actor_critic.train()

    obs = envs.reset()
    rollouts.obs[0].copy_(torch.tensor(obs))
    rollouts.to(device)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print("Iterations:", num_updates, args.num_steps)
    for j in range(num_updates):
        episode_rewards = deque(maxlen=args.num_steps * args.num_processes)
        if j and last_action_time + 5 < time.time():
            # task likely timed out
            print("Reseting tasks")
            obs = envs.reset()
            rollouts.obs[0].copy_(torch.tensor(obs))
            rollouts.recurrent_hidden_states[0].copy_(
                torch.zeros_like(rollouts.recurrent_hidden_states[0]))
            rollouts.masks[0].copy_(torch.zeros_like(rollouts.masks[0]))

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer,
                j,
                num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr,
            )

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    receipts.redeem(rollouts.obs[step]),
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step],
                )

            # Obser reward and next obs
            last_action_time = time.time()
            obs, reward, done, infos = envs.step(action)

            for e, i in enumerate(infos):
                if i.get("real_action") is not None:
                    action[e] = i["real_action"]
                if i.get("bad_transition"):
                    action[e] = torch.zeros_like(action[e])

            for info in infos:
                if "episode" in info.keys():
                    episode_rewards.append(info["episode"]["r"])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if "bad_transition" in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(
                torch.tensor(obs),
                recurrent_hidden_states,
                action,
                action_log_prob,
                value,
                torch.tensor(reward).unsqueeze(1),
                masks,
                bad_masks,
            )

        with torch.no_grad():
            next_value = actor_critic.get_value(
                receipts.redeem(rollouts.obs[-1]),
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1],
            ).detach()

        if args.gail:
            # if j >= 10:
            #    envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                obsfilt = lambda x, update: x  # utils.get_vec_normalize(envs)._obfilt
                gl = discr.update(gail_train_loader, rollouts, obsfilt)
            print("Gail loss:", gl)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    receipts.redeem(rollouts.obs[step]),
                    rollouts.actions[step],
                    args.gamma,
                    rollouts.masks[step],
                )

        rollouts.compute_returns(
            next_value,
            args.use_gae,
            args.gamma,
            args.gae_lambda,
            args.use_proper_time_limits,
        )

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        obs_shape = rollouts.obs.size()[2:]
        obs = rollouts.obs[:-1].view(-1, *obs_shape)
        obs = obs[torch.randint(0, obs.size(0), (1, 32))]

        rollouts.after_update()

        receipts.prune(rollouts.obs)

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            model_path = os.path.join(save_path, args.env_name + ".pt")
            torch.save(actor_critic, model_path)
            print("Saved model:", model_path)

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(
                    j,
                    total_num_steps,
                    int(total_num_steps / (end - start)),
                    len(episode_rewards),
                    np.mean(episode_rewards),
                    np.median(episode_rewards),
                    np.min(episode_rewards),
                    np.max(episode_rewards),
                    dist_entropy,
                    value_loss,
                    action_loss,
                ))

            from pprint import pprint

            pprint(LevelTracker.global_scoreboard)

            # tensorboard_writer.add_histogram(
            #    "task_ranks", torch.tensor(predictor._difficulty_rank), total_num_steps
            # )
            tensorboard_writer.add_histogram("value", value, total_num_steps)
            tensorboard_writer.add_histogram("x", actor_critic.base.last_x,
                                             total_num_steps)
            tensorboard_writer.add_histogram("query",
                                             actor_critic.base.last_query,
                                             total_num_steps)
            tensorboard_writer.add_histogram("inputs_at",
                                             actor_critic.base.last_inputs_at,
                                             total_num_steps)

            tensorboard_writer.add_scalar("mean_reward",
                                          np.mean(episode_rewards),
                                          total_num_steps)
            tensorboard_writer.add_scalar("median_reward",
                                          np.median(episode_rewards),
                                          total_num_steps)
            tensorboard_writer.add_scalar("min_reward",
                                          np.min(episode_rewards),
                                          total_num_steps)
            tensorboard_writer.add_scalar("max_reward",
                                          np.max(episode_rewards),
                                          total_num_steps)
            tensorboard_writer.add_scalar("dist_entropy", dist_entropy,
                                          total_num_steps)
            tensorboard_writer.add_scalar("value_loss", value_loss,
                                          total_num_steps)
            tensorboard_writer.add_scalar("action_loss", action_loss,
                                          total_num_steps)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(
                actor_critic,
                ob_rms,
                args.env_name,
                args.seed,
                args.num_processes,
                eval_log_dir,
                device,
            )
Пример #15
0
def main():
    args = get_args()
    trace_size = args.trace_size
    toke = tokenizer()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    tobs = torch.zeros((args.num_processes, trace_size), dtype=torch.long)
    #print (tobs.dtype)
    rollouts.obs[0].copy_(obs)
    rollouts.tobs[0].copy_(tobs)

    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):

            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.tobs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)
            tobs = []
            envs.render()
            for info in infos:
                if 'episode' in info.keys():
                    #print ("episode ", info['episode'])
                    episode_rewards.append(info['episode']['r'])
                trace = info['trace'][0:trace_size]
                trace = [x[2] for x in trace]
                word_to_ix = toke.tokenize(trace)
                seq = prepare_sequence(trace, word_to_ix)
                if len(seq) < trace_size:
                    seq = torch.zeros((trace_size), dtype=torch.long)
                seq = seq[:trace_size]
                #print (seq.dtype)
                tobs.append(seq)
            tobs = torch.stack(tobs)
            #print (tobs)
            #print (tobs.size())
            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, tobs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.tobs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
Пример #16
0
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    args_dir, logs_dir, models_dir, samples_dir = get_all_save_paths(
        args, 'pretrain', combine_action=args.combine_action)
    eval_log_dir = logs_dir + "_eval"
    utils.cleanup_log_dir(logs_dir)
    utils.cleanup_log_dir(eval_log_dir)

    _, _, intrinsic_models_dir, _ = get_all_save_paths(args,
                                                       'learn_reward',
                                                       load_only=True)
    if args.load_iter != 'final':
        intrinsic_model_file_name = os.path.join(
            intrinsic_models_dir,
            args.env_name + '_{}.pt'.format(args.load_iter))
    else:
        intrinsic_model_file_name = os.path.join(
            intrinsic_models_dir, args.env_name + '.pt'.format(args.load_iter))
    intrinsic_arg_file_name = os.path.join(args_dir, 'command.txt')

    # save args to arg_file
    with open(intrinsic_arg_file_name, 'w') as f:
        json.dump(args.__dict__, f, indent=2)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, logs_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    else:
        raise NotImplementedError

    if args.use_intrinsic:
        obs_shape = envs.observation_space.shape
        if len(obs_shape) == 3:
            action_dim = envs.action_space.n
        elif len(obs_shape) == 1:
            action_dim = envs.action_space.shape[0]

        if 'NoFrameskip' in args.env_name:
            file_name = os.path.join(
                args.experts_dir, "trajs_ppo_{}.pt".format(
                    args.env_name.split('-')[0].replace('NoFrameskip',
                                                        '').lower()))
        else:
            file_name = os.path.join(
                args.experts_dir,
                "trajs_ppo_{}.pt".format(args.env_name.split('-')[0].lower()))

        rff = RewardForwardFilter(args.gamma)
        intrinsic_rms = RunningMeanStd(shape=())

        if args.intrinsic_module == 'icm':
            print('Loading pretrained intrinsic module: %s' %
                  intrinsic_model_file_name)
            inverse_model, forward_dynamics_model, encoder = torch.load(
                intrinsic_model_file_name)
            icm =  IntrinsicCuriosityModule(envs, device, inverse_model, forward_dynamics_model, \
                                            inverse_lr=args.intrinsic_lr, forward_lr=args.intrinsic_lr,\
                                            )

        if args.intrinsic_module == 'vae':
            print('Loading pretrained intrinsic module: %s' %
                  intrinsic_model_file_name)
            vae = torch.load(intrinsic_model_file_name)
            icm =  GenerativeIntrinsicRewardModule(envs, device, \
                                                   vae, lr=args.intrinsic_lr, \
                                                   )

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            obs, reward, done, infos = envs.step(action)
            next_obs = obs

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, next_obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.use_intrinsic:
            for step in range(args.num_steps):
                state = rollouts.obs[step]
                action = rollouts.actions[step]
                next_state = rollouts.next_obs[step]
                if args.intrinsic_module == 'icm':
                    state = encoder(state)
                    next_state = encoder(next_state)
                with torch.no_grad():
                    rollouts.rewards[
                        step], pred_next_state = icm.calculate_intrinsic_reward(
                            state, action, next_state, args.lambda_true_action)
            if args.standardize == 'True':
                buf_rews = rollouts.rewards.cpu().numpy()
                intrinsic_rffs = np.array(
                    [rff.update(rew) for rew in buf_rews.T])
                rffs_mean, rffs_std, rffs_count = mpi_moments(
                    intrinsic_rffs.ravel())
                intrinsic_rms.update_from_moments(rffs_mean, rffs_std**2,
                                                  rffs_count)
                mean = intrinsic_rms.mean
                std = np.asarray(np.sqrt(intrinsic_rms.var))
                rollouts.rewards = rollouts.rewards / torch.from_numpy(std).to(
                    device)

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(models_dir, args.algo)
            policy_file_name = os.path.join(save_path, args.env_name + '.pt')

            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], policy_file_name)

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "{} Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(args.env_name, j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
Пример #17
0
def main():
    args = get_args()
    use_ppo = args.algo == 'ppo'
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy,
                     'share_parameter': args.share_parameter})
    actor_critic.to(device)

    return_distributions = False
    if args.algo == 'ppo':
        agent = algo.PPO(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo_rb':
        agent = algo.PPO_RB(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            args.rb_alpha,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm
        )
    elif args.algo == 'tr_ppo':
        agent = algo.TR_PPO(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
            ppo_clip_param=args.ppo_clip_param
        )
        return_distributions = True
    elif args.algo == 'tr_ppo_rb':
        agent = algo.TR_PPO_RB(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            args.rb_alpha,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
            ppo_clip_param=args.ppo_clip_param
        )
        return_distributions = True

    if not return_distributions:
        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  envs.observation_space.shape, envs.action_space,
                                  actor_critic.recurrent_hidden_state_size)
    else:
        if actor_critic.dist_name == 'DiagGaussian':
            rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                      envs.observation_space.shape, envs.action_space,
                                      actor_critic.recurrent_hidden_state_size,
                                      distribution_param_dim=envs.action_space.shape[0]*2
                                      )
        elif actor_critic.dist_name == 'Bernoulli' or actor_critic.dist_name == 'Categorical':
            rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                      envs.observation_space.shape, envs.action_space,
                                      actor_critic.recurrent_hidden_state_size,
                                      distribution_param_dim=envs.action_space.n
                                      )
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    prev_mean_reward = None
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states, parameters = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step], return_distribution=True)


            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks, parameters)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts, use_ppo)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        mean_rewards = np.mean(episode_rewards)
        if (prev_mean_reward is not None) and (mean_rewards < prev_mean_reward) and \
           (use_ppo == False) and args.revert_to_ppo and j > 3:
            use_ppo = True
            print('Revert Back to PPO Training')
            # args.lr = 3e-4
        prev_mean_reward = mean_rewards
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
Пример #18
0
def main():
    chrono = exp.chrono()

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(args.repeat):
        with chrono.time('train') as t:
            for n in range(args.number):

                if args.use_linear_lr_decay:
                    utils.update_linear_schedule(
                        agent.optimizer, j, num_updates, agent.optimizer.lr
                        if args.algo == "acktr" else args.lr)

                for step in range(args.num_steps):
                    # Sample actions
                    with torch.no_grad():
                        value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                            rollouts.obs[step],
                            rollouts.recurrent_hidden_states[step],
                            rollouts.masks[step])

                    # Obser reward and next obs
                    obs, reward, done, infos = envs.step(action)

                    for info in infos:
                        if 'episode' in info.keys():
                            episode_rewards.append(info['episode']['r'])

                    # If done then clean the history of observations.
                    masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                               for done_ in done])
                    bad_masks = torch.FloatTensor(
                        [[0.0] if 'bad_transition' in info.keys() else [1.0]
                         for info in infos])

                    rollouts.insert(obs, recurrent_hidden_states, action,
                                    action_log_prob, value, reward, masks,
                                    bad_masks)

                with torch.no_grad():
                    next_value = actor_critic.get_value(
                        rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                        rollouts.masks[-1]).detach()
                # ---
                rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                         args.gae_lambda,
                                         args.use_proper_time_limits)

                value_loss, action_loss, dist_entropy = agent.update(rollouts)

                exp.log_batch_loss(action_loss)
                exp.log_metric('value_loss', value_loss)

                rollouts.after_update()

                total_num_steps = (j + 1) * args.num_processes * args.num_steps

                if j % args.log_interval == 0 and len(episode_rewards) > 1:
                    end = time.time()
                    print(
                        "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                        .format(j, total_num_steps,
                                int(total_num_steps / (end - start)),
                                len(episode_rewards), np.mean(episode_rewards),
                                np.median(episode_rewards),
                                np.min(episode_rewards),
                                np.max(episode_rewards), dist_entropy,
                                value_loss, action_loss))

            # -- number
        # -- chrono
        exp.show_eta(j, t)
    # -- epoch
    exp.report()
    envs.close()
Пример #19
0
def main():
    args = get_args()

    # Record trajectories
    if args.record_trajectories:
        record_trajectories()
        return

    print(args)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    # Append the model name
    log_dir = os.path.expanduser(args.log_dir)
    log_dir = os.path.join(log_dir, args.model_name, str(args.seed))

    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, log_dir, device, False)

    # Take activation for carracing
    print("Loaded env...")
    activation = None
    if args.env_name == 'CarRacing-v0' and args.use_activation:
        activation = torch.tanh
    print(activation)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy,
                              'env': args.env_name
                          },
                          activation=activation)
    actor_critic.to(device)
    # Load from previous model
    if args.load_model_name:
        state = torch.load(
            os.path.join(args.save_dir, args.load_model_name,
                         args.load_model_name + '_{}.pt'.format(args.seed)))[0]
        try:
            actor_critic.load_state_dict(state)
        except:
            actor_critic = state

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        if len(envs.observation_space.shape) == 1:
            discr = gail.Discriminator(
                envs.observation_space.shape[0] + envs.action_space.shape[0],
                100, device)
            file_name = os.path.join(
                args.gail_experts_dir,
                "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

            expert_dataset = gail.ExpertDataset(file_name,
                                                num_trajectories=3,
                                                subsample_frequency=1)
            expert_dataset_test = gail.ExpertDataset(file_name,
                                                     num_trajectories=1,
                                                     start=3,
                                                     subsample_frequency=1)
            drop_last = len(expert_dataset) > args.gail_batch_size
            gail_train_loader = torch.utils.data.DataLoader(
                dataset=expert_dataset,
                batch_size=args.gail_batch_size,
                shuffle=True,
                drop_last=drop_last)
            gail_test_loader = torch.utils.data.DataLoader(
                dataset=expert_dataset_test,
                batch_size=args.gail_batch_size,
                shuffle=False,
                drop_last=False)
            print(len(expert_dataset), len(expert_dataset_test))
        else:
            # env observation shape is 3 => its an image
            assert len(envs.observation_space.shape) == 3
            discr = gail.CNNDiscriminator(envs.observation_space.shape,
                                          envs.action_space, 100, device)
            file_name = os.path.join(args.gail_experts_dir, 'expert_data.pkl')

            expert_dataset = gail.ExpertImageDataset(file_name, train=True)
            test_dataset = gail.ExpertImageDataset(file_name, train=False)
            gail_train_loader = torch.utils.data.DataLoader(
                dataset=expert_dataset,
                batch_size=args.gail_batch_size,
                shuffle=True,
                drop_last=len(expert_dataset) > args.gail_batch_size,
            )
            gail_test_loader = torch.utils.data.DataLoader(
                dataset=test_dataset,
                batch_size=args.gail_batch_size,
                shuffle=False,
                drop_last=len(test_dataset) > args.gail_batch_size,
            )
            print('Dataloader size', len(gail_train_loader))

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    start = time.time()
    #num_updates = int(
    #args.num_env_steps) // args.num_steps // args.num_processes
    num_updates = args.num_steps
    print(num_updates)

    # count the number of times validation loss increases
    val_loss_increase = 0
    prev_val_action = np.inf
    best_val_loss = np.inf

    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                try:
                    envs.venv.eval()
                except:
                    pass

            gail_epoch = args.gail_epoch
            #if j < 10:
            #gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                #discr.update(gail_train_loader, rollouts,
                #None)
                pass

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        #value_loss, action_loss, dist_entropy = agent.update(rollouts)
        value_loss = 0
        dist_entropy = 0
        for data in gail_train_loader:
            expert_states, expert_actions = data
            expert_states = Variable(expert_states).to(device)
            expert_actions = Variable(expert_actions).to(device)
            loss = agent.update_bc(expert_states, expert_actions)
            action_loss = loss.data.cpu().numpy()
        print("Epoch: {}, Loss: {}".format(j, action_loss))

        with torch.no_grad():
            cnt = 0
            val_action_loss = 0
            for data in gail_test_loader:
                expert_states, expert_actions = data
                expert_states = Variable(expert_states).to(device)
                expert_actions = Variable(expert_actions).to(device)
                loss = agent.get_action_loss(expert_states, expert_actions)
                val_action_loss += loss.data.cpu().numpy()
                cnt += 1
            val_action_loss /= cnt
            print("Val Loss: {}".format(val_action_loss))

        #rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":

            if val_action_loss < best_val_loss:
                val_loss_increase = 0
                best_val_loss = val_action_loss
                save_path = os.path.join(args.save_dir, args.model_name)
                try:
                    os.makedirs(save_path)
                except OSError:
                    pass

                torch.save([
                    actor_critic.state_dict(),
                    getattr(utils.get_vec_normalize(envs), 'ob_rms', None),
                    getattr(utils.get_vec_normalize(envs), 'ret_rms', None)
                ],
                           os.path.join(
                               save_path,
                               args.model_name + "_{}.pt".format(args.seed)))
            elif val_action_loss > prev_val_action:
                val_loss_increase += 1
                if val_loss_increase == 10:
                    print("Val loss increasing too much, breaking here...")
                    break
            elif val_action_loss < prev_val_action:
                val_loss_increase = 0

            # Update prev val action
            prev_val_action = val_action_loss

        # log interval
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
Пример #20
0
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # coinrun environments need to be treated differently.
    coinrun_envs = {
        'CoinRun': 'standard',
        'CoinRun-Platforms': 'platform',
        'Random-Mazes': 'maze'
    }

    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         args.log_dir,
                         device,
                         False,
                         coin_run_level=args.num_levels,
                         difficulty=args.high_difficulty,
                         coin_run_seed=args.seed)
    if args.env_name in coinrun_envs.keys():
        observation_space_shape = (3, 64, 64)
        args.save_dir = args.save_dir + "/NUM_LEVELS_{}".format(
            args.num_levels)  # Save the level info in the

    else:
        observation_space_shape = envs.observation_space.shape

    # trained model name
    if args.continue_ppo_training:
        actor_critic, _ = torch.load(os.path.join(args.check_point,
                                                  args.env_name + ".pt"),
                                     map_location=torch.device(device))
    elif args.cor_gail:
        embed_size = args.embed_size
        actor_critic = Policy(observation_space_shape,
                              envs.action_space,
                              hidden_size=args.hidden_size,
                              embed_size=embed_size,
                              base_kwargs={'recurrent': args.recurrent_policy})
        actor_critic.to(device)
        correlator = Correlator(observation_space_shape,
                                envs.action_space,
                                hidden_dim=args.hidden_size,
                                embed_dim=embed_size,
                                lr=args.lr,
                                device=device)

        correlator.to(device)
        embeds = torch.zeros(1, embed_size)
    else:
        embed_size = 0
        actor_critic = Policy(observation_space_shape,
                              envs.action_space,
                              hidden_size=args.hidden_size,
                              base_kwargs={'recurrent': args.recurrent_policy})
        actor_critic.to(device)
        embeds = None

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm,
                         use_clipped_value_loss=True,
                         ftrl_mode=args.cor_gail or args.no_regret_gail,
                         correlated_mode=args.cor_gail)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail or args.no_regret_gail or args.cor_gail:
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        expert_dataset = gail.ExpertDataset(
            file_name, num_trajectories=50,
            subsample_frequency=1)  #if subsample set to a different number,
        # grad_pen might need adjustment
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)
        if args.gail:
            discr = gail.Discriminator(observation_space_shape,
                                       envs.action_space,
                                       device=device)
        if args.no_regret_gail or args.cor_gail:
            queue = deque(
                maxlen=args.queue_size
            )  # Strategy Queues: Each element of a queue is a dicr strategy
            agent_queue = deque(
                maxlen=args.queue_size
            )  # Strategy Queues: Each element of a queue is an agent strategy
            pruning_frequency = 1
        if args.no_regret_gail:
            discr = regret_gail.NoRegretDiscriminator(observation_space_shape,
                                                      envs.action_space,
                                                      device=device)
        if args.cor_gail:
            discr = cor_gail.CorDiscriminator(observation_space_shape,
                                              envs.action_space,
                                              hidden_size=args.hidden_size,
                                              embed_size=embed_size,
                                              device=device)
        discr.to(device)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              observation_space_shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size,
                              embed_size)

    obs = envs.reset()

    rollouts.obs[0].copy_(obs)
    if args.cor_gail:
        rollouts.embeds[0].copy_(embeds)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions # Roll-out
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step], rollouts.embeds[step])

            obs, reward, done, infos = envs.step(action.to('cpu'))
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)
            # Sample mediating/correlating actions # Correlated Roll-out
            if args.cor_gail:
                embeds, embeds_log_prob, mean = correlator.act(
                    rollouts.obs[step], rollouts.actions[step])
                rollouts.insert_embedding(embeds, embeds_log_prob)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1], rollouts.embeds[-1]).detach()

        if args.gail or args.no_regret_gail or args.cor_gail:
            if args.env_name not in {'CoinRun', 'Random-Mazes'}:
                if j >= 10:
                    envs.venv.eval()

            gail_epoch = args.gail_epoch
            if args.gail:
                if j < 10:
                    gail_epoch = 100  # Warm up

                # no need for gail epoch or warm up in the no-regret case and cor_gail.
            for _ in range(gail_epoch):
                if utils.get_vec_normalize(envs):
                    obfilt = utils.get_vec_normalize(envs)._obfilt
                else:
                    obfilt = None

                if args.gail:
                    discr.update(gail_train_loader, rollouts, obfilt)

                if args.no_regret_gail or args.cor_gail:
                    last_strategy = discr.update(gail_train_loader, rollouts,
                                                 queue, args.max_grad_norm,
                                                 obfilt, j)

            for step in range(args.num_steps):
                if args.gail:
                    rollouts.rewards[step] = discr.predict_reward(
                        rollouts.obs[step], rollouts.actions[step], args.gamma,
                        rollouts.masks[step])
                if args.no_regret_gail:
                    rollouts.rewards[step] = discr.predict_reward(
                        rollouts.obs[step], rollouts.actions[step], args.gamma,
                        rollouts.masks[step], queue)
                if args.cor_gail:
                    rollouts.rewards[
                        step], correlator_reward = discr.predict_reward(
                            rollouts.obs[step], rollouts.actions[step],
                            rollouts.embeds[step], args.gamma,
                            rollouts.masks[step], queue)

                    rollouts.correlated_reward[step] = correlator_reward

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        if args.gail:
            value_loss, action_loss, dist_entropy = agent.update(rollouts, j)

        elif args.no_regret_gail or args.cor_gail:
            value_loss, action_loss, dist_entropy, agent_gains, agent_strategy = \
                agent.mixed_update(rollouts, agent_queue, j)

        if args.cor_gail:
            correlator.update(rollouts, agent_gains, args.max_grad_norm)

        if args.no_regret_gail or args.cor_gail:
            queue, _ = utils.queue_update(queue, pruning_frequency,
                                          args.queue_size, j, last_strategy)
            agent_queue, pruning_frequency = utils.queue_update(
                agent_queue, pruning_frequency, args.queue_size, j,
                agent_strategy)

        rollouts.after_update()
        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            if not args.cor_gail:
                torch.save([
                    actor_critic,
                    getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
                ], os.path.join(save_path, args.env_name + ".pt"))

            else:
                print("saving models in {}".format(
                    os.path.join(save_path, args.env_name)))
                torch.save(
                    correlator.state_dict(),
                    os.path.join(save_path, args.env_name + "correlator.pt"))
                torch.save([
                    actor_critic.state_dict(),
                    getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
                ], os.path.join(save_path, args.env_name + "actor.pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f},"
                " value loss/action loss {:.1f}/{}".format(
                    j, total_num_steps, int(total_num_steps / (end - start)),
                    len(episode_rewards), np.mean(episode_rewards),
                    np.median(episode_rewards), value_loss, action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
Пример #21
0
def train():
    processes = []
    if os.path.isdir(args.log_dir):
        ans = input('{} exists\ncontinue and overwrite? y/n: '.format(args.log_dir))
        if ans == 'n':
            return

    logger.configure(dir=args.log_dir, format_strs=['stdout', 'log', 'csv'])
    logger.log(args)
    json.dump(vars(args), open(os.path.join(args.log_dir, 'params.json'), 'w'))

    torch.set_num_threads(2)

    start = time.time()
    policy_update_time, policy_forward_time = 0, 0
    step_time_env, step_time_total, step_time_rewarder = 0, 0, 0
    visualize_time = 0
    rewarder_fit_time = 0

    envs = RL2EnvInterface(args)
    if args.look:
        looker = Looker(args.log_dir)

    actor_critic = Policy(envs.obs_shape, envs.action_space,
                          base=RL2Base, base_kwargs={'recurrent': True,
                                                     'num_act_dim': envs.action_space.shape[0]})
    actor_critic.to(args.device)
    agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                     args.value_loss_coef, args.entropy_coef, lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                        envs.obs_shape, envs.action_space,
                        actor_critic.recurrent_hidden_state_size)
    rollouts.to(args.device)

    def copy_obs_into_beginning_of_storage(obs):
        obs_raw, obs_act, obs_rew, obs_flag = obs
        rollouts.obs[0].copy_(obs_raw)
        rollouts.obs_act[0].copy_(obs_act)
        rollouts.obs_rew[0].copy_(obs_rew)
        rollouts.obs_flag[0].copy_(obs_flag)

    for j in range(args.num_updates):
        obs = envs.reset()
        copy_obs_into_beginning_of_storage(obs)

        if args.use_linear_lr_decay:
            update_linear_schedule(agent.optimizer, j, args.num_updates, args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param  * (1 - j / float(args.num_updates))

        episode_returns = [0 for i in range(args.trial_length)]
        episode_final_reward = [0 for i in range(args.trial_length)]
        i_episode = 0

        log_marginal = 0
        lambda_log_s_given_z = 0

        for step in range(args.num_steps):
            # Sample actions
            policy_forward_start = time.time()
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.get_obs(step),
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])
            policy_forward_time += time.time() - policy_forward_start

            # Obser reward and next obs
            step_total_start = time.time()
            obs, reward, done, info = envs.step(action)
            step_time_total += time.time() - step_total_start
            step_time_env += info['step_time_env']
            step_time_rewarder += info['reward_time']
            log_marginal += info['log_marginal'].sum().item()
            lambda_log_s_given_z += info['lambda_log_s_given_z'].sum().item()

            episode_returns[i_episode] += reward.sum().item()
            if all(done['episode']):
                episode_final_reward[i_episode] += reward.sum().item()
                i_episode = (i_episode + 1) % args.trial_length

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done['trial']])
            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks)

        assert all(done['trial'])

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.get_obs(-1),
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        policy_update_start = time.time()
        if args.rewarder != 'supervised' and envs.rewarder.fit_counter == 0 and not args.vae_load:
            value_loss, action_loss, dist_entropy = 0, 0, 0
        else:
            value_loss, action_loss, dist_entropy = agent.update(rollouts)
        policy_update_time += time.time() - policy_update_start
        rollouts.after_update()

        # metrics
        trajectories_pre = envs.trajectories_pre_current_update
        state_entropy_pre = calculate_state_entropy(args, trajectories_pre)

        trajectories_post = envs.trajectories_post_current_update
        state_entropy_post = calculate_state_entropy(args, trajectories_post)

        return_avg = rollouts.rewards.sum() / args.trials_per_update
        reward_avg = return_avg / (args.trial_length * args.episode_length)
        log_marginal_avg = log_marginal / args.trials_per_update / (args.trial_length * args.episode_length)
        lambda_log_s_given_z_avg = lambda_log_s_given_z / args.trials_per_update / (args.trial_length * args.episode_length)

        num_steps = (j + 1) * args.num_steps * args.num_processes
        num_episodes = num_steps // args.episode_length
        num_trials = num_episodes // args.trial_length

        logger.logkv('state_entropy_pre', state_entropy_pre)
        logger.logkv('state_entropy_post', state_entropy_post)
        logger.logkv('value_loss', value_loss)
        logger.logkv('action_loss', action_loss)
        logger.logkv('dist_entropy', dist_entropy)
        logger.logkv('return_avg', return_avg.item())
        logger.logkv('reward_avg', reward_avg.item())
        logger.logkv('steps', (j + 1) * args.num_steps * args.num_processes)
        logger.logkv('episodes', num_episodes)
        logger.logkv('trials', num_trials)
        logger.logkv('policy_updates', (j + 1))
        logger.logkv('time', time.time() - start)
        logger.logkv('policy_forward_time', policy_forward_time)
        logger.logkv('policy_update_time', policy_update_time)
        logger.logkv('step_time_rewarder', step_time_rewarder)
        logger.logkv('step_time_env', step_time_env)
        logger.logkv('step_time_total', step_time_total)
        logger.logkv('visualize_time', visualize_time)
        logger.logkv('rewarder_fit_time', rewarder_fit_time)
        logger.logkv('log_marginal_avg', log_marginal_avg)
        logger.logkv('lambda_log_s_given_z_avg', lambda_log_s_given_z_avg)
        for i_episode in range(args.trial_length):
            logger.logkv('episode_return_avg_{}'.format(i_episode),
                         episode_returns[i_episode] / args.trials_per_update)
            logger.logkv('episode_final_reward_{}'.format(i_episode),
                         episode_final_reward[i_episode] / args.trials_per_update)

        if (j % args.save_period == 0 or j == args.num_updates - 1) and args.log_dir != '':
            save_model(args, actor_critic, envs, iteration=j)

        if not args.vae_freeze and j % args.rewarder_fit_period == 0:
            rewarder_fit_start = time.time()
            envs.fit_rewarder()
            rewarder_fit_time += time.time() - rewarder_fit_start

        if (j % args.vis_period == 0 or j == args.num_updates - 1) and args.log_dir != '':
            visualize_start = time.time()
            if args.look:
                eval_return_avg, eval_episode_returns, eval_episode_final_reward = looker.look(iteration=j)
                logger.logkv('eval_return_avg', eval_return_avg)
                for i_episode in range(args.trial_length):
                    logger.logkv('eval_episode_return_avg_{}'.format(i_episode),
                                 eval_episode_returns[i_episode] / args.trials_per_update)
                    logger.logkv('eval_episode_final_reward_{}'.format(i_episode),
                                 eval_episode_final_reward[i_episode] / args.trials_per_update)

            if args.plot:
                p = Popen('python visualize.py --log-dir {}'.format(args.log_dir), shell=True)
                processes.append(p)
            visualize_time += time.time() - visualize_start

        logger.dumpkvs()
Пример #22
0
def main():

    realEval = True  #False

    gettrace = getattr(sys, 'gettrace', None)

    parser = argparse.ArgumentParser(description='RL')
    parser.add_argument('--action-type',
                        type=int,
                        default=-1,
                        help='action type to play (default: -1)')

    parser.add_argument('--tasks-difficulty-from',
                        type=int,
                        default=0,
                        help='tasks_difficulty_from')

    parser.add_argument('--tasks-difficulty-to',
                        type=int,
                        default=100000,
                        help='tasks-difficulty-to')

    parser.add_argument('--verboseLevel',
                        type=int,
                        default=5,
                        help='verboseLevel')

    parser.add_argument('--filesNamesSuffix',
                        default="",
                        help='filesNamesSuffix')

    parser.add_argument('--nobest-exit',
                        type=int,
                        default=10000,
                        help='nobest_exit')

    args = get_args(parser)

    args.algo = 'ppo'
    args.env_name = 'QuadruppedWalk-v1'  #'RoboschoolAnt-v1' #'QuadruppedWalk-v1' #'RoboschoolAnt-v1' #'QuadruppedWalk-v1'
    args.use_gae = True
    args.num_steps = 2048
    #args.num_processes = 4
    args.num_processes = 4
    if gettrace():
        args.num_processes = 1
    args.lr = 0.0001
    args.entropy_coef = 0.0
    args.value_loss_coef = 0.5
    args.ppo_epoch = 4
    args.num_mini_batch = 256
    args.gamma = 0.99
    args.gae_lambda = 0.95
    args.clip_param = 0.2
    args.use_linear_lr_decay = True  #True #True #True #True
    args.use_proper_time_limits = True
    args.save_dir = "./trained_models/" + args.env_name + "/"
    args.load_dir = "./trained_models/" + args.env_name + "/"
    args.log_dir = "./logs/robot"
    if gettrace():
        args.save_dir = "./trained_models/" + args.env_name + "debug/"
        args.load_dir = "./trained_models/" + args.env_name + "debug/"
        args.log_dir = "./logs/robot_d"
    args.log_interval = 30
    args.hidden_size = 64
    args.last_hidden_size = args.hidden_size
    args.recurrent_policy = False  #True
    args.save_interval = 20
    #args.seed = 1
    reward_shaping = 0.01
    allowMutate = False

    if args.seed == -1:
        args.seed = time.clock_gettime_ns(time.CLOCK_REALTIME)

    quadruppedEnv.settings.tasks_difficulty_from = args.tasks_difficulty_from
    quadruppedEnv.settings.tasks_difficulty_to = args.tasks_difficulty_to

    # 0 is a walk
    # 1 is a balance
    # 2 multitasks
    # 3 multitask experiments
    trainType = 14
    filesNamesSuffix = ""
    if args.action_type >= 0:
        trainType = args.action_type

    makeEnvFunction = makeEnv.make_env_with_best_settings
    if trainType == 1:
        filesNamesSuffix = "balance_"
        makeEnvFunction = makeEnv.make_env_for_balance

    if trainType == 2:
        filesNamesSuffix = "analytical_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_analytical

    if trainType == 3:
        filesNamesSuffix = "analytical2_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_analytical2

    if trainType == 4:
        filesNamesSuffix = "frontback_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_front_back

    if trainType == 5:
        filesNamesSuffix = "leftright_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_left_right

    if trainType == 6:
        filesNamesSuffix = "all_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_all

    if trainType == 7:
        filesNamesSuffix = "rotate_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_rotate

    if trainType == 8:
        filesNamesSuffix = "compound_"
        makeEnvFunction = make_env_multinetwork

    if trainType == 9:
        import pickle
        realEval = False
        allowMutate = False
        args.use_linear_lr_decay = True  #False
        args.num_env_steps = 5000000
        filesNamesSuffix = "test_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_test

    if trainType == 10:
        import pickle
        realEval = False
        allowMutate = False
        args.use_linear_lr_decay = True  #False
        args.num_env_steps = 5000000
        filesNamesSuffix = "zoo_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_test_zoo

    if trainType == 11:
        args.hidden_size = 128  #64 #128
        args.last_hidden_size = args.hidden_size

        import pickle
        if gettrace():
            args.num_processes = 1
        else:
            args.num_processes = 8
        realEval = False
        allowMutate = False
        args.lr = 0.00001
        args.use_linear_lr_decay = True  #False
        args.num_env_steps = 10000000
        filesNamesSuffix = "zigote2_updown_"
        print("Samples preload")
        global samplesEnvData
        samplesEnvData = pickle.load(
            open("./QuadruppedWalk-v1_MoveNoPhys.samples", "rb"))
        # samplesEnvData = pickle.load( open( "./QuadruppedWalk-v1.samples", "rb" ) )
        makeEnvFunction = makeSamplesEnv

    if trainType == 12:
        import pickle
        args.lr = 0.00001
        args.hidden_size = 64
        args.last_hidden_size = args.hidden_size
        filesNamesSuffix = "zigote2_front_back_"
        args.clip_param = 0.9
        args.value_loss_coef = 0.9
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_train
        #makeEnvFunction = makeEnv.make_env_with_best_settings_for_record
        #makeEnv.samplesEnvData = pickle.load( open( "./QuadruppedWalk-v1_MoveNoPhys.samples", "rb" ) )

    if trainType == 13:
        filesNamesSuffix = "all_bytasks_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_all

    if trainType == 14:
        #args.lr = 0.00001
        #args.num_env_steps = 000000
        #args.clip_param = 0.5
        #args.value_loss_coef  =0.8
        #random.seed(time.clock_gettime_ns(time.CLOCK_REALTIME))
        #args.num_steps = random.choice([256,512,1024,2048,4096])
        #args.num_mini_batch = random.choice([32,64,256,512])
        #args.ppo_epoch  = random.choice([2,4,8,10])
        #args.clip_param = random.choice([0.2,0.4,0.6,0.8])
        #args.value_loss_coef  =random.choice([0.4,0.5,0.6,0.8])
        #args.lr = random.choice([0.00001,0.0001,0.00005,0.0005])

        args.num_steps = 2048
        args.num_mini_batch = 64
        args.ppo_epoch = 8
        args.lr = 0.0001

        args.hidden_size = 64
        args.last_hidden_size = args.hidden_size
        #
        filesNamesSuffix = args.filesNamesSuffix
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_all
        '''
        num_steps: 1024 num_mini_batch 64 ppo_epoch 2
        clip_param: 0.2 value_loss_coef 0.6 lr 0.0001
        '''

    if trainType == 15:
        args.num_env_steps = 5000000
        filesNamesSuffix = "zigote_updown_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_train_analytic

    if trainType == 16:
        args.lr = 0.00001
        filesNamesSuffix = "compound_tasks_"
        makeEnvFunction = make_env_multinetwork

    reward_shaper = DefaultRewardsShaper(scale_value=reward_shaping)

    print("ActionType ", trainType, " ", filesNamesSuffix, "seed", args.seed,
          "num env steps:", args.num_env_steps, " tasks_dif",
          args.tasks_difficulty_from, args.tasks_difficulty_to)

    print("Num processes:", args.num_processes)

    print("num_steps:", args.num_steps, "num_mini_batch", args.num_mini_batch,
          "ppo_epoch", args.ppo_epoch)
    print("clip_param:", args.clip_param, "value_loss_coef",
          args.value_loss_coef, "lr", args.lr)

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    args.log_dir = "/tmp/tensorboard/"
    #TesnorboardX
    writer = SummaryWriter(log_dir=args.log_dir + 'runs/{}_PPO_{}_{}'.format(
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
        "ppo"))

    writer.add_scalar('options/num_steps', args.num_steps, 0)
    writer.add_scalar('options/num_mini_batch', args.num_mini_batch, 0)
    writer.add_scalar('options/ppo_epoch', args.ppo_epoch, 0)
    writer.add_scalar('options/clip_param', args.clip_param, 0)
    writer.add_scalar('options/value_loss_coef', args.value_loss_coef, 0)
    writer.add_scalar('options/lr', args.lr, 0)

    device = torch.device("cuda:0" if args.cuda else "cpu")
    torch.set_num_threads(1)

    load_dir = os.path.join(args.load_dir, args.algo)

    multiNetworkName = ["frontback_", "all_", "leftright_", "rotate_"]
    if trainType == 8:
        for net in multiNetworkName:
            bestFilename = os.path.join(
                load_dir, "{}_{}{}_best.pt".format(args.env_name, net,
                                                   args.hidden_size))
            ac, _ = torch.load(bestFilename)
            policies.append(PPOPlayer(ac, device))
            print("Policy multi loaded: ", bestFilename)

    multiNetworkName2 = [
        "all_bytasks_0_",
        "all_bytasks_1_",
        "all_bytasks_2_",
        "all_bytasks_3_",
        "all_bytasks_4_",
        "all_bytasks_5_",
        "all_bytasks_6_",
        "all_bytasks_7_",
        "all_bytasks_8_",
        "all_bytasks_9_",
        "all_bytasks_10_",
        "all_bytasks_11_",
        "all_bytasks_12_",
    ]
    if trainType == 16:
        for net in multiNetworkName2:
            bestFilename = os.path.join(
                load_dir, "{}_{}{}_best.pt".format(args.env_name, net,
                                                   args.hidden_size))
            ac, _ = torch.load(bestFilename)
            policies.append(PPOPlayer(ac, device))
            print("Policy multi loaded: ", bestFilename)

    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         None,
                         device,
                         False,
                         normalizeOb=False,
                         normalizeReturns=False,
                         max_episode_steps=args.num_steps,
                         makeEnvFunc=makeEnvFunction,
                         num_frame_stack=1,
                         info_keywords=(
                             'episode_steps',
                             'episode_reward',
                             'progress',
                             'servo',
                             'distToTarget',
                         ))
    #print(envs.observation_space.shape,envs.action_space)
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy,
                              'hidden_size': args.hidden_size,
                              'last_hidden_size': args.last_hidden_size,
                              'activation_layers_type': "Tanh"
                          })
    '''
#    if args.load_dir not None:
    load_path = os.path.join(args.load_dir, args.algo)
    actor_critic, ob_rms = torch.load(os.path.join(load_path, args.env_name + ".pt"))
    '''
    load_path = os.path.join(
        load_dir, "{}_{}{}_best.pt".format(args.env_name, filesNamesSuffix,
                                           args.hidden_size))
    #load_path = os.path.join(load_path, "{}_{}{}.pt".format(args.env_name,filesNamesSuffix,args.hidden_size))
    preptrained_path = "../Train/trained_models/QuadruppedWalk-v1/Train_QuadruppedWalk-v1_256.pth"
    loadPretrained = False
    if loadPretrained and os.path.isfile(preptrained_path):
        print("Load preptrained")
        abj = torch.load(preptrained_path)
        print(abj)
        print(actor_critic.base)
        actor_critic.base.load_state_dict()
        actor_critic.base.eval()
    if os.path.isfile(load_path) and not loadPretrained:
        actor_critic, ob_rms = torch.load(load_path)
        actor_critic.eval()
        print("----NN loaded: ", load_path, " -----")
    else:
        bestFilename = os.path.join(
            load_dir,
            "{}_{}{}_best_pretrain.pt".format(args.env_name, filesNamesSuffix,
                                              args.hidden_size))
        if os.path.isfile(bestFilename):
            actor_critic, ob_rms = torch.load(bestFilename)
            actor_critic.eval()
            print("----NN loaded: ", bestFilename, " -----")

    maxReward = -10000.0
    maxSteps = 0
    minDistance = 50000.0

    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    deque_maxLen = 10

    episode_rewards = deque(maxlen=deque_maxLen)
    episode_steps = deque(maxlen=deque_maxLen)
    episode_rewards_alive = deque(maxlen=deque_maxLen)
    episode_rewards_progress = deque(maxlen=deque_maxLen)
    episode_rewards_servo = deque(maxlen=deque_maxLen)
    episode_dist_to_target = deque(maxlen=deque_maxLen)
    '''
    load_path = os.path.join(args.load_dir, args.algo)
    load_path = os.path.join(load_path, args.env_name + ".pt")
    actor_critic, ob_rms = torch.load(load_path)

    actor_critic.to(device)
    actor_critic.eval()
    #ob_rms.eval()
    '''
    '''
    args.use_gym_monitor = 1
    args.monitor_dir = "./results/"
    monitor_path = os.path.join(args.monitor_dir, args.algo)
    monitor_path = os.path.join(monitor_path, args.env_name)

    args.
    if args.use_gym_monitor:
        env = wrappers.Monitor(
            env, monitor_path, video_callable=False, force=True)
    '''
    i_episode = 0

    save_path = os.path.join(args.save_dir, args.algo)
    try:
        os.makedirs(save_path)
    except OSError:
        pass

    trainOnSamplesAndExit = False  #False
    if trainOnSamplesAndExit:
        import pickle
        print("---------------------------------------")
        print("Samples preload")
        data = pickle.load(open("./QuadruppedWalk-v1_UpDown.samples", "rb"))
        #data = pickle.load( open( "../QuadruppedWalk-v1_NN.samples", "rb" ) )

        learning_rate = 0.0001
        max_episodes = 100
        max_timesteps = 4000
        betas = (0.9, 0.999)
        log_interval = 1

        envSamples = SamplesEnv(data)
        envSamples.numSteps = max_timesteps

        # create a stochastic gradient descent optimizer
        optimizer = torch.optim.Adam(actor_critic.base.actor.parameters(),
                                     lr=learning_rate,
                                     betas=betas)
        #optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9)
        # create a loss function
        criterion = nn.MSELoss(reduction="sum")

        # run the main training loop
        for epoch in range(max_episodes):
            state = envSamples.reset()
            time_step = 0
            testReward = 0
            testSteps = 0
            loss_sum = 0
            loss_max = 0

            for t in range(max_timesteps):
                time_step += 1

                nn_state = torch.FloatTensor((state).reshape(1, -1)).to(device)

                optimizer.zero_grad()
                net_out = actor_critic.base.forwardActor(nn_state)
                net_out = actor_critic.dist.fc_mean(net_out)

                state, reward, done, info = envSamples.step(
                    net_out.detach().numpy())
                sim_action = envSamples.recordedActions

                sim_action_t = torch.FloatTensor([sim_action]).to(device)

                loss = criterion(net_out, sim_action_t)
                loss.backward()
                optimizer.step()
                loss_sum += loss.mean()
                loss_max = max(loss_max, loss.max())

                testReward += reward
                testSteps += 1

                if done:
                    if epoch % log_interval == 0:
                        #print(best_action_t*scaleActions-net_out*scaleActions)
                        if args.verboseLevel > 0:
                            print(
                                'Train Episode: {} t:{} Reward:{} Loss: mean:{:.6f} max: {:.6f}'
                                .format(epoch, t, testReward, loss_sum / t,
                                        loss_max))
                            print(info)
                        reward = 0
                    break
        bestFilename = os.path.join(
            save_path,
            "{}_{}{}_best_pretrain.pt".format(args.env_name, filesNamesSuffix,
                                              args.hidden_size))
        torch.save([
            actor_critic,
            getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
        ], bestFilename)
        exit(0)

    skipWriteBest = True

    if args.verboseLevel > 0:
        printNetwork(actor_critic.base.actor)

    lock(actor_critic, first=False, last=False)
    #if trainType==9:
    #allowMutate = False
    #lock(actor_critic,first=True,last=False)
    #mutate(actor_critic,power=0.00,powerLast=0.3)

    if args.verboseLevel > 0:
        printNetwork(actor_critic.base.actor)
    #from torchsummary import summary

    #summary(actor_critic.base.actor, (1, 48, 64))

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    episodeBucketIndex = 0

    maxReward = -10000000000
    numEval = 10
    if realEval:
        envEval = makeEnvFunction(args.env_name)
        if hasattr(envEval.env, "tasks") and len(envEval.env.tasks):
            numEval = max(numEval, len(envEval.env.tasks))
        maxReward = evaluate_policy(envEval,
                                    actor_critic,
                                    numEval * 2,
                                    render=False,
                                    device=device,
                                    verbose=args.verboseLevel)
        print("MaxReward on start", maxReward)

    noMaxRewardCount = 0

    updateIndex = 0

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        episode_r = 0.0
        stepsDone = 0

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)

            #envs.venv.venv.venv.envs[0].render()

            if args.verboseLevel > 0:
                index = 0
                for d in done:
                    if d:
                        print(infos[index], flush=True)
                    index += 1

            episodeDone = False
            '''
            index = 0
            for d in done:
                if d:
                    print("")
                    print(infos[index])
                index+=1
            '''

            for info in infos:
                if 'reward' in info.keys():
                    episodeDone = True
                    i_episode += 1
                    episode_rewards.append(info['reward'])
                    writer.add_scalar('reward/episode', info['reward'],
                                      i_episode)
                    #print("E:",i_episode," T:",info['episode_steps'], " R:", info['episode_reward'], " D:",info['distToTarget'])
                if 'steps' in info.keys():
                    episode_steps.append(info['steps'])
                    writer.add_scalar('reward/steps', info['steps'], i_episode)
                if 'alive' in info.keys():
                    episode_rewards_alive.append(info['alive'])
                    writer.add_scalar('reward/alive', info['alive'], i_episode)
                if 'prog' in info.keys():
                    episode_rewards_progress.append(info['prog'])
                    writer.add_scalar('reward/progress', info['prog'],
                                      i_episode)
                if 'servo' in info.keys():
                    episode_rewards_servo.append(info['servo'])
                    writer.add_scalar('reward/servo', info['servo'], i_episode)
                if 'd2T' in info.keys():
                    episode_dist_to_target.append(info['d2T'])
                    writer.add_scalar('reward/distToTarget', info['d2T'],
                                      i_episode)

                for val in info.keys():
                    if val not in [
                            "reward", "steps", "alive", "prog", "servo", "d2T",
                            'epos', 't'
                    ]:
                        writer.add_scalar('reward/' + val, info[val],
                                          i_episode)

            #if episodeDone and i_episode%10==0:
            #    print(i_episode,"({:.1f}/{}/{:.2f}) ".format(episode_rewards[-1],episode_steps[-1],episode_dist_to_target[-1]),end='',flush=True)

            if episodeDone:
                episodeBucketIndex += 1
                if args.verboseLevel > 0:
                    print("Mean:", Fore.WHITE, np.mean(episode_rewards),
                          Style.RESET_ALL, " Median:", Fore.WHITE,
                          np.median(episode_rewards), Style.RESET_ALL,
                          " max reward:", maxReward)

                #'''len(episode_rewards) and np.mean(episode_rewards)>maxReward and'''
                if realEval:
                    if episodeBucketIndex % args.log_interval == 0 and episodeBucketIndex > args.log_interval:
                        print("Step:",
                              (j + 1) * args.num_processes * args.num_steps)
                        if skipWriteBest == False:
                            evalReward = evaluate_policy(
                                envEval,
                                actor_critic,
                                numEval,
                                device=device,
                                verbose=args.verboseLevel)

                            writer.add_scalar('reward/eval', evalReward,
                                              i_episode)

                            if evalReward > maxReward:
                                maxReward = evalReward
                                #maxReward = np.mean(episode_rewards)

                                bestFilename = os.path.join(
                                    save_path, "{}_{}{}_best.pt".format(
                                        args.env_name, filesNamesSuffix,
                                        args.hidden_size))
                                print(
                                    "Writing best reward:", Fore.GREEN,
                                    "({:.1f}/{:.1f}/{:.1f}/{}/{:.2f}) ".format(
                                        maxReward, np.mean(episode_rewards),
                                        np.median(episode_rewards),
                                        np.mean(episode_steps),
                                        episode_dist_to_target[-1]),
                                    Style.RESET_ALL, bestFilename)
                                torch.save([
                                    actor_critic,
                                    getattr(utils.get_vec_normalize(envs),
                                            'ob_rms', None)
                                ], bestFilename)
                                noMaxRewardCount = 0
                            else:
                                noMaxRewardCount += 1
                                if allowMutate:
                                    if noMaxRewardCount == 5:
                                        print("Mutation low last layer")
                                        lock(actor_critic,
                                             first=False,
                                             last=False)
                                        mutate(actor_critic,
                                               power=0.00,
                                               powerLast=0.01)
                                    if noMaxRewardCount == 8:
                                        print("Mutation low non last")
                                        lock(actor_critic,
                                             first=False,
                                             last=False)
                                        mutate(actor_critic,
                                               power=0.01,
                                               powerLast=0.0)
                                    if noMaxRewardCount == 11:
                                        print("Mutation low all")
                                        lock(actor_critic,
                                             first=False,
                                             last=False)
                                        mutate(actor_critic,
                                               power=0.02,
                                               powerLast=0.2)
                                    if noMaxRewardCount == 14:
                                        print("Mutation hi all")
                                        lock(actor_critic,
                                             first=False,
                                             last=False)
                                        mutate(actor_critic,
                                               power=0.03,
                                               powerLast=0.03)
                                        noMaxRewardCount = 0
                                if noMaxRewardCount == args.nobest_exit:
                                    exit(0)
                        else:
                            skipWriteBest = False
                else:
                    if len(episode_rewards) and np.mean(
                            episode_rewards
                    ) > maxReward and j > args.log_interval:
                        if skipWriteBest == False:
                            maxReward = np.mean(episode_rewards)
                            writer.add_scalar('reward/maxReward', maxReward,
                                              i_episode)

                            bestFilename = os.path.join(
                                save_path, "{}_{}{}_best.pt".format(
                                    args.env_name, filesNamesSuffix,
                                    args.hidden_size))
                            if len(episode_dist_to_target):
                                print(
                                    "Writing best reward:", Fore.GREEN,
                                    "({:.1f}/{:.1f}/{}/{:.2f}) ".format(
                                        np.mean(episode_rewards),
                                        np.median(episode_rewards),
                                        np.mean(episode_steps),
                                        episode_dist_to_target[-1]),
                                    Style.RESET_ALL, bestFilename)
                            else:
                                print(
                                    "Writing best reward:", Fore.GREEN,
                                    "({:.1f}/{:.1f}/{}) ".format(
                                        np.mean(episode_rewards),
                                        np.median(episode_rewards),
                                        np.mean(episode_steps)),
                                    Style.RESET_ALL, bestFilename)

                            torch.save([
                                actor_critic,
                                getattr(utils.get_vec_normalize(envs),
                                        'ob_rms', None)
                            ], bestFilename)
                        else:
                            skipWriteBest = False
            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            shaped_reward = reward_shaper(reward)
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, shaped_reward, masks,
                            bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        writer.add_scalar('reward/value_loss', value_loss, updateIndex)
        writer.add_scalar('reward/action_loss', action_loss, updateIndex)
        writer.add_scalar('reward/dist_entropy', dist_entropy, updateIndex)

        updateIndex += 1

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            '''
            fileName = os.path.join(save_path, "{}_{}{}.pt".format(args.env_name,filesNamesSuffix,args.hidden_size))
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], fileName)
            print("Saved:",fileName, " cur avg rewards:",np.mean(episode_rewards))

            fileName = os.path.join(save_path, "{}_{}{}_actor.pt".format(args.env_name,filesNamesSuffix,args.hidden_size))
            torch.save(actor_critic.state_dict, fileName)
            print("Saved:",fileName)
            '''
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            if args.verboseLevel > 0:
                print("")
                print("Updates {}, num timesteps {}, FPS {}".format(
                    j, total_num_steps, int(total_num_steps / (end - start))))
                print(" Last {} training episodes:".format(
                    len(episode_rewards)))

                print(
                    " reward mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}".
                    format(np.mean(episode_rewards),
                           np.median(episode_rewards), np.min(episode_rewards),
                           np.max(episode_rewards)))

                print(" steps mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}".
                      format(np.mean(episode_steps), np.median(episode_steps),
                             np.min(episode_steps), np.max(episode_steps)))

                if len(episode_rewards_alive):
                    print(
                        " alive mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_alive),
                                np.median(episode_rewards_alive),
                                np.min(episode_rewards_alive),
                                np.max(episode_rewards_alive)))

                if len(episode_rewards_progress):
                    print(
                        " progress mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_progress),
                                np.median(episode_rewards_progress),
                                np.min(episode_rewards_progress),
                                np.max(episode_rewards_progress)))

                if len(episode_rewards_servo):
                    print(
                        " servo mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_servo),
                                np.median(episode_rewards_servo),
                                np.min(episode_rewards_servo),
                                np.max(episode_rewards_servo)))

                if len(episode_dist_to_target):
                    print(
                        " dist to target mean/median {:.3f}/{:.3f} min/max {:.3f}/{:.3f}"
                        .format(np.mean(episode_dist_to_target),
                                np.median(episode_dist_to_target),
                                np.min(episode_dist_to_target),
                                np.max(episode_dist_to_target)))

                print(
                    " Reward/Steps {:.3f} Progress/Steps: {:.3f} entropy {:.1f} value_loss {:.5f} action_loss {:.5f}\n"
                    .format(
                        np.mean(episode_rewards) / np.mean(episode_steps),
                        (0 if len(episode_rewards_progress) == 0 else
                         np.mean(episode_rewards_progress) /
                         np.mean(episode_steps)), dist_entropy, value_loss,
                        action_loss))
Пример #23
0
def train(train_states,
          run_dir,
          num_env_steps,
          eval_env_steps,
          writer,
          writer_name,
          args,
          init_model=None):
    envs = make_vec_envs(train_states, args.seed, args.num_processes,
                         args.gamma, 'cpu', 'train', args)

    if init_model:
        actor_critic, env_step, model_name = init_model
        obs_space = actor_critic.obs_space
        obs_process = actor_critic.obs_process
        obs_module = actor_critic.obs_module
        print(f"  [load] Loaded model {model_name} at step {env_step}")
    else:
        obs_space = envs.observation_space
        actor_critic = Policy(obs_space,
                              args.obs_process,
                              args.obs_module,
                              envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})
        env_step = 0
    actor_critic.to(args.device)
    #print(actor_critic)

    run_name = run_dir.replace('/', '_')
    vid_save_dir = f"{run_dir}/videos/"
    try:
        os.makedirs(vid_save_dir)
    except OSError:
        pass
    ckpt_save_dir = f"{run_dir}/ckpts/"
    try:
        os.makedirs(ckpt_save_dir)
    except OSError:
        pass

    if args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         args.device,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm,
                               acktr=False)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm,
                               acktr=True)
    else:
        raise NotImplementedError

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    actor_critic.eval()
    """
    try:
        writer.add_graph(actor_critic, obs)
    except ValueError:
        print("Unable to write model graph to tensorboard.")
    """
    actor_critic.train()

    for k in rollouts.obs.keys():
        rollouts.obs[k][0].copy_(obs[k][0])

    episode_rewards = deque(maxlen=10)

    num_updates = num_env_steps // args.num_steps // args.num_processes
    batch_size = args.num_steps * args.num_processes
    start = time.time()
    while env_step < num_env_steps:
        s = time.time()
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states, _ = actor_critic.act(
                    {
                        k: rollouts.obs[k][step].float().to(args.device)
                        for k in rollouts.obs.keys()
                    }, rollouts.recurrent_hidden_states[step].to(args.device),
                    rollouts.masks[step].to(args.device))
                value = value.cpu()
                action = action.cpu()
                action_log_prob = action_log_prob.cpu()
                recurrent_hidden_states = recurrent_hidden_states.cpu()
            # Observe reward and next obs
            obs, reward, dones, infos = envs.step(action)

            for done, info in zip(dones, infos):
                env_state = info['env_state'][1]
                if done:
                    writer.add_scalar(f'train_episode_x/{env_state}',
                                      info['max_x'], env_step)
                    writer.add_scalar(f'train_episode_%/{env_state}',
                                      info['max_x'] / info['lvl_max_x'] * 100,
                                      env_step)
                    writer.add_scalar(f'train_episode_r/{env_state}',
                                      info['sum_r'], env_step)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done else [1.0]
                                       for done in dones])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)
        with torch.no_grad():
            next_value = actor_critic.get_value(
                {
                    k: rollouts.obs[k][-1].float().to(args.device)
                    for k in rollouts.obs.keys()
                }, rollouts.recurrent_hidden_states[-1].to(args.device),
                rollouts.masks[-1].to(args.device)).detach().cpu()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        env_step += batch_size
        fps = batch_size / (time.time() - s)
        #res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
        #writer.add_scalar(f'gpu_usage/{writer_name}', res.gpu, env_step)
        #writer.add_scalar(f'gpu_mem/{writer_name}', res.memory, env_step)
        total_norm = 0
        for p in list(
                filter(lambda p: p.grad is not None,
                       actor_critic.parameters())):
            param_norm = p.grad.data.norm(2)
            total_norm += param_norm.item()**2
        total_norm = total_norm**(1. / 2)
        obs_norm = {}
        for obs_name in args.obs_keys:
            t_norm = 0
            if obs_name == 'video':
                md = actor_critic.base.video_module
            elif obs_name == 'audio':
                md = actor_critic.base.audio_module
            else:
                raise NotImplementedError
            for p in list(filter(lambda p: p.grad is not None,
                                 md.parameters())):
                param_norm = p.grad.data.norm(2)
                t_norm += param_norm.item()**2
            obs_norm[obs_name] = t_norm**(1. / 2)

        prev_env_step = max(0, env_step + 1 - batch_size)
        # write training metrics for this batch, usually takes 0.003s
        if (env_step + 1
            ) // args.write_interval > prev_env_step // args.write_interval:
            writer.add_scalar(f'grad_norm/{writer_name}', total_norm, env_step)
            writer.add_scalar(f'fps/{writer_name}', fps, env_step)
            writer.add_scalar(f'value_loss/{writer_name}',
                              value_loss / batch_size, env_step)
            writer.add_scalar(f'action_loss/{writer_name}',
                              action_loss / batch_size, env_step)
            writer.add_scalar(f'dist_entropy/{writer_name}',
                              dist_entropy / batch_size, env_step)
            writer.add_scalar(f'cpu_usage/{writer_name}', psutil.cpu_percent(),
                              env_step)
            writer.add_scalar(f'cpu_mem/{writer_name}',
                              psutil.virtual_memory()._asdict()['percent'],
                              env_step)
            for obs_name in args.obs_keys:
                writer.add_scalar(f'grad_norm_{obs_name}/{writer_name}',
                                  obs_norm[obs_name], env_step)

        # print log to console
        if (env_step +
                1) // args.log_interval > prev_env_step // args.log_interval:
            end = time.time()
            print("  [log] Env step {} of {}: {:.1f}s, {:.1f}fps".format(
                env_step + 1, num_env_steps, end - start, fps))
            if len(episode_rewards) > 0:
                print(
                    "    Last {} episodes: mean/med reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"
                    .format(len(episode_rewards), np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards), np.max(episode_rewards)))
            print(
                "    dist_entropy {:.5f}, value_loss {:.6f}, action_loss {:.6f}, grad_norm {:.6f}"
                .format(dist_entropy, value_loss, action_loss, total_norm))
            start = time.time()

        # save model to ckpt
        if ((env_step + 1) // args.save_interval >
                prev_env_step // args.save_interval):
            torch.save([
                actor_critic,
                env_step,
                run_name,
            ], os.path.join(ckpt_save_dir, f"{run_name}-{env_step}.pt"))
            print(f"  [save] Saved model at step {env_step+1}.")

        # save model to ckpt and run evaluation if eval_interval and not final iteration in training loop
        if ((env_step + 1) // args.eval_interval >
                prev_env_step // args.eval_interval
            ) and env_step < num_env_steps and eval_env_steps > 0:
            torch.save([
                actor_critic,
                env_step,
                run_name,
            ], os.path.join(ckpt_save_dir, f"{run_name}-{env_step}.pt"))
            print(f"  [save] Saved model at step {env_step+1}.")

            envs.close()
            del envs  # close does not actually get rid of envs, need to del
            actor_critic.eval()
            eval_score, e_dict = evaluate(train_states, actor_critic,
                                          eval_env_steps, env_step, writer,
                                          vid_save_dir, args.vid_tb_steps,
                                          args.vid_file_steps,
                                          args.obs_viz_layer, args)
            print(f"  [eval] Evaluation score: {eval_score}")
            writer.add_scalar('eval_score', eval_score, env_step)

            actor_critic.train()
            envs = make_vec_envs(train_states, args.seed, args.num_processes,
                                 args.gamma, 'cpu', 'train', args)
            obs = envs.reset()
            # TODO: does this work? do we need to increment env step or something? whydden_states insert at 0
            for k in rollouts.obs.keys():
                rollouts.obs[k][0].copy_(obs[k][0])

    # final model save
    final_model_path = os.path.join(ckpt_save_dir, f"{run_name}-{env_step}.pt")
    torch.save([
        actor_critic,
        env_step,
        run_name,
    ], final_model_path)
    print(
        f"  [save] Final model saved at step {env_step+1} to {final_model_path}"
    )

    # final model eval
    envs.close()
    del envs
    eval_score = None
    eval_dict = None
    if eval_env_steps > 0:
        eval_score, eval_dict = evaluate(train_states, actor_critic,
                                         eval_env_steps, env_step, writer,
                                         vid_save_dir, args.vid_tb_steps,
                                         args.vid_file_steps,
                                         args.obs_viz_layer, args)
        print(f"  [eval] Final model evaluation score: {eval_score:.3f}")

    return (actor_critic, env_step, run_name), eval_score, eval_dict
Пример #24
0
def pg(envs, printout, use_gail=False):

    if use_gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail_util.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            '/home/paperspace/repos/pytorch-a2c-ppo-acktr-gail/gail_experts',
            "trajs_reacher.pt")

        gail_train_loader = torch.utils.data.DataLoader(
            gail_util.ExpertDataset(file_name,
                                    num_trajectories=4,
                                    subsample_step=4),
            batch_size=ppo_args.gail_batchsize,
            shuffle=True,
            drop_last=True)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space)
    actor_critic.to(device)
    agent = algo.PPO(actor_critic=actor_critic,
                     clip_param=ppo_args.clip_param,
                     ppo_epoch=ppo_args.ppo_epoch,
                     num_mini_batch=ppo_args.num_mb,
                     value_loss_coef=ppo_args.vloss_coef,
                     entropy_coef=ppo_args.entropy_coef,
                     lr=ppo_args.lr,
                     eps=ppo_args.adam_eps,
                     max_grad_norm=.5)

    rollouts = storage.RolloutStorage(ppo_args.num_steps,
                                      ppo_args.num_processes,
                                      envs.observation_space.shape,
                                      envs.action_space,
                                      actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    num_updates = int(
        ppo_args.total_steps) // ppo_args.num_steps // ppo_args.num_processes

    episode_rewards = deque(maxlen=10)
    scores = np.zeros((ppo_args.num_envs, 1))
    final_scores = np.zeros((ppo_args.num_envs, 1))
    start = timer()
    for j in range(num_updates):

        utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                     ppo_args.lr)

        for step in range(ppo_args.num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.ones_like(masks)
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        if use_gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = ppo_args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(ppo_args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], ppo_args.gamma,
                    rollouts.masks[step])

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, ppo_args.use_gae, ppo_args.gamma,
                                 ppo_args.gae_lambda, ppo_args.time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        save_path = 'saved_models'
        save_interval = 100
        # save for every interval-th update or for the last epoch
        if (j % save_interval == 0 or j == num_updates - 1):

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, "ppo" + env_name + ".pt"))

        log_interval = 10
        if j % log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j +
                               1) * ppo_args.num_processes * ppo_args.num_steps
            end = timer()

            printout(
                f'Updates {j}, num timesteps {total_num_steps}, FPS { int(total_num_steps / (end - start))} \n '
                f'Last {len(episode_rewards)} training episodes: mean/median reward {np.mean(episode_rewards):.1f}/{ np.median(episode_rewards):.1f}, '
                f'min/max reward {np.min(episode_rewards):.1f}/{np.max(episode_rewards):.1f}'
            )
Пример #25
0
def main():
    all_episode_rewards = []  ### 记录 6/29
    all_temp_rewards = []  ### 记录 6/29
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print('num_updates ', num_updates)
    print('num_steps ', args.num_steps)
    count = 0
    h5_path = './data/' + args.env_name
    if not os.path.exists(h5_path):
        os.makedirs(h5_path)
    h5_filename = h5_path + '/trajs_' + args.env_name + '_%05d.h5' % (count)
    data = {}
    data['states'] = []
    data['actions'] = []
    data['rewards'] = []
    data['done'] = []
    data['lengths'] = []

    episode_step = 0

    for j in range(num_updates):  ### num-steps

        temp_states = []
        temp_actions = []
        temp_rewards = []
        temp_done = []
        temp_lenthgs = []

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            if j == 0 and step == 0:
                print('obs ', type(rollouts.obs[step]),
                      rollouts.obs[step].shape)
                print('hidden_states ',
                      type(rollouts.recurrent_hidden_states[step]),
                      rollouts.recurrent_hidden_states[step].shape)
                print('action ', type(action), action.shape)
                print('action prob ', type(action_log_prob),
                      action_log_prob.shape)
                print('-' * 20)

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)

            #print(infos)
            #print(reward)
            temp_states += [np.array(rollouts.obs[step].cpu())]
            temp_actions += [np.array(action.cpu())]
            #temp_rewards += [np.array(reward.cpu())]
            temp_rewards += [np.array([infos[0]['myrewards']])
                             ]  ### for halfcheetah不能直接用 reward !! 6/29
            temp_done += [np.array(done)]

            if j == 0 and step == 0:
                print('obs ', type(obs), obs.shape)
                print('reward ', type(reward), reward.shape)
                print('done ', type(done), done.shape)
                print('infos ', len(infos))
                for k, v in infos[0].items():
                    print(k, v.shape)
                print()

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    all_episode_rewards += [info['episode']['r']]  ### 记录 6/29

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        temp_lengths = len(temp_states)
        temp_states = np.concatenate(temp_states)
        temp_actions = np.concatenate(temp_actions)
        temp_rewards = np.concatenate(temp_rewards)
        temp_done = np.concatenate(temp_done)
        #print('temp_lengths',temp_lengths)
        #print('temp_states', temp_states.shape)
        #print('temp_actions', temp_actions.shape)
        #print('temp_rewards', temp_rewards.shape)
        if j > int(0.4 * num_updates):
            data['states'] += [temp_states]
            data['actions'] += [temp_actions]
            data['rewards'] += [temp_rewards]
            data['lengths'] += [temp_lengths]
            data['done'] += [temp_done]
            #print('temp_lengths',data['lengths'].shape)
            #print('temp_states', data['states'].shape)
            #print('temp_actions', data['actions'].shape)
            #print('temp_rewards', data['rewards'].shape)

            if args.save_expert and len(data['states']) >= 100:
                with h5py.File(h5_filename, 'w') as f:
                    f['states'] = np.array(data['states'])
                    f['actions'] = np.array(data['actions'])
                    f['rewards'] = np.array(data['rewards'])
                    f['done'] = np.array(data['done'])
                    f['lengths'] = np.array(data['lengths'])
                    #print('f_lengths',f['lengths'].shape)
                    #print('f_states', f['states'].shape)
                    #print('f_actions', f['actions'].shape)
                    #print('f_rewards', f['rewards'].shape)

                count += 1
                h5_filename = h5_path + '/trajs_' + args.env_name + '_%05d.h5' % (
                    count)
                data['states'] = []
                data['actions'] = []
                data['rewards'] = []
                data['done'] = []
                data['lengths'] = []

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + "_%d.pt" % (args.seed)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            #np.save(os.path.join(save_path, args.env_name+"_%d"%(args.seed)), all_episode_rewards)  ### 保存记录 6/29
            #print(temp_rewards)
            print("temp rewards size", temp_rewards.shape, "mean",
                  np.mean(temp_rewards), "min", np.min(temp_rewards), "max",
                  np.max(temp_rewards))
            all_temp_rewards += [temp_rewards]
            np.savez(os.path.join(save_path,
                                  args.env_name + "_%d" % (args.seed)),
                     episode=all_episode_rewards,
                     timestep=all_temp_rewards)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
    '''data['states'] = np.array(data['states'])
Пример #26
0
def main():
    import copy
    import glob
    import os
    import time
    from collections import deque

    import gym
    import numpy as np
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim

    from a2c_ppo_acktr import algo
    from a2c_ppo_acktr.envs import make_vec_envs
    from a2c_ppo_acktr.storage import RolloutStorage
    from a2c_ppo_acktr.utils import get_vec_normalize, update_linear_schedule
    from a2c_ppo_acktr.visualize import visdom_plot

    device = torch.device('cuda')

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    print(envs.observation_space.shape)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space)

    actor_critic.to(device)

    agent = ProximalPolicyOptimization(actor_critic,
                                       args.clip_param,
                                       args.ppo_epoch,
                                       args.num_mini_batch,
                                       args.value_loss_coef,
                                       args.entropy_coef,
                                       lr=args.lr,
                                       eps=args.eps,
                                       max_grad_norm=args.max_grad_norm,
                                       chrono=chrono)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    start = time.time()
    for j in range(args.repeat):
        with chrono.time('train', verbose=True) as t:
            for n in range(args.number):

                with chrono.time('one_batch', verbose=True):

                    if args.use_linear_lr_decay:
                        # decrease learning rate linearly
                        if args.algo == "acktr":
                            # use optimizer's learning rate since it's hard-coded in kfac.py
                            update_linear_schedule(agent.optimizer, j,
                                                   num_updates,
                                                   agent.optimizer.lr)
                        else:
                            update_linear_schedule(agent.optimizer, j,
                                                   num_updates, args.lr)

                    if args.algo == 'ppo' and args.use_linear_clip_decay:
                        agent.clip_param = args.clip_param * (
                            1 - j / float(num_updates))

                    with chrono.time('generate_rollouts', verbose=True):
                        generate_rollouts(**locals())

                        with torch.no_grad():
                            next_value = actor_critic.get_value(
                                rollouts.obs[-1],
                                rollouts.recurrent_hidden_states[-1],
                                rollouts.masks[-1]).detach()

                    # ---
                    with chrono.time('compute_returns', verbose=True):
                        rollouts.compute_returns(next_value, args.use_gae,
                                                 args.gamma, args.tau)

                    with chrono.time('agent.update',
                                     verbose=True):  # 11.147009023304644
                        value_loss, action_loss, dist_entropy = agent.update(
                            rollouts)

                        #exp.log_batch_loss(action_loss)
                        #exp.log_metric('value_loss', value_loss)

                    with chrono.time('after_update', verbose=True):
                        rollouts.after_update()

                    total_num_steps = (j +
                                       1) * args.num_processes * args.num_steps
Пример #27
0
def train_ppo_fine_tune_joint(args):

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(2)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, True)

    actor_critic = Policy(  # 2-layer fully connected network
        envs.observation_space.shape,
        envs.action_space,
        base_kwargs={
            'recurrent': False,
            'hidden_size': 32
        })
    # behavioral cloning
    model = PusherPolicyModel()
    num_epochs = 20
    model.train(num_epochs=num_epochs)

    actor_critic.base.actor[0].weight.data.copy_(model.net.fc1.weight.data)
    actor_critic.base.actor[0].bias.data.copy_(model.net.fc1.bias.data)
    actor_critic.base.actor[2].weight.data.copy_(model.net.fc2.weight.data)
    actor_critic.base.actor[2].bias.data.copy_(model.net.fc2.bias.data)
    actor_critic.base.critic[0].weight.data.copy_(model.net.fc1.weight.data)
    actor_critic.base.critic[0].bias.data.copy_(model.net.fc1.bias.data)
    actor_critic.base.critic[2].weight.data.copy_(model.net.fc2.weight.data)
    actor_critic.base.critic[2].bias.data.copy_(model.net.fc2.bias.data)
    actor_critic.dist.fc_mean.weight.data.copy_(model.net.fc3.weight.data)
    actor_critic.dist.fc_mean.bias.data.copy_(model.net.fc3.bias.data)

    actor_critic.to(device)

    dataset = np.load('./expert.npz')
    obs_expert = torch.Tensor(dataset['obs'])
    actions_expert = torch.Tensor(dataset['action'])

    obs_expert.to(device)
    actions_expert.to(device)

    joint_loss_coef = 0.03

    agent = PPOJointLoss(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         joint_loss_coef=joint_loss_coef,
                         obs_expert=obs_expert,
                         actions_expert=actions_expert,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    episode_reward_means = []
    episode_reward_times = []

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

            episode_reward_means.append(np.mean(episode_rewards))
            episode_reward_times.append(total_num_steps)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

    print(episode_reward_means, episode_reward_times)

    return episode_reward_means, episode_reward_times
Пример #28
0
    def train_agent(self, num_env_steps):
        env_name = self.env_def.name

        obs = self.envs.reset()
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)

        n = 30
        episode_rewards = deque(maxlen=n)
        episode_values = deque(maxlen=n)
        episode_end_values = deque(maxlen=n)
        episode_end_probs = deque(maxlen=n)
        episode_lengths = deque(maxlen=n)
        compile_est = deque(maxlen=n)
        first_steps = [True for i in range(self.num_processes)]

        start = time.time()
        num_updates = int(
            num_env_steps) // self.num_steps // self.num_processes
        for j in range(num_updates):

            if self.use_linear_lr_decay:
                # decrease learning rate linearly
                utils.update_linear_schedule(
                    self.agent.optimizer, j, num_updates,
                    self.agent.optimizer.lr
                    if self.algo == "acktr" else self.lr)

            for step in range(self.num_steps):
                # Sample actions
                with torch.no_grad():
                    value, Q, action, action_prob, action_log_prob, recurrent_hidden_states = \
                        self.actor_critic.act(self.rollouts.obs[step],
                            self.rollouts.recurrent_hidden_states[step],
                            self.rollouts.masks[step])

                # Observe reward and next obs
                obs, reward, done, infos = self.envs.step(action)

                for i, step in enumerate(first_steps):
                    if step:
                        episode_values.append(value[i].item())
                    elif (done[i]):
                        episode_end_values.append(Q[i].item())
                        episode_end_probs.append(action_log_prob[i].item())
                first_steps = done

                for worker, info in enumerate(infos):
                    if 'episode' in info.keys():
                        r = info['episode']['r']
                        l = info['episode']['l']
                        episode_rewards.append(r)
                        episode_lengths.append(l)
                        if (r < -1):
                            compile_est.append(value[worker].item())

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])
                bad_masks = torch.FloatTensor(
                    [[0.0] if 'bad_transition' in info.keys() else [1.0]
                     for info in infos])
                self.rollouts.insert(obs, recurrent_hidden_states, action,
                                     action_prob, action_log_prob, value, Q,
                                     reward, masks, bad_masks)

            with torch.no_grad():
                next_value = self.actor_critic.get_value(
                    self.rollouts.obs[-1],
                    self.rollouts.recurrent_hidden_states[-1],
                    self.rollouts.masks[-1]).detach()

            if self.gail:
                if j >= 10:
                    self.envs.venv.eval()

                gail_epoch = self.gail_epoch
                if j < 10:
                    gail_epoch = 100  # Warm up
                for _ in range(gail_epoch):
                    self.gail_discr.update(
                        self.gail_train_loader, self.rollouts,
                        utils.get_vec_normalize(self.envs)._obfilt)

                for step in range(self.num_steps):
                    self.rollouts.rewards[
                        step] = self.gail_discr.predict_reward(
                            self.rollouts.obs[step],
                            self.rollouts.actions[step], self.gamma,
                            self.rollouts.masks[step])

            self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                          self.gae_lambda,
                                          self.use_proper_time_limits)

            value_loss, action_loss, dist_entropy = self.agent.update(
                self.rollouts)
            if (self.reconstruct):
                recon_loss = self.update_reconstruction(self.rollouts)
                self.writer.add_scalar('generator/Reconstruction Loss',
                                       recon_loss.item(), self.total_steps)

            self.rollouts.after_update()

            #Tensorboard Reporting
            self.total_steps += self.num_processes * self.num_steps
            self.writer.add_scalar('value/Mean Reward',
                                   np.mean(episode_rewards), self.total_steps)
            self.writer.add_scalar('value/Episode Mean Length',
                                   np.mean(episode_lengths), self.total_steps)
            self.writer.add_scalar('policy/Action Loss', action_loss,
                                   self.total_steps)
            self.writer.add_scalar('value/Value Loss', value_loss,
                                   self.total_steps)
            self.writer.add_scalar('policy/Distribution Entropy', dist_entropy,
                                   self.total_steps)
            self.writer.add_scalar('value/Win Probability',
                                   np.mean(np.array(episode_rewards) > 0),
                                   self.total_steps)
            self.writer.add_scalar('value/Starting Value',
                                   np.mean(episode_values), self.total_steps)
            #self.writer.add_scalar('value/Ending Value', np.mean(episode_end_values), self.total_steps)
            self.writer.add_scalar('value/Log Probs',
                                   np.mean(episode_end_probs),
                                   self.total_steps)
            if (len(compile_est) > 0):
                self.writer.add_scalar('value/Compile Estimate',
                                       np.mean(compile_est), self.total_steps)

            # save for every interval-th episode or for the last epoch
            total_num_steps = (j + 1) * self.num_processes * self.num_steps
            end = time.time()
            if (j % self.save_interval == 0
                    or j == num_updates - 1) and self.save_dir != "":
                self.version += 1
                #self.save(self.version)
                self.report(self.version, total_num_steps,
                            int(total_num_steps / (end - start)),
                            episode_rewards)

            if j % self.log_interval == 0 and len(episode_rewards) > 1:
                print(
                    "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                    .format(j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            len(episode_rewards), np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards), np.max(episode_rewards),
                            dist_entropy, value_loss, action_loss))
Пример #29
0
def main():

    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy,
                              'hidden_size': args.hidden_size
                          })
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    rewards = []

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)
            rewards.extend(reward)
            for info in infos:
                if 'episode' in info.keys():
                    # print( info['episode'] )
                    episode_rewards.append(info['episode']['r'] /
                                           info['episode']['l'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # print( episode_rewards )
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.3f}/{:.3f}, min/max reward {:.3f}/{:.3f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            # print( list( map( lambda x : x["score"] , infos ) ) )
            rewards = []

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
Пример #30
0
def main():
    chrono = exp.chrono()

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = PPO(actor_critic,
                    args.clip_param,
                    args.ppo_epoch,
                    args.num_mini_batch,
                    args.value_loss_coef,
                    args.entropy_coef,
                    lr=args.lr,
                    eps=args.eps,
                    max_grad_norm=args.max_grad_norm,
                    chrono=chrono)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(args.repeat):
        with chrono.time('train') as t:
            for n in range(args.number):

                with chrono.time('one_batch'):

                    if args.use_linear_lr_decay:
                        # decrease learning rate linearly
                        if args.algo == "acktr":
                            # use optimizer's learning rate since it's hard-coded in kfac.py
                            update_linear_schedule(agent.optimizer, j,
                                                   num_updates,
                                                   agent.optimizer.lr)
                        else:
                            update_linear_schedule(agent.optimizer, j,
                                                   num_updates, args.lr)

                    if args.algo == 'ppo' and args.use_linear_clip_decay:
                        agent.clip_param = args.clip_param * (
                            1 - j / float(num_updates))

                    with chrono.time('generate_rollouts'):
                        generate_rollouts(**locals())

                        with torch.no_grad():
                            next_value = actor_critic.get_value(
                                rollouts.obs[-1],
                                rollouts.recurrent_hidden_states[-1],
                                rollouts.masks[-1]).detach()

                    # ---
                    with chrono.time('compute_returns'):
                        rollouts.compute_returns(next_value, args.use_gae,
                                                 args.gamma, args.tau)

                    with chrono.time('agent.update'):  # 11.147009023304644
                        value_loss, action_loss, dist_entropy = agent.update(
                            rollouts)

                        exp.log_batch_loss(action_loss)
                        exp.log_metric('value_loss', value_loss)

                    with chrono.time('after_update'):
                        rollouts.after_update()

                    total_num_steps = (j +
                                       1) * args.num_processes * args.num_steps

                    if args.eval_interval is not None and len(
                            episode_rewards
                    ) > 1 and j % args.eval_interval == 0:
                        eval_model(**locals())

            # -- number
        # -- chrono
        exp.show_eta(j, t)
    # -- epoch
    exp.report()