Example #1
def main():
    print("config:\n")
    print("activation:", args.activation)
    print("evaluation:", args.evaluation)
    print("evaluation mode:", args.evaluation_mode)
    print("evaluation layer:", args.evaluation_layer)
    writer = SummaryWriter()
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                        args.gamma, args.log_dir, args.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy},
                          activation=args.activation, modulation=args.evaluation)
    # load trained model
    if args.load_model_path is not None:
        state_dicts = torch.load(args.load_model_path)
        actor_critic.load_nets(state_dicts)

    actor_critic.to(device)


    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    # elif args.algo == 'ppo':
    #     agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
    #                      args.value_loss_coef, args.entropy_coef, lr=args.lr,
    #                            eps=args.eps,
    #                            max_grad_norm=args.max_grad_norm)
    # elif args.algo == 'acktr':
    #     agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
    #                            args.entropy_coef, acktr=True)


    tonic_g = 1
    phasic_g = 1
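    # When modulation ("evaluation") is enabled, override the default gains for the selected layer.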
    if args.evaluation and args.evaluation_layer == 1:  # f1 modulation
        tonic_g = args.f1_tonic_g
        phasic_g = args.f1_phasic_g
    if args.evaluation and args.evaluation_layer == 0:  # input activation
        tonic_g = args.input_tonic_g
        phasic_g = args.input_phasic_g

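    # One gain value per process, initialised to the tonic level; g_device keeps a copy on the training device.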
    g = torch.ones(args.num_processes,1)*tonic_g
    g_device = (torch.ones(args.num_processes,1)*tonic_g).to(device)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                        envs.observation_space.shape, envs.action_space,
                        actor_critic.recurrent_hidden_state_size, tonic_g)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    pre_value = [None for i in range(args.num_processes)]
    evaluations = [0 for i in range(args.num_processes)]
    # Preallocated buffers used to calculate next_value and update g at each step
    next_recurrent_hidden_states = torch.zeros(args.num_processes, actor_critic.recurrent_hidden_state_size).to(device)
    next_g = torch.zeros(args.num_processes,1).to(device)
    next_masks = torch.zeros(args.num_processes,1).to(device)
    next_obs = torch.zeros(args.num_processes, *envs.observation_space.shape).to(device)

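    # Main training loop: collect num_steps transitions per process, then update the agent.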
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.g[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # calculate next value with old g and decide new g
            if args.evaluation:
                if args.evaluation_layer == 0:
                    next_obs.copy_(neural_activity(obs,g_device))
                else:
                    next_obs.copy_(obs/255)
                next_recurrent_hidden_states.copy_(recurrent_hidden_states)
                next_g.copy_(g)
                next_masks.copy_(masks)
                with torch.no_grad():
                    next_value = actor_critic.get_value(next_obs,
                                                next_g,
                                                next_recurrent_hidden_states,
                                                next_masks).detach()
                evaluations, g, pre_value = calc_modes(reward, next_value, pre_value, evaluations, args.evaluation_mode, tonic_g, phasic_g, masks)
                g_device.copy_(g)

            # observation processing with new g
            if args.evaluation and args.evaluation_layer == 0:
                obs = neural_activity(obs, g_device)
            else:
                obs = obs/255.0

            for idx in range(len(infos)):
                info = infos[idx]
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    steps_done = j*args.num_steps*args.num_processes + step*args.num_processes + idx
                    writer.add_scalar('data/reward', info['episode']['r'], steps_done )
            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, g)

            # record evaluation value to help decide parameters to switch modes
            if args.evaluation_log:
                writer.add_scalar('data/evaluations', evaluations[0], j*args.num_steps*args.num_processes + step*args.num_processes)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.g[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()
        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            state_dicts = actor_critic.save_nets()
            torch.save(state_dicts, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
Example #2
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
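    # Outer loop over policy updates; each update uses num_steps transitions from each of num_processes environments.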
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

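        # Bootstrap the value of the last observation, compute returns, and update the policy.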
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n dist_entropy {:.2f}, value/action loss {:.2f}/{:.2f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

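            # Share the training observation-normalisation statistics (ob_rms) with the eval envs, frozen via eval().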
            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
Example #3
def main():

    print('Preparing parameters')

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    print('Creating envs: {}'.format(args.env_name))

    envs = test_mp_envs(args.env_name, args.num_processes)

    print('Creating network')
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    print('Initializing PPO')
    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    print('Memory')
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = []

    num_episodes = [0 for _ in range(args.num_processes)]

    last_index = 0

    print('Starting ! ')

    start = time.time()
    for j in tqdm(range(num_updates)):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(
                    rollouts.obs[step], rollouts.masks[step])

            obs, reward, done, infos = envs.step(action)

            for info_num, info in enumerate(infos):
                if info_num == 0:
                    if 'episode' in info.keys():
                        episode_rewards.append(info['episode']['r'])
                        # end_episode_to_viz(writer, info, info_num, num_episodes[info_num])
                        num_episodes[info_num] += 1
                        plot_rewards(episode_rewards, args)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            rollouts.insert(obs, action, action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
        losses = agent.update(rollouts)
        rollouts.after_update()
Example #4
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    average_actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy})
    average_actor_critic.load_state_dict(actor_critic.state_dict())
    actor_critic.to(device)
    average_actor_critic.to(device)

    agent = algo.ACER_AGENT(actor_critic,
                            average_actor_critic,
                            args.value_loss_coef,
                            args.entropy_coef,
                            args.gamma,
                            args.clip,
                            args.no_trust_region,
                            args.alpha,
                            args.delta,
                            lr=args.lr,
                            eps=args.eps,
                            rms_alpha=args.rms_alpha,
                            max_grad_norm=args.max_grad_norm)

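    # Replay buffer for off-policy ACER, plus separate on-policy and off-policy rollout storages.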
    buffer = Buffer(args.num_steps, args.num_processes,
                    envs.observation_space.shape, envs.action_space,
                    actor_critic.recurrent_hidden_state_size, args.buffer_size)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    off_rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    off_rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    acer = algo.ACER(actor_critic, rollouts, off_rollouts, buffer,
                     episode_rewards, agent, envs)

    start = time.time()
    for j in range(num_updates):
        # On-policy ACER
        value_loss, action_loss, dist_entropy = acer.call(on_policy=True)
        if args.replay_ratio > 0 and buffer.has_atleast(args.replay_start):
            # Off-policy ACER
            n = np.random.poisson(args.replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \nLast {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\ndist_entropy {:.1f}, value/action loss {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            eval_episode_rewards = []

            obs = eval_envs.reset().to(device)
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, _, _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, _, done, infos = eval_envs.step(action)

                obs = obs.to(device)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done]).to(device)
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))
Example #5
def main():
    args = get_args()
    args.num_processes = 16
    args.env_name = 'BreakoutNoFrameskip-v4'

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = A2C_ACKTR(actor_critic,
                          args.value_loss_coef,
                          args.entropy_coef,
                          lr=args.lr,
                          eps=args.eps,
                          alpha=args.alpha,
                          max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
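            # bad_masks flags time-limit terminations ('bad_transition') so they are not treated as true episode ends.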
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n dist_entropy {:.2f}, value/action loss {:.2f}/{:.2f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
Example #6
def main():

    print('Preparing parameters')

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # print('Initializing visdom')
    # if args.vis:
    #     from visdom import Visdom
    #     viz = Visdom(port=args.port)
    #     win = None

    print('Creating envs')
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    print('Creating network')
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    print('Initializing PPO')
    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)
    print('Memory')
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    # ===================== TB visualisation =================

    writer = SummaryWriter()
    last_index = 0

    print('Starting ! ')

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(
                    rollouts.obs[step], rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, action, action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        writer.add_scalar('Agents metrics/Policy loss', action_loss, j)
        writer.add_scalar('Agents metrics/Value loss', value_loss, j)
        writer.add_scalar('Agents metrics/Entropy loss', dist_entropy, j)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n dist_entropy {:.2f}, value/action loss {:.2f}/{:.2f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if j % args.vis_interval == 0:
            try:

                # Sometimes monitor doesn't properly flush the outputs
                # win, tx, ty = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames)
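                # Pull any new (timestep, reward) points from the monitor log and append them to TensorBoard.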
                tx, ty = get_reward_log(args.log_dir)
                if tx is not None and ty is not None:
                    max_index = len(tx)
                    for ind_iter in range(last_index, max_index):
                        writer.add_scalar('Reward', ty[ind_iter], tx[ind_iter])
                    last_index = max_index

                # tx, ty = get_reward_log(viz, win, args.log_dir, args.env_name,
                #                   args.algo, args.num_frames)

                # if tx != None and ty != None:
                #     plt.cla()
                #     plt.plot(tx,ty)
                #     plt.pause(0.1)

                #     plt.show()

                # if(ty != None and tx != None):

                #     input(ty)
                #     writer.add_scalar('Reward', ty[-1], tx[-1])
                # if(tx != None and ty != None):
                #     plt.cla()
                #     plt.plot(tx, ty)
                #     plt.pause(0.1)
            except IOError:
                pass
Example #7
def main(args):
    env = GymEnvironment(args, gamma)
    env.env = env.env.unwrapped

    actor_critic = Policy(obs_shape,
                          env.action_size,
                          base_kwargs={'recurrent': False})
    actor_critic.load_state_dict(torch.load('log/model.pt'))
    actor_critic.to(device)

    agent = PPO(actor_critic, clip_param, ppo_epoch, num_mini_batch,
                value_loss_coef, entropy_coef, lr, eps, max_grad_norm)
    rollouts = RolloutStorage(num_steps, num_processes, obs_shape,
                              env.action_space,
                              actor_critic.recurrent_hidden_state_size)
    current_obs = torch.zeros(num_processes, *obs_shape)

    obs, _, _, _ = env.new_expt()
    obs = obs[np.newaxis, ...]

    current_obs[:, -1] = torch.from_numpy(obs)
    rollouts.obs[0].copy_(current_obs)

    current_obs = current_obs.to(device)
    rollouts.to(device)

    num_updates = math.ceil(args.max_timesteps / (num_processes * num_steps))
    n_goal_reached = 0
    n_episodes = 0
    for j in range(num_updates):
        for step in range(num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            cpu_actions = action.squeeze(1).cpu().numpy()

            (obs, reward, done), goal_reached = env.act(action)
            reward = torch.from_numpy(np.expand_dims(np.stack([reward]),
                                                     1)).float()

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in [done]])

            masks = masks.to(device)

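            # Shift the frame stack left; the newest observation goes into the last slot (stack cleared when the episode ends).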
            current_obs[:, :-1] = current_obs[:, 1:]
            if done:
                current_obs[:] = 0
            current_obs[:, -1] = torch.from_numpy(obs)
            rollouts.insert(current_obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

            if done:
                n_episodes += 1
                env.new_expt()
                if goal_reached:
                    n_goal_reached += 1

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                rollouts.masks[step]).detach()

        rollouts.compute_returns(next_value, use_gae, gamma, tau, step)
        value_loss, action_loss, dist_entropy = agent.update(rollouts, step)
        rollouts.after_update()

        if j % log_interval == 0:
            total_num_steps = (j + 1) * num_processes * num_steps

            try:
                success = float(n_goal_reached) / n_episodes
            except ZeroDivisionError:
                success = 0.
            print(
                "Timesteps: {}, Goal reached : {} / {}, Success %: {}".format(
                    total_num_steps, n_goal_reached, n_episodes, success))

    if args.lang_coeff > 0:
        av_list = np.array(env.action_vectors_list)
        for k in range(len(spearman_corr_coeff_actions)):
            sr, _ = spearmanr(env.rewards_list, av_list[:, k])
            print(k, sr)
Example #8
def setup(model_setting, algorithm, device, _run, _log, log, seed, cuda):
    """
    All args are automatically provided by sacred
    Some of the important objects created in this function are:
        - parallel environments (using SubprocVecEnv from OpenAI baselines)
        - instance of model (BMIL)
        - experience replay
        - RolloutStorage: a helper class to save rewards and compute the advantage loss
    """

    # Create working dir
    id_tmp_dir = "{}/{}/".format(log['tmp_dir'], _run._id)
    helpers.safe_make_dirs(id_tmp_dir)

    np.set_printoptions(precision=2)

    torch.manual_seed(seed)
    np.random.seed(seed)
    if cuda:
        torch.cuda.manual_seed(seed)

    logger = logging.getLogger()
    if _run.debug or _run.pdb:
        logger.setLevel(logging.DEBUG)

    envs = register_and_create_envs(id_tmp_dir)
    model = create_model(envs)

    # Experience replay buffer to store off-policy data.
    replay = ExpReplay(batch_size=algorithm['num_processes_offPol'],
                       max_trajs=1000,
                       fwd_jump=algorithm['forward_jump'],
                       bwd_jump=algorithm['backward_jump'])

    rollouts = RolloutStorage(algorithm['num_steps'],
                              algorithm['num_processes'])
    rollouts.to(device)

    # Reset all environments
    obs = envs.reset()
    curr_ob = torch.from_numpy(obs).float()

    init_state = torch.zeros(algorithm['num_processes'],
                             model_setting['belief_dim']).to(device)
    init_state_offPol = torch.zeros(algorithm['num_processes_offPol'],
                                    model_setting['belief_dim']).to(device)
    init_episode_reward_info = torch.zeros([algorithm['num_processes'], 1])
    init_ac = torch.zeros(algorithm['num_processes'],
                          envs.action_space.shape[0]).to(device)

    # Buffer to hold information along the current "on-policy" path.
    curr_memory = {
        'curr_ob': curr_ob,  # o_t
        'prev_belief': init_state,  # b_{t-1}
        'prev_ac': init_ac,  # a_{t-1}
        'prev_ob': curr_ob.clone(),  # o_{t-1}
        'expert_ac': init_ac.clone(),
        'episode_reward_info': init_episode_reward_info
    }

    # Buffer to hold information along the current "off-policy" path.
    curr_memory_offPol = {
        'curr_ob': None,
        'prev_ob': None,
        'prev_belief': init_state_offPol,
        'prev_ac': None,
        'ob_tpk': None,  # o_{t+k}
        'ob_tmkm1': None,  # o_{t-k-1}
        'future_k_acs': None,  # a_t:a_{t+k-1}
        'past_k_acs': None,  # a_{t-k-1}:a_{t-2}
        'future_mask': None,  # mask for o_{t+k}
        'past_mask': None  # mask for o_{t-k-1}
    }

    return envs, model, rollouts, curr_memory, curr_memory_offPol, replay
Example #9
def main():
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
                for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs, gamma=args.gamma)

    obs_shape = envs.observation_space.shape
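    # Account for frame stacking: the effective observation has num_stack times the original channels.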
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    actor_critic = Policy(obs_shape, envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                               eps=args.eps,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
        envs.action_space, actor_critic.recurrent_hidden_state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    obs = envs.reset()
    update_current_obs(obs, current_obs, obs_shape, args.num_stack)
    rollouts.obs[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    current_obs = current_obs.to(device)
    rollouts.to(device)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
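            # When an episode ends (mask 0), move its accumulated reward into final_rewards and reset the accumulator.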
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            masks = masks.to(device)

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs, current_obs, obs_shape, args.num_stack)
            rollouts.insert(current_obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                            hasattr(envs, 'ob_rms') and envs.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), dist_entropy,
                       value_loss, action_loss))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
Example #10
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    utils.cleanup_log_dir(log_dir)

    with open(log_dir + 'extras.csv', "w") as file:
        file.write("n, value_loss\n")

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    model = Policy(envs.observation_space.shape,
                   envs.action_space.n,
                   extra_kwargs={'use_backpack': args.algo == 'tdprop'})
    model.to(device)

    if args.algo == 'tdprop':
        from algo.sarsa_tdprop import SARSA
        agent = SARSA(model,
                      lr=args.lr,
                      eps=args.eps,
                      max_grad_norm=args.max_grad_norm,
                      beta_1=args.beta_1,
                      beta_2=args.beta_2,
                      n=args.num_steps,
                      num_processes=args.num_processes,
                      gamma=args.gamma)
    else:
        from algo.sarsa import SARSA
        agent = SARSA(model,
                      lr=args.lr,
                      eps=args.eps,
                      max_grad_norm=args.max_grad_norm,
                      beta_1=args.beta_1,
                      beta_2=args.beta_2,
                      algo=args.algo)

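    # Epsilon-greedy exploration over the Q-network's action values.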
    explore_policy = utils.eps_greedy
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              model.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                qs = model(rollouts.obs[step])
                _, dist = explore_policy(qs, args.exploration)
                actions = dist.sample().unsqueeze(-1)
                value = qs.gather(-1, actions)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(actions)
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, torch.FloatTensor([0.0]), actions, value,
                            value, reward, masks, bad_masks)
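        # Expected-SARSA bootstrap: expected Q under the exploration policy at the last observation.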
        with torch.no_grad():
            next_qs = model(rollouts.obs[-1])
            next_probs, _ = explore_policy(next_qs, args.exploration)
            next_value = (next_probs * next_qs).sum(-1).unsqueeze(-1)

        rollouts.compute_returns(next_value, args.gamma)

        value_loss = agent.update(rollouts, explore_policy, args.exploration)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0 or j == num_updates - 1):
            save_path = os.path.join(args.log_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            torch.save([
                list(model.parameters()),
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                ("Updates {}, num timesteps {}, FPS {}\n"
                 "Last {} training episodes: mean/median reward {:.1f}/{:.1f}"
                 ", min/max reward {:.1f}/{:.1f}\n"
                 "entropy {:.2f}, value loss {:.4f}")
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist.entropy().mean().item(), value_loss))
            with open(log_dir + 'extras.csv', "a") as file:
                file.write(
                    str(total_num_steps) + ", " + str(value_loss) + "\n")
Example #11
def main():
    import matplotlib.pyplot as plt

    # You probably won't need this if you're embedding things in a tkinter plot...
    plt.ion()
    x = np.linspace(0, 6 * np.pi, 100)
    y = np.sin(x)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    import time
    line1, = ax.plot([0, 1, 2], [0, 1, 1],
                     'r-')  # Returns a tuple of line objects, thus the comma
    time.sleep(0.01)

    torch.set_num_threads(1)
    args.num_processes = 1
    # device = torch.device("cuda:0" if args.cuda else "cpu")
    device = torch.device("cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = TorchRunner(acc=0.005)
    ob_shape = envs.reset().shape
    # envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
    #                     args.gamma, args.log_dir, args.add_timestep, device, False)
    #
    actor_critic = Policy(ob_shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})

    # # try to load the previous policy
    # data = torch.load(
    #     r"C:\Users\clvco\URA_F18\pytorch-a2c-ppo-acktr\trained_models\ppo\weight_positiverev_test.pt")
    # # # print(data)
    # actor_critic.load_state_dict(data[0].state_dict())
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    obs = envs.reset()
    ob_shape = obs.shape
    rollouts = RolloutStorage(args.num_steps, args.num_processes, ob_shape,
                              envs.action_space,
                              (agent.actor_critic.base.output_size), (1),
                              actor_critic.recurrent_hidden_state_size)
    print(args.num_processes)
    print(envs.observation_space.shape)
    print(obs.shape)
    print(rollouts.obs[0].shape)
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = list()
    ep_reward = 0
    import tqdm
    start = time.time()
    print(args)
    print(int(args.num_frames) // args.num_steps // args.num_processes)
    print('NUM', num_updates)
    timestep = 0
    ep_ends = []
    for j in range(num_updates):
        if j == 0:
            print("UPDATING SYNERGY")
            actor_critic.adjust_synergy(0.0)
        for step in tqdm.tqdm(range(args.num_steps)):
            # Sample actions
            timestep += 1
            with torch.no_grad():
                value, action, synergy, q, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            ep_reward += reward[0]
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            if done[0]:
                obs = envs.reset()
                episode_rewards.append(ep_reward)
                ep_ends.append(timestep)
                ep_reward = 0
            # print(action)
            rollouts.insert(obs, recurrent_hidden_states, action, synergy, q,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model]
            print("Saving model")
            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))
            print("Saved model to: ",
                  os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        print("update time", len(episode_rewards))
        if True:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.5f}/{:.5f}, min/max reward {:.5f}/{:.5f}\n dist_entropy {:.2f}, value/action loss {:.2f}/{:.2f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards[-10:]),
                        np.median(episode_rewards[-10:]),
                        np.min(episode_rewards[-10:]),
                        np.max(episode_rewards[-10:]), dist_entropy,
                        value_loss, action_loss))

            import time
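            # Smooth the live plot with a 10-episode moving average of returns.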
            ydata = np.convolve(episode_rewards,
                                np.ones(10) / 10,
                                mode='valid')
            line1.set_xdata(np.arange(0, len(ydata)))
            line1.set_ydata(ydata)
            ax.set_xlim(0, len(ydata))
            ax.set_ylim(min(ydata), max(ydata))
            fig.canvas.draw()
            fig.canvas.flush_events()
            time.sleep(0.01)
            # save the returns
            xdata = np.array(ep_ends)
            ret_dir = 'returns_weight_experiments'
            os.makedirs(ret_dir, exist_ok=True)
            ret_path = ret_dir + '/' + args.env_name + '_' + str(
                args.seed) + '.npy'
            ep_path = ret_dir + '/' + "x_data-" + args.env_name + '_' + str(
                args.seed) + '.npy'
            np.save(ret_path, np.array(episode_rewards))
            np.save(ep_path, ep_ends)
Example #12
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, 1, args.gammas[-1], None,
                         args.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy,
                              'num_values': args.num_values,
                              'sum_values': args.sum_values
                          })

    state_dict = torch.load(args.log_dir + '/ppo/' + args.env_name + '.pt')
    actor_critic.load_state_dict(state_dict[0].state_dict())
    actor_critic.to(device)

    rollouts = RolloutStorage(1,
                              1,
                              envs.observation_space.shape,
                              envs.action_space,
                              actor_critic.recurrent_hidden_state_size,
                              tau=args.tau,
                              gammas=args.gammas,
                              use_delta_gamma=args.use_delta_gamma,
                              use_capped_bias=args.use_capped_bias)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = []
    values = []
    rewards = []
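    # Evaluate under up to 30 no-op starts: for each setting, the first num_no_ops+1 steps send a zero
    # (no-op) action to the env before the learned policy takes over, and one full episode is run.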
    for num_no_ops in range(30):
        really_done = False
        cur_step = 0
        while not really_done:
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    obs, rollouts.recurrent_hidden_states[0],
                    rollouts.masks[0])

            if cur_step <= num_no_ops:
                obs, reward, done, infos = envs.step(torch.zeros((1, 1)))
            else:
                # Sample actions

                # Observe reward and next obs
                obs, reward, done, infos = envs.step(action)

            if num_no_ops == 0:
                if device == 'cpu':
                    rewards.append(reward.numpy())
                    values.append(value.numpy())
                else:
                    rewards.append(reward.cpu().numpy())
                    values.append(value.cpu().numpy())

            if 'episode' in infos[0].keys():
                really_done = True
                episode_rewards.append(infos[0]['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)
            cur_step += 1

    with open(args.log_dir + '/random_rewards.pkl', 'wb') as handle:
        pickle.dump(episode_rewards, handle)

    with open(args.log_dir + '/values_timestep.pkl', 'wb') as handle:
        pickle.dump(values, handle)

    with open(args.log_dir + '/rewards_timestep.pkl', 'wb') as handle:
        pickle.dump(rewards, handle)
Example #13
0
def main(args):
    env = GymEnvironment(args, gamma)
    env.env = env.env.unwrapped

    actor_critic = Policy(obs_shape,
                          env.action_size,
                          base_kwargs={'recurrent': False})
    actor_critic.to(device)

    agent = PPO(actor_critic, clip_param, ppo_epoch, num_mini_batch,
                value_loss_coef, entropy_coef, lr, eps, max_grad_norm)
    rollouts = RolloutStorage(num_steps, num_processes, obs_shape,
                              env.action_space,
                              actor_critic.recurrent_hidden_state_size)
    current_obs = torch.zeros(num_processes, *obs_shape)

    obs, _, _, _ = env.new_expt()
    obs = obs[np.newaxis, ...]

    current_obs[:, -1] = torch.from_numpy(obs)
    rollouts.obs[0].copy_(current_obs)

    current_obs = current_obs.to(device)
    rollouts.to(device)

    num_updates = math.ceil(args.max_timesteps / (num_processes * num_steps))
    n_goal_reached = 0
    n_episodes = 0
    for j in tqdm(range(num_updates), ascii=True):
        for step in range(num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            cpu_actions = action.squeeze(1).cpu().numpy()

            (obs, reward, done), goal_reached = env.act(action)
            reward = torch.from_numpy(np.expand_dims(np.stack([reward]),
                                                     1)).float()

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in [done]])

            masks = masks.to(device)

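            # Shift the frame stack left by one and write the newest observation into the last slot.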
            current_obs[:, :-1] = current_obs[:, 1:]
            if done:
                current_obs[:] = 0
            current_obs[:, -1] = torch.from_numpy(obs)
            rollouts.insert(current_obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

            if done:
                n_episodes += 1
                env.new_expt()
                if goal_reached:
                    n_goal_reached += 1

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                rollouts.masks[step]).detach()

        rollouts.compute_returns(next_value, use_gae, gamma, tau, step)
        value_loss, action_loss, dist_entropy = agent.update(rollouts, step)
        rollouts.after_update()

        torch.save(agent.actor_critic.state_dict(), 'log/model.pt')
def main():
    '''
    Train PPO policies on each of the training environments.
    '''
    args = get_args()

    try:
        os.makedirs(args.log_dir)
    except OSError:
        pass

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args, device)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': False})
    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    ep_reward = np.zeros(args.num_processes)
    episode_rewards = deque(maxlen=100)
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):
        # decrease learning rate linearly
        utils.update_linear_schedule(agent.optimizer, j, num_updates, args.lr)
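        # update_linear_schedule is assumed to anneal the optimizer's learning rate linearly from args.lr toward zero as j approaches num_updates.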

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            if 'spaceship' in args.env_name:  # spaceship, swimmer
                for i in range(len(done)):
                    if done[i]:
                        episode_rewards.append(reward[i].item())
            # elif 'swimmer' in args.env_name:
            else:
                for i in range(len(done)):
                    ep_reward[i] += reward[i].numpy().item()
                    if done[i]:
                        episode_rewards.append(ep_reward[i])
                        ep_reward[i] = 0
            # if 'ant' in args.env_name:
            #     for info in infos:
            #         if 'episode' in info.keys():
            #             episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, True, args.gamma, args.gae_lambda,
                                 True)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            try:
                os.makedirs(args.save_dir)
            except OSError:
                pass


            torch.save(
                actor_critic.state_dict(),
                os.path.join(args.save_dir, "ppo.{}.env{}.seed{}.pt"\
                    .format(args.env_name, args.default_ind, args.seed))
            )

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("\nUpdates {}, num timesteps {}, Last {} training episodes: \
                \n mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}"
                  .format(j, total_num_steps, len(episode_rewards),
                          np.mean(episode_rewards), np.median(episode_rewards),
                          np.min(episode_rewards), np.max(episode_rewards)))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, device)

    envs.close()
Example #15
0
                               max_grad_norm=parameters['max_grad_norm'],
                               use_adam=parameters['use_adam'])
else:
    agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                               eps=args.eps,
                               max_grad_norm=args.max_grad_norm)

rollouts = RolloutStorage(parameters['num_steps'], parameters['num_processes'],
                    envs.observation_space.shape, envs.action_space,
                    actor_critic.recurrent_hidden_state_size)


obs = envs.reset()
rollouts.obs[0].copy_(obs)
rollouts.to(device)

recent_count = 50
episode_rewards = deque(maxlen=recent_count)
episode_lengths = deque(maxlen=recent_count)

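# When resuming, restore the update counter from the saved progress JSON and reload the saved policy weights; otherwise start from update 0.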
if args.continue_training:
    progress = json.load(open(progress_save))
    num_updates_init = progress["last_saved_num_updates"] 
    actor_critic.load_state_dict(torch.load(MODEL_SAVE_PATH))
else:
    num_updates_init = 0
    progress = {
        "last_saved_num_updates": 0
    }
Example #16
0
def main():
    device = 'cpu'
    acc_steps = []
    acc_scores = []
    torch.set_num_threads(1)
    print('here')

    if args.env_name == 'Reacher-v2':
        rbf1 = build_features_reacher2(.2, 5, 2)
        len_rbf = rbf1._K
        len_features = len_rbf + 1
    if args.env_name == 'Hopper-v2':
        len_features = 3
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space)

    actor_critic.to(device)

    agent = PPO(actor_critic,
                args.clip_param,
                args.ppo_epoch,
                args.num_mini_batch,
                args.value_loss_coef,
                args.entropy_coef,
                lr=args.lr,
                eps=args.eps,
                max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              len_features)
    print('here2')
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    episode_rewards = collections.deque(maxlen=10)
    num_updates = 20
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            update_linear_schedule(agent.optimizer, j, num_updates, args.lr)
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        # Prepare demos
        demo_actions = np.zeros(
            (1, args.num_processes, envs.action_space.shape[0]))
        demo_states = np.zeros(
            (1, args.num_processes, envs.observation_space.shape[0]))

        demo_features = np.zeros((1, args.num_processes, len_features))
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(
                    rollouts.obs[step], rollouts.masks[step])

            # obs, reward and next obs
            demo_actions = np.concatenate(
                [demo_actions,
                 action.reshape(1, args.num_processes, -1)], 0)
            demo_states = np.concatenate([
                demo_states, rollouts.obs[step].reshape(
                    1, args.num_processes, -1)
            ], 0)
            feat_rewards = np.zeros((args.num_processes, len_features))
            if args.env_name == 'Hopper-v2':
                if args.num_processes > 1:
                    pos_before = envs.get_sim_data()
            obs, reward, done, infos = envs.step(action)
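            # Hopper reward features per process: change in simulator position over the step, a not-done (alive) indicator, and the squared L2 norm of the action.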
            if args.env_name == 'Hopper-v2':
                if args.num_processes > 1:
                    pos_after = envs.get_sim_data()
                    for num_p in range(args.num_processes):
                        feat_1 = pos_after[num_p] - pos_before[num_p]
                        feat_2 = 0
                        if not done[num_p]:
                            feat_2 = 1
                        # feat_2 = np.array([1 for _ in range(args.num_processes)])
                        feat_3 = np.array(
                            [np.linalg.norm(action[num_p],
                                            ord=2)**2]).flatten()
                        feat_rewards[num_p] = np.array(
                            [feat_1, feat_2, feat_3])
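            # Reacher reward features: RBF activations of the fingertip-to-target displacement (first two coordinates) plus an action-norm term.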
            if args.env_name == 'Reacher-v2':
                if args.num_processes > 1:
                    body_data = envs.get_body_data()
                    for num_p in range(args.num_processes):
                        rbf1_ = rbf1(body_data[num_p][:-1])
                        rbf4_ = np.array(
                            [np.linalg.norm(action[num_p], ord=2)**2])
                        feat_rewards[num_p] = np.concatenate(
                            (rbf1_.reshape(-1), rbf4_))
                else:
                    rbf1_ = rbf1(
                        (envs.envs[0].env.env.get_body_com("fingertip") -
                         envs.envs[0].env.env.get_body_com("target"))[:-1])
                    rbf4_ = np.array([-np.square(action[0]).sum()])
                    feat_rewards[0] = np.concatenate(
                        (rbf1_.reshape(-1), rbf4_))
            demo_features = np.concatenate([
                demo_features,
                feat_rewards.reshape(1, args.num_processes, -1)
            ], 0)
            if step > 1 and step % 1000 == 0:
                done = [True for _ in range(args.num_processes)]

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, action, action_log_prob, \
                            value, reward, masks, feat_rewards)

        # Save demos:
        action_file_name = demos_expe_dir + '/actions_step_' + str(j) + '.npy'
        state_file_name = demos_expe_dir + '/states_step_' + str(j) + '.npy'
        rew_feat_file_name = demos_expe_dir + '/rew_feat_step_' + str(
            j) + '.npy'
        policy_file_name = demos_expe_dir + '/policy_step_' + str(j) + '.pth'
        np.save(action_file_name, demo_actions)
        np.save(state_file_name, demo_states)
        np.save(rew_feat_file_name, demo_features)
        torch.save(actor_critic.state_dict(), policy_file_name)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir:
            save_path = os.path.join(args.save_dir, 'ppo')
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + '.pt'))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            print('Updates', j, 'num timesteps', len(episode_rewards),
                  '\n Last training episodes: mean/median reward',
                  '{:.1f}'.format(np.mean(episode_rewards)),
                  '/{:.1f}'.format(np.median(episode_rewards)),
                  'min/max reward', '{:.1f}'.format(np.min(episode_rewards)),
                  '/{:.1f}'.format(np.max(episode_rewards)), 'dist entropy',
                  dist_entropy, 'value loss', value_loss, 'action loss',
                  action_loss)

        if len(episode_rewards) > 1:
            acc_steps.append(total_num_steps)
            acc_scores.append(np.mean(episode_rewards))
            #print(acc_scores)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _ = actor_critic.act(obs,
                                                    eval_masks,
                                                    deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print('Evaluation using', len(eval_episode_rewards),
                  'episodes: mean reward',
                  '{:.5f}\n'.format(np.mean(eval_episode_rewards)))

    scores_file_name = args.scores_dir + '/learner_scores_' + args.env_name + '_' + args.expe + '.npy'
    steps_file_name = args.scores_dir + '/learner_steps_' + args.env_name + '_' + args.expe + '.npy'
    np.save(scores_file_name, np.array(acc_scores))
    np.save(steps_file_name, np.array(acc_steps))
def main():
  device = 'cpu'
  acc_steps = []
  acc_scores = []
  torch.set_num_threads(1)

  envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                       args.gamma, args.log_dir, args.add_timestep,
                       device, False)

  # get cloned policy and recovered reward function
  policy_reward_dir = args.rewards_dir
  policy_dir = args.policies_dir

  policy_reward = Policy(envs.observation_space.shape, envs.action_space)

  policy_reward_file_name = policy_reward_dir + '/reward_' + args.expe + '.pth'
  policy_reward_sd = torch.load(policy_reward_file_name)
  policy_reward.load_state_dict(policy_reward_sd)

  actor_critic = Policy(envs.observation_space.shape, envs.action_space)

  policy_file_name = policy_dir + '/last_policy_' + args.expe + '.pth'
  policy_sd = torch.load(policy_file_name)
  actor_critic.load_state_dict(policy_sd)
  actor_critic.to(device)

  agent = PPO(actor_critic, args.clip_param, args.ppo_epoch,
              args.num_mini_batch, args.value_loss_coef, args.entropy_coef,
              lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm)

  rollouts = RolloutStorage(args.num_steps, args.num_processes,
                            envs.observation_space.shape, envs.action_space)

  obs = envs.reset()
  rollouts.obs[0].copy_(obs)
  rollouts.to(device)

  episode_rewards = collections.deque(maxlen=10)

  for j in range(num_updates):

    if args.use_linear_lr_decay:
      # decrease learning rate linearly
      update_linear_schedule(agent.optimizer, j, num_updates, args.lr)
      agent.clip_param = args.clip_param  * (1 - j / float(num_updates))

    for step in range(args.num_steps):
      # Sample actions
      with torch.no_grad():
        value, action, action_log_prob = actor_critic.act(
            rollouts.obs[step],
            rollouts.masks[step])

      obs, _, done, infos = envs.step(action)
      if step > 1 and step % 1000 == 0:
        done = [True for _ in range(args.num_processes)]

      # Use the inferred reward: score the taken action with the recovered reward network and use its log-probability as the reward.
      with torch.no_grad():
        # _, reward = shapes(rollouts.obs[step], 0)
        _, action_log_probs, _, _ = policy_reward.evaluate_actions(
            rollouts.obs[step], None, None, action)
        reward = action_log_probs

      for info in infos:
        # if 'episode' in info.keys():
        #  episode_rewards.append(info['episode']['r'])
        r = 0
        for key, val in info.items():
          if 'reward' in key:
            r += val
        episode_rewards.append(r)

      # If done then clean the history of observations.
      masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                 for done_ in done])

      rollouts.insert(obs, action, action_log_prob,
                      value, reward, masks)

    with torch.no_grad():
      next_value = actor_critic.get_value(rollouts.obs[-1],
                                          rollouts.masks[-1]).detach()

    rollouts.compute_returns(next_value, args.gamma, args.tau)

    value_loss, action_loss, dist_entropy = agent.update(rollouts)

    rollouts.after_update()

    # save for every interval-th episode or for the last epoch
    if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir:
      save_path = os.path.join(args.save_dir, 'ppo')
      try:
        os.makedirs(save_path)
      except OSError:
        pass

      # A really ugly way to save a model to CPU
      save_model = actor_critic

      save_model = [save_model,
                    getattr(get_vec_normalize(envs), 'ob_rms', None)]

      torch.save(save_model, os.path.join(save_path, args.env_name + '.pt'))

    total_num_steps = (j + 1) * args.num_processes * args.num_steps

    if j % args.log_interval == 0 and len(episode_rewards) > 1:
      print('Updates', j,
            'num timesteps', len(episode_rewards),
            '\n Last training episodes: mean/median reward',
            '{:.1f}'.format(np.mean(episode_rewards)),
            '/{:.1f}'.format(np.median(episode_rewards)),
            'min/max reward',
            '{:.1f}'.format(np.min(episode_rewards)),
            '/{:.1f}'.format(np.max(episode_rewards)),
            'dist entropy', dist_entropy,
            'value loss', value_loss,
            'action loss', action_loss)

    if len(episode_rewards) > 1:
      acc_steps.append(total_num_steps)
      acc_scores.append(np.mean(episode_rewards))

    if (args.eval_interval is not None
        and len(episode_rewards) > 1
        and j % args.eval_interval == 0):
      eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes,
                                args.num_processes, args.gamma, eval_log_dir,
                                args.add_timestep, device, True)

      vec_norm = get_vec_normalize(eval_envs)
      if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

      eval_episode_rewards = []

      obs = eval_envs.reset()
      eval_masks = torch.zeros(args.num_processes, 1, device=device)

      while len(eval_episode_rewards) < 10:
        with torch.no_grad():
          _, action, _ = actor_critic.act(
              obs, eval_masks, deterministic=True)

        # Observe reward and next obs
        obs, reward, done, infos = eval_envs.step(action)

        eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                        for done_ in done])
        for info in infos:
          if 'episode' in info.keys():
            eval_episode_rewards.append(info['episode']['r'])

      eval_envs.close()

      print('Evaluation using',
            len(eval_episode_rewards),
            'episodes: mean reward',
            '{:.5f}\n'.format(np.mean(eval_episode_rewards)))

  scores_file_name = args.scores_dir + '/learner_scores_' + args.expe + '.npy'
  steps_file_name = args.scores_dir + '/learner_steps_' + args.expe + '.npy'
  np.save(scores_file_name, np.array(acc_scores))
  np.save(steps_file_name, np.array(acc_steps))
Example #18
0
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:1" if args.cuda else "cpu")

    ##
    UID = 'exp_{}'.format(
        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    step_log = []
    reward_log = []

    ## Used to select the environment
    mode = 'normal'

    # encoder type
    encoder = 'sym_VAE'
    if encoder == 'symbolic':
        embedding_size = (18, )
    elif encoder == 'AE':
        embedding_size = (200, )
    elif encoder == 'VAE':
        embedding_size = (100, )
    elif encoder == 'sym_VAE':
        embedding_size = (118, )
    else:
        raise NotImplementedError('fff')

    # load pre-trained AE
    #AE = VAEU([128,128])
    #model_path = '/hdd_c/data/miniWorld/trained_models/VAE/dataset_4/VAEU.pth'
    #AE = torch.load(model_path)
    #AE.eval()

    # load pre-trained VAE
    VAE = VAER([128, 128])
    model_path = '/hdd_c/data/miniWorld/trained_models/VAE/dataset_5/VAER.pth'
    VAE = torch.load(model_path).to(device)
    VAE.eval()

    # load pre-trained detector
    Detector_model = Detector
    model_path = '/hdd_c/data/miniWorld/trained_models/Detector/dataset_5/Detector_resnet18_e14.pth'
    Detector_model = torch.load(model_path).to(device)

    # load pre-trained RNN
    RNN_model = RNN(200, 128)
    model_path = '/hdd_c/data/miniWorld/trained_models/RNN/RNN1.pth'
    RNN_model = torch.load(model_path).to(device)
    RNN_model.eval()
    """
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
    """

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    print(envs.observation_space.shape)

    #actor_critic = Policy(envs.observation_space.shape, envs.action_space,
    #    base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic = Policy(embedding_size,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    #rollouts = RolloutStorage(args.num_steps, args.num_processes,
    #                    envs.observation_space.shape, envs.action_space,
    #                    actor_critic.recurrent_hidden_state_size)
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              embedding_size, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    #print(obs.size())
    #obs = make_var(obs)
    print(obs.size())
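    # Encode the initial observation with the selected encoder (symbolic detector, AE, VAE, or VAE latent concatenated with symbolic features) before seeding the rollout buffer.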
    with torch.no_grad():
        if encoder == 'symbolic':

            z = Detector_model(obs)
            print(z.size())
            z = Detector_to_symbolic(z)
            rollouts.obs[0].copy_(z)
        elif encoder == 'AE':
            z = AE.encode(obs)
            rollouts.obs[0].copy_(z)
        elif encoder == 'VAE':
            z = VAE.encode(obs)[0]
            rollouts.obs[0].copy_(z)
        elif encoder == 'sym_VAE':
            z_vae = VAE.encode(obs)[0]
            z_sym = Detector_model(obs)
            z_sym = Detector_to_symbolic(z_sym)
            z = torch.cat((z_vae, z_sym), dim=1)
            rollouts.obs[0].copy_(z)
        else:
            raise NotImplementedError('fff')

    #rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=100)

    start = time.time()
    for j in range(num_updates):
        #print(j)
        for step in range(args.num_steps):
            # Sample actions
            #print(step)
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            #print(action)
            with torch.no_grad():
                obs, reward, done, infos = envs.step(action)
                if encoder == 'symbolic':
                    #print(obs.size())
                    np.save(
                        '/hdd_c/data/miniWorld/training_obs_{}.npy'.format(
                            step),
                        obs.detach().cpu().numpy())
                    z = Detector_model(obs / 255.0)
                    z = Detector_to_symbolic(z)
                    #print(z)
                    np.save(
                        '/hdd_c/data/miniWorld/training_z_{}.npy'.format(step),
                        z.detach().cpu().numpy())
                elif encoder == 'AE':
                    z = AE.encode(obs)
                elif encoder == 'VAE':
                    z = VAE.encode(obs)[0]
                elif encoder == 'sym_VAE':
                    z_vae = VAE.encode(obs)[0]
                    z_sym = Detector_model(obs)
                    z_sym = Detector_to_symbolic(z_sym)
                    z = torch.cat((z_vae, z_sym), dim=1)
                else:
                    raise NotImplementedError('fff')
                #obs = make_var(obs)
            """
            for info in infos:
                if 'episode' in info.keys():
                    print(reward)
                    episode_rewards.append(info['episode']['r'])
            """

            #             # FIXME: works only for environments with sparse rewards
            #             for idx, eps_done in enumerate(done):
            #                 if eps_done:
            #                     episode_rewards.append(reward[idx])

            # FIXME: works only for environments with sparse rewards
            for idx, eps_done in enumerate(done):
                if eps_done:
                    #print('done')
                    episode_rewards.append(infos[idx]['accumulated_reward'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            #rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks)
            rollouts.insert(z, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            print('Saving model')
            print()

            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        #print(len(episode_rewards))

        step_log.append(total_num_steps)
        reward_log.append(np.mean(episode_rewards))
        step_log_np = np.asarray(step_log)
        reward_log_np = np.asarray(reward_log)
        np.savez_compressed('/hdd_c/data/miniWorld/log/{}.npz'.format(UID),
                            step=step_log_np,
                            reward=reward_log_np)

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n"
                .format(
                    j, total_num_steps, int(total_num_steps / (end - start)),
                    len(episode_rewards), np.mean(episode_rewards),
                    np.median(episode_rewards), np.min(episode_rewards),
                    np.max(episode_rewards),
                    np.count_nonzero(np.greater(episode_rewards, 0)) /
                    len(episode_rewards)))

        if args.eval_interval is not None and len(
                episode_rewards) > 1 and j % args.eval_interval == 0:
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            if eval_envs.venv.__class__.__name__ == "VecNormalize":
                eval_envs.venv.ob_rms = envs.venv.ob_rms

                # An ugly hack to remove updates
                def _obfilt(self, obs):
                    if self.ob_rms:
                        obs = np.clip((obs - self.ob_rms.mean) /
                                      np.sqrt(self.ob_rms.var + self.epsilon),
                                      -self.clipob, self.clipob)
                        return obs
                    else:
                        return obs

                eval_envs.venv._obfilt = types.MethodType(_obfilt, envs.venv)

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))
        """
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
        """
    envs.close()
Example #19
0
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False,
                         args.custom_gym)

    base = SEVN

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'ppo':
        agent = PPO(actor_critic,
                    args.clip_param,
                    args.ppo_epoch,
                    args.num_mini_batch,
                    args.value_loss_coef,
                    args.entropy_coef,
                    lr=args.lr,
                    eps=args.eps,
                    max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    episode_length = deque(maxlen=10)
    episode_success_rate = deque(maxlen=100)
    episode_total = 0

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    episode_length.append(info['episode']['l'])
                    episode_success_rate.append(
                        info['was_successful_trajectory'])
                    episode_total += 1

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
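            # Note: `writer` is assumed to be a TensorBoard SummaryWriter created elsewhere in this script.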
            writer.add_scalars('Train/Episode Reward', {
                "Reward Mean": np.mean(episode_rewards),
                "Reward Min": np.min(episode_rewards),
                "Reward Max": np.max(episode_rewards)
            },
                               global_step=total_num_steps)
            writer.add_scalars('Train/Episode Length', {
                "Episode Length Mean": np.mean(episode_length),
                "Episode Length Min": np.min(episode_length),
                "Episode Length Max": np.max(episode_length)
            },
                               global_step=total_num_steps)
            writer.add_scalar("Train/Episode Reward Mean",
                              np.mean(episode_rewards),
                              global_step=total_num_steps)
            writer.add_scalar("Train/Episode Length Mean",
                              np.mean(episode_length),
                              global_step=total_num_steps)
            writer.add_scalar("Train/Episode Success Rate",
                              np.mean(episode_success_rate),
                              global_step=total_num_steps)

            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
Example #20
0
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    print('args.lr')
    print(args.lr)

    #     print('args.stat_decay')
    #     print(args.stat_decay)

    #     sys.exit()

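    # Select the agent: plain A2C, PPO, or one of several ACKTR / K-BFGS variants distinguished by flags such as if_homo, if_invert_A and stat_decay.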
    if args.algo == 'a2c':

        #         print('args.eps')
        #         print(args.eps)

        #         sys.exit()

        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo in ['acktr']:
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               acktr=True,
                               stat_decay=args.stat_decay)
    elif args.algo in ['acktr-homo']:
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               acktr=True,
                               if_homo=True,
                               stat_decay=args.stat_decay)
    elif args.algo in ['acktr-homo-noEigen']:
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               acktr=True,
                               if_homo=True,
                               stat_decay=args.stat_decay,
                               if_eigen=False)
    elif args.algo in ['kbfgs']:

        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               kbfgs=True,
                               stat_decay=args.stat_decay)
    elif args.algo in ['kbfgs-homo']:

        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               kbfgs=True,
                               if_homo=True,
                               stat_decay=args.stat_decay)
    elif args.algo in ['kbfgs-homo-invertA']:

        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               kbfgs=True,
                               if_homo=True,
                               stat_decay=args.stat_decay,
                               if_invert_A=True)

    elif args.algo in ['kbfgs-homo-invertA-decoupledDecay']:

        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               kbfgs=True,
                               if_homo=True,
                               stat_decay_A=args.stat_decay_A,
                               stat_decay_G=args.stat_decay_G,
                               if_invert_A=True,
                               if_decoupled_decay=True)
    elif args.algo in ['kbfgs-homo-momentumGrad']:

        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               kbfgs=True,
                               if_homo=True,
                               if_momentumGrad=True,
                               stat_decay=args.stat_decay)
    elif args.algo in ['kbfgs-homo-noClip']:

        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               kbfgs=True,
                               if_homo=True,
                               if_clip=False,
                               stat_decay=args.stat_decay)
    else:
        print('unknown args.algo for ' + args.algo)
        sys.exit()

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    record_rewards = []

    record_num_steps = []

    print('num_updates')
    print(num_updates)

    total_num_steps = 0

    start = time.time()
    for j in range(num_updates):

        print('j')
        print(j)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:

                #                 print('info.keys()')
                #                 print(info.keys())

                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

                    print('info[episode][r]')
                    print(info['episode']['r'])

                    record_rewards.append(info['episode']['r'])

                    #                     print('total_num_steps')
                    #                     print(total_num_steps)

                    #                     print('total_num_steps + (step + 1) * args.num_processes')
                    #                     print(total_num_steps + (step + 1) * args.num_processes)

                    record_num_steps.append(total_num_steps +
                                            (step + 1) * args.num_processes)

#                     sys.exit()

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy, update_signal = agent.update(
            rollouts)

        if update_signal == -1:
            #             sys.exit()
            break

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass

    print('record_rewards')
    print(record_rewards)

    dir_with_params = os.path.join(
        args.env_name, args.algo,
        'eps_' + str(args.eps),
        'lr_' + str(args.lr),
        'stat_decay_' + str(args.stat_decay)) + '/'

    #     saving_dir = './result/' + args.env_name + '/' + args.algo + '/'
    saving_dir = './result/' + dir_with_params

    if not os.path.isdir(saving_dir):
        os.makedirs(saving_dir)

    import pickle

    with open(saving_dir + 'result.pkl', 'wb') as handle:
        pickle.dump(
            {
                'record_rewards': record_rewards,
                'record_num_steps': record_num_steps
            }, handle)

    print('args.log_dir')
    print(args.log_dir)

    print('os.listdir(args.log_dir)')
    print(os.listdir(args.log_dir))

    #     saving_dir_monitor = './result_monitor/' + args.env_name + '/' + args.algo + '/'

    saving_dir_monitor = './result_monitor/' + dir_with_params

    import shutil

    if os.path.isdir(saving_dir_monitor):
        shutil.rmtree(saving_dir_monitor)

    os.makedirs(saving_dir_monitor)

    print('saving_dir_monitor')
    print(saving_dir_monitor)

    for file_name in os.listdir(args.log_dir):

        full_file_name = os.path.join(args.log_dir, file_name)

        print('full_file_name')
        print(full_file_name)

        print('os.path.isfile(full_file_name)')
        print(os.path.isfile(full_file_name))

        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, saving_dir_monitor)

#     print('os.listdir(saving_dir_monitor)')
#     print(os.listdir(saving_dir_monitor))

#     print('len(os.listdir(saving_dir_monitor))')
#     print(len(os.listdir(saving_dir_monitor)))

#     print('args.num_processes')
#     print(args.num_processes)

    assert len(os.listdir(saving_dir_monitor)) == args.num_processes
Example #21
        def run(self, time, S_time_interval, S_send_data_size, S_chunk_len, S_rebuf, S_buffer_size, S_play_time_len,
                S_end_delay, S_decision_flag, S_buffer_flag, S_cdn_flag, end_of_video, cdn_newest_id, download_id,
                cdn_has_frame, IntialVars):
            torch.set_num_threads(1)
            device = torch.device("cuda:0" if args.cuda else "cpu")

            if args.vis:
                from visdom import Visdom
                viz = Visdom(port=args.port)
                win = None

            # The online env in AItrans: it should expose an observation space, an
            # action space, and the usual reset/step interface. To get the exact
            # observation and action formats, step through how envs.py builds them
            # in the GitHub repo.
            envs = None  # left unfilled in the original; the AItrans vec-env goes here
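            # A minimal sketch (not part of the original code) of the kind of
            # gym-style env expected above; the dimensions and the simulator call
            # are placeholders:
            #
            #   import gym
            #   import numpy as np
            #
            #   class AITransEnv(gym.Env):
            #       def __init__(self, obs_dim, num_bitrates):
            #           self.observation_space = gym.spaces.Box(
            #               low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float32)
            #           self.action_space = gym.spaces.Discrete(num_bitrates)
            #
            #       def reset(self):
            #           return np.zeros(self.observation_space.shape, dtype=np.float32)
            #
            #       def step(self, action):
            #           obs = np.zeros(self.observation_space.shape, dtype=np.float32)
            #           reward, done = 0.0, False  # would come from the AItrans simulator
            #           return obs, reward, done, {}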

            actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                                  base_kwargs={'recurrent': args.recurrent_policy})
            actor_critic.to(device)

            # choose the algorithm, now we only have a2c
            if args.algo == 'a2c':
                agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                                       args.entropy_coef, lr=args.lr,
                                       eps=args.eps, alpha=args.alpha,
                                       max_grad_norm=args.max_grad_norm)
            elif args.algo == 'ppo':
                agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                                 args.value_loss_coef, args.entropy_coef, lr=args.lr,
                                 eps=args.eps,
                                 max_grad_norm=args.max_grad_norm)
            elif args.algo == 'acktr':
                agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                                       args.entropy_coef, acktr=True)

            rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                      envs.observation_space.shape, envs.action_space,
                                      actor_critic.recurrent_hidden_state_size)

            # the initial observation
            obs = envs.reset()  # assumed: obtained from envs.reset(), as in the other examples
            rollouts.obs[0].copy_(obs)
            rollouts.to(device)

            episode_reward = deque(maxlen=10)
            start = time.time()
            for j in range(num_updates):

                if args.use_linear_lr_decay:
                    # decrease learning rate linearly
                    if args.algo == "acktr":
                        # use optimizer's learning rate since it's hard-coded in kfac.py
                        update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr)
                    else:
                        update_linear_schedule(agent.optimizer, j, num_updates, args.lr)

                if args.algo == 'ppo' and args.use_linear_lr_decay:
                    agent.clip_param = args.clip_param * (1 - j / float(num_updates))

                for step in range(args.num_steps):
                    # Sample actions
                    with torch.no_grad():
                        value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                            rollouts.obs[step],
                            rollouts.recurrent_hidden_states[step],
                            rollouts.masks[step])
Example #22
def main():

    print('Preparing parameters')

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    print('Creating envs: {}'.format(args.env_name))

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    # input(envs)
    print('Creating network')
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    print('Initializing PPO')
    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    print('Memory')
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    num_episodes = [0 for _ in range(args.num_processes)]

    if args.run_id == "debug":
        try:
            shutil.rmtree('./runs/debug')
        except:
            pass

    writer = SummaryWriter("./runs/{}".format(args.run_id))
    with open('./runs/{}/recap.txt'.format(args.run_id), 'w') as file:
        file.write(str(actor_critic))

    last_index = 0

    print('Starting ! ')

    start = time.time()
    for j in tqdm(range(num_updates)):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(
                    rollouts.obs[step], rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info_num, info in enumerate(infos):
                if (info_num == 0):
                    if 'episode' in info.keys():
                        episode_rewards.append(info['episode']['r'])
                        end_episode_to_viz(writer, info, info_num,
                                           num_episodes[info_num])
                        num_episodes[info_num] += 1

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, action, action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
        losses = agent.update(rollouts)
        rollouts.after_update()

        losses_to_viz(writer, losses, j)
        create_checkpoint(actor_critic, envs, args)
        last_index = global_rew_to_viz(writer, last_index)
Example #23
def main():
    config = None
    args = get_args()
    config, checkpoint = get_config_and_checkpoint(args)

    set_random_seeds(args, config)
    eval_log_dir = args.save_dir + "_eval"
    try:
        os.makedirs(args.save_dir)
        os.makedirs(eval_log_dir)
    except OSError:
        pass

    now = datetime.datetime.now()
    experiment_name = args.experiment_name + '_' + now.strftime(
        "%Y-%m-%d_%H-%M-%S")

    # Create checkpoint file
    save_dir_model = os.path.join(args.save_dir, 'model', experiment_name)
    save_dir_config = os.path.join(args.save_dir, 'config', experiment_name)
    try:
        os.makedirs(save_dir_model)
        os.makedirs(save_dir_config)
    except OSError as e:
        # the file logger is only set up further below, so report the error directly
        print(e)
        exit()

    if args.config:
        shutil.copy2(args.config, save_dir_config)

    # Tensorboard Logging
    writer = SummaryWriter(
        os.path.join(args.save_dir, 'tensorboard', experiment_name))

    # Logger that writes to STDOUT and a file in the save_dir
    logger = setup_carla_logger(args.save_dir, experiment_name)

    device = torch.device("cuda:0" if args.cuda else "cpu")
    norm_reward = not config.no_reward_norm
    norm_obs = not config.no_obs_norm

    assert not (config.num_virtual_goals > 0) or (
        config.reward_class
        == 'SparseReward'), "Can't use HER with a dense reward"
    obs_converter = CarlaObservationConverter(
        h=84, w=84, rel_coord_system=config.rel_coord_system)
    action_converter = CarlaActionsConverter(config.action_type)
    envs = make_vec_envs(obs_converter,
                         action_converter,
                         args.starting_port,
                         config.seed,
                         config.num_processes,
                         config.gamma,
                         device,
                         config.reward_class,
                         num_frame_stack=1,
                         subset=config.experiments_subset,
                         norm_reward=norm_reward,
                         norm_obs=norm_obs,
                         apply_her=config.num_virtual_goals > 0,
                         video_every=args.video_interval,
                         video_dir=os.path.join(args.save_dir, 'video',
                                                experiment_name))

    if config.agent == 'forward':
        agent = agents.ForwardCarla()

    if config.agent == 'a2c':
        agent = agents.A2CCarla(obs_converter,
                                action_converter,
                                config.value_loss_coef,
                                config.entropy_coef,
                                lr=config.lr,
                                eps=config.eps,
                                alpha=config.alpha,
                                max_grad_norm=config.max_grad_norm)

    elif config.agent == 'acktr':
        agent = agents.A2CCarla(obs_converter,
                                action_converter,
                                config.value_loss_coef,
                                config.entropy_coef,
                                lr=config.lr,
                                eps=config.eps,
                                alpha=config.alpha,
                                max_grad_norm=config.max_grad_norm,
                                acktr=True)

    elif config.agent == 'ppo':
        agent = agents.PPOCarla(obs_converter,
                                action_converter,
                                config.clip_param,
                                config.ppo_epoch,
                                config.num_mini_batch,
                                config.value_loss_coef,
                                config.entropy_coef,
                                lr=config.lr,
                                eps=config.eps,
                                max_grad_norm=config.max_grad_norm)

    if checkpoint is not None:
        load_modules(agent.optimizer, agent.model, checkpoint)

    rollouts = RolloutStorage(config.num_steps, config.num_processes,
                              envs.observation_space, envs.action_space, 20,
                              config.num_virtual_goals,
                              config.rel_coord_system, obs_converter)

    obs = envs.reset()
    # Save the first observation
    obs = obs_to_dict(obs)
    rollouts.obs = obs_to_dict(rollouts.obs)
    for k in rollouts.obs:
        rollouts.obs[k][rollouts.step + 1].copy_(obs[k])
    rollouts.obs = dict_to_obs(rollouts.obs)
    rollouts.to(device)

    start = time.time()

    total_steps = 0
    total_episodes = 0
    total_reward = 0

    episode_reward = torch.zeros(config.num_processes)

    for j in range(config.num_updates):

        for step in range(config.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = agent.act(
                    rollouts.get_obs(step),
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, info = envs.step(action)

            # For logging purposes
            carla_rewards = torch.tensor([i['carla-reward'] for i in info],
                                         dtype=torch.float)
            episode_reward += carla_rewards
            total_reward += carla_rewards.sum().item()
            total_steps += config.num_processes

            if done.any():
                total_episodes += done.sum()
                torch_done = torch.tensor(done.astype(int)).byte()
                mean_episode_reward = episode_reward[torch_done].mean().item()
                logger.info('{} episode(s) finished with reward {}'.format(
                    done.sum(), mean_episode_reward))
                writer.add_scalar('train/mean_ep_reward_vs_steps',
                                  mean_episode_reward, total_steps)
                writer.add_scalar('train/mean_ep_reward_vs_episodes',
                                  mean_episode_reward, total_episodes)
                episode_reward[torch_done] = 0

            # If done then clean the history of observations.
            masks = torch.FloatTensor(1 - done)

            rollouts.insert(obs, recurrent_hidden_states,
                            action, action_log_prob, value, reward,
                            masks.unsqueeze(-1))

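        # Hindsight Experience Replay: relabel stored transitions with virtual goals
        # before computing returns (only meaningful with the sparse reward setting).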
        if config.num_virtual_goals > 0:
            rollouts.apply_her(config.num_virtual_goals,
                               device,
                               beta=config.beta)

        with torch.no_grad():
            next_value = agent.get_value(
                rollouts.get_obs(-1),  # Get last observation
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, config.use_gae, config.gamma,
                                 config.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "" and config.agent != 'forward':
            save_path = os.path.join(save_dir_model, str(j) + '.pth.tar')
            save_modules(agent.optimizer, agent.model, args, config, save_path)

        total_num_steps = (j + 1) * config.num_processes * config.num_steps

        if j % args.log_interval == 0:

            # Logging to the stdout/our logs
            end = time.time()
            logger.info('------------------------------------')
            logger.info('Episodes {}, Updates {}, num timesteps {}, FPS {}'\
                .format(total_episodes, j + 1, total_num_steps, total_num_steps / (end - start)))
            logger.info('------------------------------------')

            # Logging to tensorboard
            writer.add_scalar('train/cum_reward_vs_steps', total_reward,
                              total_steps)
            writer.add_scalar('train/cum_reward_vs_updates', total_reward,
                              j + 1)

            if config.agent in ['a2c', 'acktr', 'ppo']:
                writer.add_scalar('debug/value_loss_vs_steps', value_loss,
                                  total_steps)
                writer.add_scalar('debug/value_loss_vs_updates', value_loss,
                                  j + 1)
                writer.add_scalar('debug/action_loss_vs_steps', action_loss,
                                  total_steps)
                writer.add_scalar('debug/action_loss_vs_updates', action_loss,
                                  j + 1)
                writer.add_scalar('debug/dist_entropy_vs_steps', dist_entropy,
                                  total_steps)
                writer.add_scalar('debug/dist_entropy_vs_updates',
                                  dist_entropy, j + 1)

            # Sample the last reward
            writer.add_scalar('debug/sampled_normalized_reward_vs_steps',
                              reward.mean(), total_steps)
            writer.add_scalar('debug/sampled_normalized_reward_vs_updates',
                              reward.mean(), j + 1)
            writer.add_scalar('debug/sampled_carla_reward_vs_steps',
                              carla_rewards.mean(), total_steps)
            writer.add_scalar('debug/sampled_carla_reward_vs_updates',
                              carla_rewards.mean(), j + 1)

        if (args.eval_interval is not None and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name, args.starting_port,
                                      obs_converter,
                                      args.x + config.num_processes,
                                      config.num_processes, config.gamma,
                                      eval_log_dir, config.add_timestep,
                                      device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(config.num_processes,
                                                       20,
                                                       device=device)
            eval_masks = torch.zeros(config.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = agent.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            logger.info(
                " Evaluation using {} episodes: mean reward {:.5f}\n".format(
                    len(eval_episode_rewards), np.mean(eval_episode_rewards)))
Example #24
def main():
    writer = SummaryWriter()
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    best_score = 0

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False, 4, args.carl_wrapper)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          args.activation,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    assert (args.algo == 'a2c')
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    beta_device = (torch.ones(args.num_processes, 1)).to(device)
    masks_device = torch.ones(args.num_processes, 1).to(device)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    obs = obs / 255
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    g_step = 0
    for j in range(num_updates):
        for step in range(args.num_steps):
            # sample actions
            g_step += 1
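            # epsilon-greedy schedule: the random-action probability decays
            # exponentially from EPS_START toward EPS_END with time constant EPS_DECAY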
            eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(
                -1. * g_step / EPS_DECAY)
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states, ori_dist_entropy = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step],
                    deterministic=True)
            ori_dist_entropy = ori_dist_entropy.cpu().unsqueeze(1)

            # select action based on epsilon greedy
            rand_val = torch.rand(action.shape).to(device)
            eps_mask = (rand_val >= eps_threshold).type(torch.int64)
            rand_action = torch.LongTensor([
                envs.action_space.sample() for i in range(args.num_processes)
            ]).unsqueeze(1).to(device)
            action = eps_mask * action + (1 - eps_mask) * rand_action
            obs, reward, done, infos = envs.step(action)
            obs = obs / 255

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            if args.log_evaluation:
                writer.add_scalar('analysis/reward', reward[0], g_step)
                writer.add_scalar('analysis/entropy',
                                  ori_dist_entropy[0].item(), g_step)
                writer.add_scalar('analysis/eps', eps_threshold, g_step)
                if done[0]:
                    writer.add_scalar('analysis/done', 1, g_step)

            # log episode rewards and save the model whenever the running mean improves
            for idx in range(len(infos)):
                info = infos[idx]
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    steps_done = g_step * args.num_processes + idx
                    writer.add_scalar('data/reward', info['episode']['r'],
                                      steps_done)
                    mean_rewards = np.mean(episode_rewards)
                    writer.add_scalar('data/avg_reward', mean_rewards,
                                      steps_done)
                    if mean_rewards > best_score:
                        best_score = mean_rewards
                        save_model = actor_critic
                        if args.cuda:
                            save_model = copy.deepcopy(actor_critic).cpu()
                        torch.save(
                            save_model,
                            os.path.join(save_path, args.env_name + ".pt"))

            # update storage
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, beta_device)

        with torch.no_grad():
            masks_device.copy_(masks)
            next_value = actor_critic.get_value(obs, recurrent_hidden_states,
                                                masks_device)

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
Example #25
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    """
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
    """

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=100)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            """
            for info in infos:
                if 'episode' in info.keys():
                    print(reward)
                    episode_rewards.append(info['episode']['r'])
            """

            # FIXME: works only for environments with sparse rewards
            for idx, eps_done in enumerate(done):
                if eps_done:
                    episode_rewards.append(reward[idx])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            print('Saving model')
            print()

            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n"
                .format(
                    j, total_num_steps, int(total_num_steps / (end - start)),
                    len(episode_rewards), np.mean(episode_rewards),
                    np.median(episode_rewards), np.min(episode_rewards),
                    np.max(episode_rewards),
                    np.count_nonzero(np.greater(episode_rewards, 0)) /
                    len(episode_rewards)))

        if args.eval_interval is not None and len(
                episode_rewards) > 1 and j % args.eval_interval == 0:
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            if eval_envs.venv.__class__.__name__ == "VecNormalize":
                eval_envs.venv.ob_rms = envs.venv.ob_rms

                # An ugly hack to remove updates
                def _obfilt(self, obs):
                    if self.ob_rms:
                        obs = np.clip((obs - self.ob_rms.mean) /
                                      np.sqrt(self.ob_rms.var + self.epsilon),
                                      -self.clipob, self.clipob)
                        return obs
                    else:
                        return obs

                eval_envs.venv._obfilt = types.MethodType(_obfilt, envs.venv)

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))
        """
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
        """

    envs.close()
Example #26
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    run_id = "alpha{}".format(args.gcn_alpha)
    if args.use_logger:
        from utils import Logger
        folder = "{}/{}".format(args.folder, run_id)
        logger = Logger(algo_name=args.algo,
                        environment_name=args.env_name,
                        folder=folder,
                        seed=args.seed)
        logger.save_args(args)

        print("---------------------------------------")
        print('Saving to', logger.save_folder)
        print("---------------------------------------")

    else:
        print("---------------------------------------")
        print('NOTE : NOT SAVING RESULTS')
        print("---------------------------------------")
    all_rewards = []

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          args.env_name,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size,
                              actor_critic.base.output_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    ############################
    # GCN Model and optimizer
    from pygcn.train import update_graph
    from pygcn.models import GCN, GAT, SAGE
    assert args.gnn in ['gcn', 'gat', 'sage']

    if args.gnn == 'gat':
        gcn_model = GAT(nfeat=actor_critic.base.output_size,
                        nhid=args.gcn_hidden)
    elif args.gnn == 'sage':
        gcn_model = SAGE(nfeat=actor_critic.base.output_size,
                         nhid=args.gcn_hidden)
    elif args.gnn == 'gcn':
        gcn_model = GCN(nfeat=actor_critic.base.output_size,
                        nhid=args.gcn_hidden)

    gcn_model.to(device)
    gcn_optimizer = optim.Adam(gcn_model.parameters(),
                               lr=args.gcn_lr,
                               weight_decay=args.gcn_weight_decay)
    gcn_loss = nn.NLLLoss()
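    # Per-process bookkeeping for the GCN: hidden states become graph nodes, Gs holds
    # each trajectory graph, node_ptrs tracks the next node index, and rew_states
    # collects (node, reward) pairs used as supervision when an episode ends.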
    gcn_states = [[] for _ in range(args.num_processes)]
    Gs = [nx.Graph() for _ in range(args.num_processes)]
    node_ptrs = [0 for _ in range(args.num_processes)]
    rew_states = [[] for _ in range(args.num_processes)]
    ############################

    episode_rewards = deque(maxlen=100)
    avg_fwdloss = deque(maxlen=100)
    rew_rms = RunningMeanStd(shape=())
    delay_rew = torch.zeros([args.num_processes, 1])
    delay_step = torch.zeros([args.num_processes])

    start = time.time()
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob,\
                 recurrent_hidden_states, hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            delay_rew += reward
            delay_step += 1
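            # Delayed-reward setting: accumulate rewards and only release the sum every
            # args.reward_freq steps or at episode end; all other steps receive reward 0.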

            for idx, (info, hid,
                      eps_done) in enumerate(zip(infos, hidden_states, done)):

                if eps_done or delay_step[idx] == args.reward_freq:
                    reward[idx] = delay_rew[idx]
                    delay_rew[idx] = delay_step[idx] = 0
                else:
                    reward[idx] = 0

                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

                if args.gcn_alpha < 1.0:
                    gcn_states[idx].append(hid)
                    node_ptrs[idx] += 1
                    if not eps_done:
                        Gs[idx].add_edge(node_ptrs[idx] - 1, node_ptrs[idx])
                    if reward[idx] != 0. or eps_done:
                        rew_states[idx].append(
                            [node_ptrs[idx] - 1, reward[idx]])
                    if eps_done:
                        adj = nx.adjacency_matrix(Gs[idx]) if len(Gs[idx].nodes)\
                                        else sp.csr_matrix(np.eye(1,dtype='int64'))
                        update_graph(gcn_model, gcn_optimizer,
                                     torch.stack(gcn_states[idx]), adj,
                                     rew_states[idx], gcn_loss, args, envs)
                        gcn_states[idx] = []
                        Gs[idx] = nx.Graph()
                        node_ptrs[idx] = 0
                        rew_states[idx] = []

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks,
                            hidden_states)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau, gcn_model, args.gcn_alpha)
        agent.update(rollouts)
        rollouts.after_update()

        ####################### Saving and book-keeping #######################
        if (j % int(num_updates / 5.) == 0
                or j == num_updates - 1) and args.save_dir != "":
            print('Saving model')
            print()

            save_dir = "{}/{}/{}".format(args.save_dir, args.folder, run_id)
            save_path = os.path.join(save_dir, args.algo, 'seed' +
                                     str(args.seed)) + '_iter' + str(j)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            save_gcn = gcn_model
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
                save_gcn = copy.deepcopy(gcn_model).cpu()

            save_model = [
                save_gcn, save_model,
                hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + "ac.pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: "
                "mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, "
                "success rate {:.2f}\n".format(
                    j,
                    total_num_steps,
                    int(total_num_steps / (end - start)),
                    len(episode_rewards),
                    np.mean(episode_rewards),
                    np.median(episode_rewards),
                    np.min(episode_rewards),
                    np.max(episode_rewards),
                    np.count_nonzero(np.greater(episode_rewards, 0)) /
                    len(episode_rewards),
                ))

            all_rewards.append(np.mean(episode_rewards))
            if args.use_logger:
                logger.save_task_results(all_rewards)
        ####################### Saving and book-keeping #######################

    envs.close()
Example #27
def main():



    torch.manual_seed(args_seed)
    torch.cuda.manual_seed_all(args_seed)

    device = torch.device("cuda:0" if args_cuda else "cpu")

    train_log = Log(log_name+'_train_log')
    evl_log = Log(log_name+'_evaluation_log')
    torch.set_num_threads(1)
    envs = make_vec_envs(
        args_env_name,
        args_seed,
        args_num_processes,
        device,
        gamma=args_gamma)

    # norm_envs = get_vec_normalize(envs)
    # norm_envs = envs
    # norm_envs.eval()
    # norm_envs.ob_rms = 1
    # print(envs.ob_rms)
    # ss('hi')
    if is_limit_action:
        envs.action_space.n = 3
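        # Restrict the policy to 3 actions; the "action + 1" offset applied at
        # envs.step() below maps the policy's {0, 1, 2} back onto the env's {1, 2, 3}.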
    print('Number of Actions:', envs.action_space.n)

    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space,
        base_kwargs={'recurrent': args_recurrent_policy})
    actor_critic.to(device)
    # print(actor_critic.is_recurrent)
    # print(actor_critic.gru)
    # ss('hi')

    agent = PPO(
        actor_critic,
        args_clip_param,
        args_ppo_epoch,
        args_num_mini_batch,
        args_value_loss_coef,
        args_entropy_coef,
        lr=args_lr,
        eps=args_eps,
        max_grad_norm=args_max_grad_norm,
        use_clipped_value_loss=args_use_clipped_value_loss)

    rollouts = RolloutStorage(
        args_num_steps,
        args_num_processes,
        envs.observation_space.shape,
        envs.action_space,
        actor_critic.recurrent_hidden_state_size)


    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    # print(obs)
    # ss('i am over it')
    num_updates = int(
        args_num_env_steps) // args_num_steps // args_num_processes

    episode_rewards = deque(maxlen=10)
    start = time.time()
    sum_re = torch.zeros(args_num_processes, 1)

    for j in range(num_updates):

        if args_use_linear_lr_decay:
            # decrease learning rate linearly
            update_linear_schedule(
                agent.optimizer, j, num_updates,
                args_lr)

        for step in range(args_num_steps):

            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            # ss('dissecting actor critic. act')
            # print(action)
            # print()
            # action = action + 1
            # print(action)
            # ss('hoiohasdfhioas')
            if is_limit_action:
                obs, reward, done, infos = envs.step(action+1)
            else:
                obs, reward, done, infos = envs.step(action)
            sum_re += reward

            if any(done):

                for i in range(len(done)):
                    if done[i]:
                        episode_rewards.append(sum_re[i].item())
                        # print(done)
                        # print(sum_re[i])
                        sum_re[i] *= 0
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)
        with torch.no_grad():

            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value,
                                 args_gamma,
                                 args_use_gae,
                                 args_gae_lambda)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        if j % args_log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            end = time.time()
            logstring = ("E {}, N_steps {}, FPS {} mean/median"
                         " {:.1f}/{:.1f}, min/max {:.1f}/{:.1f}"
                         " Entropy {:.5f},V {:.5f},Action {:.5f}").format(
                             j, total_num_steps,
                             int(total_num_steps / (end - start)),
                             np.mean(episode_rewards),
                             np.median(episode_rewards), np.min(episode_rewards),
                             np.max(episode_rewards),
                             dist_entropy, value_loss,
                             action_loss)
            # print(logstring)
            train_log.log(logstring)
        # if True:
        if (args_eval_interval is not None and len(episode_rewards) > 1
                and j % args_eval_interval == 0):
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            ob_rms = get_vec_normalize(envs).ob_rms
            ev_result = evaluate(actor_critic, ob_rms, args_env_name, args_seed,
                     args_num_processes, device, is_limit_action=is_limit_action)
            ev_log_string = 'steps:'+str(total_num_steps)+'. '+ev_result
            evl_log.log(ev_log_string)
Example #28
def main():
    writer = SummaryWriter()
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    best_score = 0

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    if args.reward_mode == 0:
        clip_rewards = True
    else:
        clip_rewards = False
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                        args.gamma, args.log_dir, args.add_timestep, device, False, 4, args.carl_wrapper, clip_rewards, args.track_primitive_reward)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space, args.activation, args.complex_model,
        base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                               eps=args.eps,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                        envs.observation_space.shape, envs.action_space,
                        actor_critic.recurrent_hidden_state_size)

    # initiate env and storage rollout
    obs = envs.reset()
    obs = obs/255
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # necessary variables
    episode_rewards = deque(maxlen=10)  # store the last 10 episode rewards
    g_step = 0  # global step
    reward_history = set()  # record reward history (after reward rescaling)
    primitive_reward_history = set()  # record original reward history (before reward rescaling)
    min_abs_reward = float('inf')  # used in reward rescaling mode 2 as the normalization base
    masks_device = torch.ones(args.num_processes, 1).to(device)  # masks on gpu
    reward_count = 0  # for reward density calculation
    reward_start_step = 0  # for reward density calculation
    insert_entropy = torch.ones(args.num_processes, 1)  # entropy values inserted into the rollout
    avg_entropy = 0
    have_done = 0.0

    num_feature_neurons = args.num_processes * 512
    for j in range(num_updates):
        if j == int((num_updates-1)*have_done):
            if args.save_intermediate_model:
                save_model = actor_critic
                if args.cuda:
                    save_model = copy.deepcopy(actor_critic).cpu()
                torch.save(save_model, os.path.join(save_path, args.env_name + str(have_done)+".pt")) 
            print("have done: ", have_done)
            have_done += 0.1

        for step in range(args.num_steps):
            # Sample actions
            g_step += 1
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states, entropy, f_a = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

            if args.track_hidden_stats:
                # analyze the stats of f_a 
                mean_fa = torch.mean(f_a)
                num_nonzero = f_a.nonzero().size(0)
                mean_pos = mean_fa * num_feature_neurons / num_nonzero
                activation_ratio = f_a / mean_pos
                num_bigger_mean_fa = torch.sum(activation_ratio > 1).item()
                num_bigger_half_fa = torch.sum(activation_ratio > 0.5).item()
                writer.add_scalar('analysis/fa_mean_ratio', (num_nonzero - num_bigger_mean_fa)/num_nonzero, g_step)
                writer.add_scalar('analysis/fa_0.5_ratio', (num_nonzero - num_bigger_half_fa)/num_nonzero, g_step)
                writer.add_scalar('analysis/fa_active', num_nonzero/num_feature_neurons, g_step)

                # analyze the stats of entropy
                avg_entropy = 0.999*avg_entropy + 0.001*torch.mean(entropy).item()
                num_all = len(entropy.view(-1))
                entropy_ratio = entropy/avg_entropy
                num_larger_mean = sum(entropy_ratio > 1).item()
                num_larger_onehalf = sum(entropy_ratio > 1.5).item()
                num_larger_double = sum(entropy_ratio > 2).item()
                writer.add_scalar('analysis/entropy_mean_ratio', num_larger_mean/num_all, g_step)
                writer.add_scalar('analysis/entropy_1.5_ratio', num_larger_onehalf/num_all, g_step)
                writer.add_scalar('analysis/entropy_2_ratio', num_larger_double/num_all, g_step)

            # update entropy inserted into rollout when appropriate 
            if args.modulation and j > args.start_modulate * num_updates:
                insert_entropy = entropy.unsqueeze(1)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            obs = obs/255

            # reward rescaling
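            #   mode 1: multiply rewards by the fixed factor args.reward_scale
            #   mode 2: divide rewards by the smallest absolute non-zero reward seen
            #           during the first args.change_base_reward fraction of training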
            if args.reward_mode == 1:
                reward = reward * args.reward_scale
            elif args.reward_mode == 2:
                if j < args.change_base_reward * num_updates:
                    non_zeros = abs(reward[reward != 0])
                    if len(non_zeros) > 0:
                        min_abs_reward_step = torch.min(non_zeros).item()
                        if min_abs_reward > min_abs_reward_step:
                            min_abs_reward = min_abs_reward_step
                            print('new min abs reward: ', min_abs_reward, ' time: ', g_step)
                if min_abs_reward != float('inf'):
                    reward = reward/min_abs_reward

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            if args.log_evaluation:
                writer.add_scalar('analysis/entropy', entropy.mean().item(), g_step)
                if args.track_reward_density:   # track reward density, based on 0th process
                    reward_count += (reward[0] != 0)
                    if 'episode' in infos[0].keys():
                        writer.add_scalar('analysis/reward_density', reward_count/(g_step - reward_start_step), g_step)
                        reward_count = 0
                        reward_start_step = g_step
                if args.track_primitive_reward:   # track primitive reward (before rescaling)
                    for info in infos:
                        if 'new_reward' in info:
                            new_rewards  = info['new_reward'] - primitive_reward_history
                            if len(new_rewards) > 0:
                                print('new primitive rewards: ', new_rewards, ' time: ', g_step)
                                primitive_reward_history =  primitive_reward_history.union(info['new_reward'])
                if args.track_scaled_reward:  # track rewards after rescaling
                    for r in reward:
                        r = r.item()
                        if r not in reward_history:
                            print('new step rewards: ', r, g_step)
                            reward_history.add(r)


            for idx in range(len(infos)):
                info = infos[idx]
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    steps_done = g_step*args.num_processes + idx
                    writer.add_scalar('data/reward', info['episode']['r'], steps_done)
                    mean_rewards = np.mean(episode_rewards)
                    writer.add_scalar('data/avg_reward', mean_rewards, steps_done)
                    if mean_rewards > best_score:
                        best_score = mean_rewards
                        save_model = actor_critic
                        if args.cuda:
                            save_model = copy.deepcopy(actor_critic).cpu()
                        torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))                        
            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, insert_entropy)

        with torch.no_grad():
            masks_device.copy_(masks)
            next_value = actor_critic.get_value(obs, recurrent_hidden_states, masks_device)

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy, value = agent.update(rollouts, args.modulation)

        if args.track_value_loss:
            writer.add_scalar('analysis/value_loss', value_loss, j)
            writer.add_scalar('analysis/value', value, j)
            writer.add_scalar('analysis/loss_ratio', value_loss/value, j)

        if args.modulation and args.track_lr and args.log_evaluation:
            writer.add_scalar('analysis/min_lr', torch.min(rollouts.lr).item(), j)
            writer.add_scalar('analysis/max_lr', torch.max(rollouts.lr).item(), j)
            writer.add_scalar('analysis/std_lr', torch.std(rollouts.lr).item(), j)
            writer.add_scalar('analysis/avg_lr', torch.mean(rollouts.lr).item(), j)

        rollouts.after_update()

    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
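The reward_mode == 2 branch in the loop above rescales rewards by the smallest nonzero absolute reward observed so far, so the sparsest reward ends up with magnitude 1. A minimal, self-contained sketch of that idea (the RewardNormalizer helper below is illustrative, not part of the original code):

import torch

class RewardNormalizer:
    """Divide rewards by the smallest nonzero |reward| seen so far."""

    def __init__(self):
        self.min_abs_reward = float('inf')

    def __call__(self, reward):
        non_zeros = reward[reward != 0].abs()
        if len(non_zeros) > 0:
            # Track the smallest nonzero reward magnitude across all steps.
            self.min_abs_reward = min(self.min_abs_reward, non_zeros.min().item())
        if self.min_abs_reward != float('inf'):
            return reward / self.min_abs_reward
        return reward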
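rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) bootstraps the rollout returns from next_value; with use_gae enabled this is the standard Generalized Advantage Estimation recursion. A minimal sketch of that recursion, assuming rewards, values and masks are tensors of shape [num_steps, num_processes, 1] and masks[t] is 0.0 where the episode terminated at step t:

import torch

def gae_returns(rewards, values, masks, next_value, gamma=0.99, tau=0.95):
    """GAE(lambda): returns[t] = advantage[t] + V(s_t)."""
    num_steps = rewards.size(0)
    # Append the bootstrap value so values[t + 1] is defined for the last step.
    values = torch.cat([values, next_value.unsqueeze(0)], dim=0)
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for t in reversed(range(num_steps)):
        # TD residual; the mask zeroes the bootstrap across episode boundaries.
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * tau * masks[t] * gae
        returns[t] = gae + values[t]
    return returns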
Example #29
rollouts = RolloutStorage(num_steps=OUTER_BATCHSIZE,
                          num_processes=NUM_PROCESS,
                          obs_shape=envs.observation_space.shape,
                          action_space=envs.action_space,
                          recurrent_hidden_state_size=1)

inner_rollouts = RolloutStorage(num_steps=INNER_BATCHSIZE,
                                num_processes=NUM_PROCESS,
                                obs_shape=envs.observation_space.shape,
                                action_space=envs.action_space,
                                recurrent_hidden_state_size=1)

obs = envs.reset()
rollouts.obs[0].copy_(obs)
rollouts.to(device)
inner_rollouts.obs[0].copy_(obs)
inner_rollouts.to(device)
episode_rewards = deque(maxlen=10)
total_num_steps = 0


def select_action(obs):
    with torch.no_grad():
        action_mean, log_std = actor(obs)
        action = torch.normal(action_mean, torch.exp(log_std))
        var = torch.exp(log_std)**2
        action_log_probs = -(
            (action - action_mean)**2) / (2 * var) - log_std - math.log(
                math.sqrt(2 * math.pi))
        action_log_probs = action_log_probs.sum(1, keepdim=True)
    # Return the sampled action together with its log-probability.
    return action, action_log_probs
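The log-probability above is the diagonal Gaussian log-density written out by hand. The same sampling and log-probability computation can be expressed with torch.distributions.Normal; a minimal equivalent sketch (assuming the same actor network):

import torch
from torch.distributions import Normal

def select_action_with_distribution(obs, actor):
    with torch.no_grad():
        action_mean, log_std = actor(obs)
        dist = Normal(action_mean, torch.exp(log_std))
        action = dist.sample()
        # Sum per-dimension log-probabilities to get the joint log-probability.
        action_log_probs = dist.log_prob(action).sum(1, keepdim=True)
    return action, action_log_probs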
Example #30
class Runner():
    def __init__(self, **args):
        cuda = not args['no_cuda'] and torch.cuda.is_available()
        self.device = torch.device("cuda:0" if cuda else "cpu")
        print("Model running on device: {}".format(self.device))
        torch.set_num_threads(1)

        self.env_name = args['env_name']
        self.epochs = args['epochs']
        self.num_processes = args['num_processes']
        self.num_steps = args['num_steps']
        self.num_test_episodes = args['num_test_episodes']
        self.test_every_n_epochs = args['test_every_n_epochs']
        self.use_deterministic_policy_while_testing = args['use_deterministic_policy_while_testing']

        self.grayscale = args['grayscale']
        self.skip_frame = args['skip_frame']
        self.num_frame_stack = args['num_frame_stack']

        self.num_updates_per_epoch = args['num_updates_per_epoch']
        self.num_steps = args['num_steps']

        self.use_gae = args['use_gae']
        self.gamma = args['gamma']
        self.tau = args['tau']

        self.reward_scaling = args['reward_scaling']

        self.seed = args['seed']
        self.log_dir = args['log_dir']
        self.save_dir = args['save_dir']

        try:
            os.makedirs(args['log_dir'])
            files = glob.glob(os.path.join(args['log_dir'], '*.manifest.json'))
            for f in files:
                os.remove(f)
        except OSError:
            files = glob.glob(os.path.join(args['log_dir'], '*.monitor.csv'))
            for f in files:
                os.remove(f)

        self.eval_log_dir = args['log_dir'] + "_eval"

        try:
            os.makedirs(self.eval_log_dir)
        except OSError:
            files = glob.glob(os.path.join(self.eval_log_dir, '*.monitor.csv'))
            for f in files:
                os.remove(f)

        self.envs = make_vec_envs(self.env_name, self.seed, self.num_processes,
                                  self.gamma, self.log_dir, self.device, False, self.grayscale, self.skip_frame, self.reward_scaling, num_frame_stack=self.num_frame_stack)

        self.algorithm = args['algorithm']
        # Decreasing LR scheduler
        self.scheduler = None

        if self.algorithm == 'A2C':
            actor_critic = ActorCriticNetwork(self.envs.observation_space.shape, self.envs.action_space,
                                              base_kwargs=args['policy_parameters'])
            actor_critic.to(self.device)
            self.policy = actor_critic
            self.agent = A2C(actor_critic, **args['algorithm_parameters'])

        elif self.algorithm == 'PPO':
            if args['decreasing_lr']:
                def lambdalr(epoch): return ((float(self.epochs - epoch)) / float(self.epochs) * args['algorithm_parameters']['lr'])  # noqa: E704
                actor_critic = ActorCriticNetwork(self.envs.observation_space.shape, self.envs.action_space,
                                                  base_kwargs=args['policy_parameters'])
                actor_critic.to(self.device)
                self.policy = actor_critic
                self.agent = PPO(actor_critic, lambdalr,
                                 **args['algorithm_parameters'])
                self.scheduler = self.agent.scheduler
            else:
                actor_critic = ActorCriticNetwork(self.envs.observation_space.shape, self.envs.action_space,
                                                  base_kwargs=args['policy_parameters'])
                actor_critic.to(self.device)
                self.policy = actor_critic
                self.agent = PPO(actor_critic, None,
                                 **args['algorithm_parameters'])

        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.envs.observation_space.shape, self.envs.action_space,
                                       actor_critic.recurrent_hidden_state_size)
        obs = self.envs.reset()
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)
        self.episode_rewards = deque(maxlen=50)
        self.writer = SummaryWriter(
            comment="{}-{}".format(self.env_name, self.algorithm))

    def run(self):
        start = time.time()
        for epoch in range(self.epochs):
            value_losses, action_losses, dist_entropies = [], [], []
            print("\nEpoch %d\n-------" % (epoch + 1))
            for j in trange(self.num_updates_per_epoch, leave=False):
                for step in range(self.num_steps):
                    # Sample actions
                    with torch.no_grad():
                        value, action, action_log_prob, recurrent_hidden_states = self.policy.act(
                            self.rollouts.obs[step],
                            self.rollouts.recurrent_hidden_states[step],
                            self.rollouts.masks[step])

                    # Observe reward and next obs
                    obs, reward, done, infos = self.envs.step(action)
                    for info in infos:
                        if 'episode' in info.keys():
                            print("New episode")
                            self.episode_rewards.append(info['episode']['r'])

                    # If done then clean the history of observations.
                    masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                               for done_ in done])
                    self.rollouts.insert(obs, recurrent_hidden_states,
                                         action, action_log_prob, value, reward, masks)

                with torch.no_grad():
                    next_value = self.policy.get_value(self.rollouts.obs[-1],
                                                       self.rollouts.recurrent_hidden_states[-1],
                                                       self.rollouts.masks[-1]).detach()

                self.rollouts.compute_returns(
                    next_value, self.use_gae, self.gamma, self.tau)
                value_loss, action_loss, dist_entropy = self.agent.update(
                    self.rollouts)
                value_losses.append(value_loss)
                action_losses.append(action_loss)
                dist_entropies.append(dist_entropy)

                self.rollouts.after_update()

                total_num_steps = (epoch + 1) * (j + 1) * \
                    self.num_processes * self.num_steps

            end = time.time()
            print("Total timesteps: {}, FPS: {}".format(
                total_num_steps, int(total_num_steps / (end - start))))
            print("Statistic of the last %d episodes played" %
                  len(self.episode_rewards))
            if len(self.episode_rewards) < 1:
                self.episode_rewards.append(0)
            episode_rewards_np = np.array(self.episode_rewards)
            value_losses = np.array(value_losses)
            action_losses = np.array(action_losses)
            dist_entropies = np.array(dist_entropies)
            print("Mean value loss: {}, Mean action loss: {}, Mean entropy: {}".format(
                value_losses.mean(), action_losses.mean(), dist_entropies.mean()))
            print(episode_rewards_np)
            print("Results: mean: {} +/- {}".format(np.mean(episode_rewards_np), np.std(episode_rewards_np)))
            print("Min: {}, Max: {}, Median: {}".format(np.min(episode_rewards_np), np.max(episode_rewards_np), np.median(episode_rewards_np)))

            self.writer.add_scalar(
                'value_loss/mean', value_losses.mean(), epoch)
            self.writer.add_scalar(
                'action_loss/mean', action_losses.mean(), epoch)
            self.writer.add_scalar(
                'dist_entropy/mean', dist_entropies.mean(), epoch)
            self.writer.add_scalar(
                'reward/mean', episode_rewards_np.mean(), epoch)
            self.writer.add_scalar(
                'reward/max', episode_rewards_np.max(), epoch)
            self.writer.add_scalar(
                'reward/min', episode_rewards_np.min(), epoch)

            if (epoch + 1) % self.test_every_n_epochs == 0:
                print("\nTesting...")
                bar = tqdm(total=self.num_test_episodes, leave=False)
                eval_envs = make_vec_envs(self.env_name, self.seed + self.num_processes,
                                          self.num_processes, self.gamma, self.eval_log_dir,
                                          self.device,
                                          True,
                                          self.grayscale, self.skip_frame, self.reward_scaling, num_frame_stack=self.num_frame_stack)
                vec_norm = get_vec_normalize(eval_envs)
                if vec_norm is not None:
                    vec_norm.eval()
                    vec_norm.ob_rms = get_vec_normalize(self.envs).ob_rms
                eval_episode_rewards = []
                obs = eval_envs.reset()
                eval_recurrent_hidden_states = torch.zeros(self.num_processes,
                                                           self.policy.recurrent_hidden_state_size, device=self.device)
                eval_masks = torch.zeros(
                    self.num_processes, 1, device=self.device)

                while len(eval_episode_rewards) < self.num_test_episodes:
                    with torch.no_grad():
                        _, action, _, eval_recurrent_hidden_states = self.policy.act(
                            obs, eval_recurrent_hidden_states, eval_masks, deterministic=self.use_deterministic_policy_while_testing)
                    # Observe reward and next obs
                    obs, reward, done, infos = eval_envs.step(action)
                    eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                    for done_ in done])

                    for info in infos:
                        if 'episode' in info.keys():
                            bar.update(1)
                            eval_episode_rewards.append(
                                info['episode']['r'])
                eval_envs.close()
                bar.close()
                print(eval_episode_rewards)
                print(" Evaluation using {} episodes: mean reward {:.5f}, min/max {}/{}\n".
                      format(len(eval_episode_rewards),
                             np.mean(eval_episode_rewards), np.min(eval_episode_rewards), np.max(eval_episode_rewards)))

            print("Total elapsed time: %.2f minutes" %
                  ((time.time() - start) / 60.0))
            if self.scheduler is not None:
                print("Decreasing the learning rate...")
                self.scheduler.step()

            print("Saving the model...")
            save_path = os.path.join(self.save_dir, self.algorithm)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            save_model = self.policy
            if self.device.type == "cuda":
                save_model = copy.deepcopy(self.policy).cpu()
            save_model = [save_model,
                          getattr(get_vec_normalize(self.envs), 'ob_rms', None)]
            torch.save(save_model, os.path.join(
                save_path, self.env_name + ".pt"))
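The checkpoint written above pairs a CPU copy of the policy with the ob_rms running statistics of the training VecNormalize wrapper. Restoring both for later evaluation might look roughly like the sketch below; it reuses make_vec_envs and get_vec_normalize from the example, and the environment name, path, and arguments are illustrative, not taken from the original code:

import os
import torch

# Load the [policy, ob_rms] pair saved by Runner (path and env name are illustrative).
policy, ob_rms = torch.load(os.path.join("trained_models", "A2C", "PongNoFrameskip-v4.pt"))

# Same positional arguments as the eval_envs call in Runner, with example values.
eval_envs = make_vec_envs("PongNoFrameskip-v4", 1, 1, 0.99, "/tmp/eval_log",
                          torch.device("cpu"), True, True, 4, 1.0, num_frame_stack=4)

vec_norm = get_vec_normalize(eval_envs)
if vec_norm is not None:
    # Freeze normalization and reuse the statistics collected during training.
    vec_norm.eval()
    vec_norm.ob_rms = ob_rms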