Example #1
def main():
    os.environ['OMP_NUM_THREADS'] = '1'

    envs = [make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    obs_numel = reduce(operator.mul, obs_shape, 1)

    actor_critic = Policy(obs_numel, envs.action_space)

    # Maxime: log some info about the model and its size
    modelSize = 0
    for p in actor_critic.parameters():
        pSize = reduce(operator.mul, p.size(), 1)
        modelSize += pSize
    print(str(actor_critic))
    print('Total model size: %d' % modelSize)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True)
            )
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            elif current_obs.dim() == 3:
                current_obs *= masks.unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True)
        )[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[:-1].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape))
            )

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch),
                        Variable(states_batch),
                        Variable(masks_batch),
                        Variable(actions_batch)
                    )

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) - values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                            hasattr(envs, 'ob_rms') and envs.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(
                    j,
                    total_num_steps,
                    int(total_num_steps / (end - start)),
                    final_rewards.mean(),
                    final_rewards.median(),
                    final_rewards.min(),
                    final_rewards.max(), dist_entropy.data[0],
                    value_loss.data[0], action_loss.data[0]
                )
            )

        if args.vis and j % args.vis_interval == 0:
            win = visdom_plot(
                total_num_steps,
                final_rewards.mean()
            )
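# For reference, a minimal self-contained sketch of the clipped surrogate computed in the PPO
# branch above (L^CLIP). The tensors here are hypothetical stand-ins; only torch is assumed.
import torch

def ppo_clipped_loss(new_log_probs, old_log_probs, advantages, clip_param=0.2):
    # probability ratio between the current and old policy
    ratio = torch.exp(new_log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    # pessimistic surrogate, negated so it can be minimized by the optimizer
    return -torch.min(surr1, surr2).mean()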
Example #2
def main():
    device = 'cpu'
    acc_steps = []
    acc_scores = []
    torch.set_num_threads(1)

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space)

    actor_critic.to(device)

    agent = PPO(actor_critic,
                args.clip_param,
                args.ppo_epoch,
                args.num_mini_batch,
                args.value_loss_coef,
                args.entropy_coef,
                lr=args.lr,
                eps=args.eps,
                max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    episode_rewards = collections.deque(maxlen=10)

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            update_linear_schedule(agent.optimizer, j, num_updates, args.lr)
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        # Prepare demos
        demo_actions = np.zeros(
            (1, args.num_processes, envs.action_space.shape[0]))
        demo_states = np.zeros(
            (1, args.num_processes, envs.observation_space.shape[0]))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(
                    rollouts.obs[step], rollouts.masks[step])

            # record the demo action and state before stepping
            demo_actions = np.concatenate(
                [demo_actions,
                 action.reshape(1, args.num_processes, -1)], 0)
            demo_states = np.concatenate([
                demo_states, rollouts.obs[step].reshape(
                    1, args.num_processes, -1)
            ], 0)

            # do one step
            obs, reward, done, infos = envs.step(action)

            if step > 1 and step % 1000 == 0:
                done = [True for _ in range(args.num_processes)]

            for info in infos:
                # if 'episode' in info.keys():
                #  episode_rewards.append(info['episode']['r'])
                r = 0
                for key, val in info.items():
                    if 'reward' in key:
                        r += val
                episode_rewards.append(r)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, action, action_log_prob,\
                            value, reward, masks)

        # Save demos:
        action_file_name = args.demos_dir + '/actions_step_' + str(j) + '.npy'
        state_file_name = args.demos_dir + '/states_step_' + str(j) + '.npy'
        policy_file_name = args.demos_dir + '/policy_step_' + str(j) + '.pth'
        np.save(action_file_name, demo_actions[1:])
        np.save(state_file_name, demo_states[1:])
        torch.save(actor_critic.state_dict(), policy_file_name)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir:
            save_path = os.path.join(args.save_dir, 'ppo')
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + '.pt'))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            print('Updates', j, 'num timesteps', total_num_steps,
                  '\n Last training episodes: mean/median reward',
                  '{:.1f}'.format(np.mean(episode_rewards)),
                  '/{:.1f}'.format(np.median(episode_rewards)),
                  'min/max reward', '{:.1f}'.format(np.min(episode_rewards)),
                  '/{:.1f}'.format(np.max(episode_rewards)), 'dist entropy',
                  dist_entropy, 'value loss', value_loss, 'action loss',
                  action_loss)

        if len(episode_rewards) > 1:
            acc_steps.append(total_num_steps)
            acc_scores.append(np.mean(episode_rewards))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _ = actor_critic.act(obs,
                                                    eval_masks,
                                                    deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print('Evaluation using', len(eval_episode_rewards),
                  'episodes: mean reward',
                  '{:.5f}\n'.format(np.mean(eval_episode_rewards)))

    scores_file_name = args.scores_dir + '/learner_scores_' + args.expe + '.npy'
    steps_file_name = args.scores_dir + '/learner_steps_' + args.expe + '.npy'
    np.save(scores_file_name, np.array(acc_scores))
    np.save(steps_file_name, np.array(acc_steps))
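# update_linear_schedule is called above but not shown; a plausible minimal sketch of such a
# helper (the project's actual implementation may differ) that anneals the optimizer's lr to 0.
def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
    # linearly decay the learning rate from initial_lr to 0 over training
    lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr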
Example #3
def main():

    is_limit_action = True
    # is_limit_action = False
    args_cuda = True
    # args_cuda = False

    torch.manual_seed(args_seed)
    torch.cuda.manual_seed_all(args_seed)

    device = torch.device("cuda:0" if args_cuda else "cpu")

    train_log = Log(log_name+'_train_log')
    evl_log = Log(log_name+'_evaluation_log')
    torch.set_num_threads(1)
    envs = make_vec_envs(
        args_env_name,
        args_seed,
        args_num_processes,
        device,
        gamma=args_gamma)
    if is_limit_action:
        envs.action_space.n = 3
    print('Number of Actions:', envs.action_space.n)

    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space)
    actor_critic.to(device)

    agent = PPO(
        actor_critic,
        args_clip_param,
        args_ppo_epoch,
        args_num_mini_batch,
        args_value_loss_coef,
        args_entropy_coef,
        lr=args_lr,
        eps=args_eps,
        max_grad_norm=args_max_grad_norm,
        use_clipped_value_loss=args_use_clipped_value_loss)

    rollouts = RolloutStorage(
        args_num_steps,
        args_num_processes,
        envs.observation_space.shape,
        envs.action_space)


    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    # print(obs)
    # ss('i am over it')
    num_updates = int(
        args_num_env_steps) // args_num_steps // args_num_processes

    episode_rewards = deque(maxlen=10)
    start = time.time()
    sum_re = torch.zeros(args_num_processes, 1)

    for j in range(num_updates):

        for step in range(args_num_steps):
            with torch.no_grad():
                value, action, action_log_prob\
                    = actor_critic.act(rollouts.obs[step])
            # print(action)
            # print()
            # action = action + 1
            # print(action)
            # ss('hoiohasdfhioas')
            if is_limit_action:
                obs, reward, done, infos = envs.step(action+1)
            else:
                obs, reward, done, infos = envs.step(action)
            sum_re += reward

            if any(done):

                for i in range(len(done)):
                    if done[i]:
                        episode_rewards.append(sum_re[i].item())
                        # print(done)
                        # print(sum_re[i])
                        sum_re[i] *= 0
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, action,
                            action_log_prob,
                            value, reward,
                            masks, bad_masks)
        with torch.no_grad():

            next_value = actor_critic.get_value(
                rollouts.obs[-1])

        rollouts.compute_returns(next_value,
                                 args_gamma,
                                 args_use_gae,
                                 args_gae_lambda)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        if j % args_log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            end = time.time()
            logstring = "E {}, N_steps {}, FPS {} mean/median" \
                        " {:.1f}/{:.1f}, min/max {:.1f}/{:.1f}" \
                        " Entropy {:.5f},V {:.5f},Action {:.5f}".format(
                j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            np.mean(episode_rewards),
                            np.median(episode_rewards), np.min(episode_rewards),
                            np.max(episode_rewards),
                            dist_entropy, value_loss,
                            action_loss)
            # print(logstring)
            train_log.log(logstring)
        # if True:
        if (args_eval_interval is not None and len(episode_rewards) > 1
                and j % args_eval_interval == 0):
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            ob_rms = get_vec_normalize(envs).ob_rms
            ev_result = evaluate(actor_critic, ob_rms, args_env_name, args_seed,
                     args_num_processes, device, is_limit_action=is_limit_action)
            ev_log_string = 'steps:'+str(total_num_steps)+'. '+ev_result
            evl_log.log(ev_log_string)
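# rollouts.compute_returns hides the GAE computation used above; a standalone sketch, assuming
# masks[t] is 0.0 exactly where the episode ended at step t (shapes [num_steps, num_procs, 1]).
import torch

def compute_gae_returns(rewards, values, masks, next_value, gamma=0.99, gae_lambda=0.95):
    values = torch.cat([values, next_value.unsqueeze(0)], dim=0)  # append bootstrap value
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for step in reversed(range(rewards.size(0))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * gae_lambda * masks[step] * gae
        returns[step] = gae + values[step]
    return returns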
Example #4
def make_env():
    return gym.make(ENV_NAME)


# Parallelize environments
envs = [make_env for i in range(N_ENVS)]

envs = SubprocVecEnv(envs)

envs = VecNormalize(envs, gamma=GAMMA)

obs_shape = envs.observation_space.shape
# Print observation space so we know what we are dealing with.
print('Obs shape', obs_shape)

policy = Policy(obs_shape, envs.action_space)

optimizer = optim.Adam(policy.parameters(), lr=LR, eps=EPS)

# Initialize the tensor we will use every time for the observation. See the note
# in update_current_obs for more.
current_obs = torch.zeros(N_ENVS, *obs_shape)
obs = envs.reset()


def update_current_obs(obs):
    # we want to use the same tensor every time so just copy it over.
    obs = torch.from_numpy(obs).float()
    current_obs[:, :] = obs
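# VecNormalize (used above) normalizes observations with running statistics; a rough sketch of
# the idea, not the baselines implementation itself.
import numpy as np

class RunningMeanStd:
    def __init__(self, shape):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = 1e-4

    def update(self, x):
        # parallel (Chan et al.) update of running mean/variance with a batch of observations
        batch_mean, batch_var, batch_count = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        tot = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot
        m2 = self.var * self.count + batch_var * batch_count \
            + delta ** 2 * self.count * batch_count / tot
        self.mean, self.var, self.count = new_mean, m2 / tot, tot

def normalize_obs(obs, rms, clip=10.0):
    # standardize and clip, as a VecNormalize-style wrapper would
    return np.clip((obs - rms.mean) / np.sqrt(rms.var + 1e-8), -clip, clip)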

Example #5
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                        args.gamma, args.log_dir, args.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                               eps=args.eps,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                        envs.observation_space.shape, envs.action_space,
                        actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                          getattr(get_vec_normalize(envs), 'ob_rms', None)]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       len(episode_rewards),
                       np.mean(episode_rewards),
                       np.median(episode_rewards),
                       np.min(episode_rewards),
                       np.max(episode_rewards), dist_entropy,
                       value_loss, action_loss))

        if (args.eval_interval is not None
                and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(
                args.env_name, args.seed + args.num_processes, args.num_processes,
                args.gamma, eval_log_dir, args.add_timestep, device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(args.num_processes,
                            actor_critic.recurrent_hidden_state_size, device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".
                format(len(eval_episode_rewards),
                       np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
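# The "really ugly" save path above pickles the whole module; a common alternative (a sketch,
# under the assumption that only the weights and ob_rms need to survive) saves a state dict.
import os
import torch

def save_checkpoint(actor_critic, ob_rms, save_dir, env_name):
    os.makedirs(save_dir, exist_ok=True)
    torch.save({'state_dict': actor_critic.state_dict(),  # weights only, device-agnostic
                'ob_rms': ob_rms},                         # normalization stats from training
               os.path.join(save_dir, env_name + '.pt'))

# loading later: checkpoint = torch.load(path, map_location='cpu')
#                actor_critic.load_state_dict(checkpoint['state_dict'])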
Example #6
env = make_vec_envs(args.env_name,
                    args.seed + 1000,
                    1,
                    None,
                    None,
                    args.add_timestep,
                    device='cpu',
                    allow_early_resets=False,
                    args=args)

# Get a render function
# render_func = get_render_func(env)

# We need to use the same statistics for normalization as used in training
actor_critic = Policy(env.observation_space.shape, env.action_space, args=args)

torch.nn.Module.dump_patches = True
actor_critic, ob_rms = \
            torch.load(os.path.join(args.load_dir, args.env_name + ".pt"))
if args.active_column is not None:
    actor_critic.base.active_column = args.active_column
    actor_critic.base.global_drop = True
vec_norm = get_vec_normalize(env)
if vec_norm is not None:
    vec_norm.eval()
    vec_norm.ob_rms = ob_rms

recurrent_hidden_states = torch.zeros(1,
                                      actor_critic.recurrent_hidden_state_size)
masks = torch.zeros(1, 1)
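# A sketch of how such a restored policy is typically rolled out, reusing the env, actor_critic,
# recurrent_hidden_states and masks defined above (mirrors the evaluation loops in the other
# examples; a single vectorized environment is assumed, rendering left commented out).
obs = env.reset()
while True:
    with torch.no_grad():
        _, action, _, recurrent_hidden_states = actor_critic.act(
            obs, recurrent_hidden_states, masks, deterministic=True)

    obs, reward, done, _ = env.step(action)
    masks.fill_(0.0 if done[0] else 1.0)

    # if render_func is not None:
    #     render_func('human')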
Example #7
class PPO_agent:
    def __init__(self, params):
        self.params = params
        self.net = Policy(4, 2) # state_size, action_size
        #self.net = Policy(params.state_size, params.action_size) # state_size, action_size
        
        if self.params.cuda:
            print("network is moved to cuda")
            self.net.cuda()
        
        self.optimizer = Adam(self.net.parameters(), lr = params.lr)
    
    # This method selects an action for the given input state.
    # It returns the action, log_prob and value for that state
    # according to the current policy.
    def select_action(self, state):
        state = torch.FloatTensor(state).to(device)
        action, log_prob, value = self.net.select_action(state)
        
        return action, log_prob, value
    
    # Calculating the surrogate objective for PPO:
    # - normalizing the advantage
    # - calculating log_prob(s, a) under the current policy
    # - calculating the ratio new_log_prob / old_log_prob (in log space)
    # - clipping the above ratio
    # - taking the min of the clipped/unclipped surrogates (policy_loss)
    # - calculating the remaining losses: value_loss / entropy_loss
    # - backpropagating the combined loss
    # - taking one optimizer step
    def evaluate_data(self, experience):
        # unpacking given experience data
        states, actions, rewards, dones, old_log_probs, values, gae_returns = experience
        
        # compute advantages against the value baseline, then normalize
        advantages = gae_returns - values
        advantages = (advantages - advantages.mean())/(advantages.std() + 1e-5)
        
        # changing all data into tensors of size (-1, 1)
        states = torch.FloatTensor(states).view(-1, 1).to(device)
        actions = torch.FloatTensor(actions).view(-1, 1).to(device)
        #old_log_probs = torch.FloatTensor(old_log_probs).view(-1, 1).to(device)
        advantages = torch.FloatTensor(advantages).view(-1, 1).to(device)
        gae_returns = torch.FloatTensor(gae_returns).view(-1, 1).to(device)
        values = torch.FloatTensor(values).view(-1, 1).to(device)
        
        # calculating new log_prob for the given (s, a) under the current policy
        new_log_probs, new_values, entropys = self.net.evaluate_inputs(states, actions)
        # calculating ratio
        ratio = torch.exp(new_log_probs - old_log_probs)
        ratio_without_clipping = ratio*advantages
        # clipping ratio
        clipped_ratio = torch.clamp(ratio, 1.0 - self.params.clipping_value, 1.0 + self.params.clipping_value)*advantages
        # taking min of both surrogates; negated because the optimizer minimizes (policy_loss)
        policy_loss = -torch.min(ratio_without_clipping, clipped_ratio).mean()
        
        # calculation losses (returns - values)^2
        value_loss = (gae_returns - new_values).pow(2).mean()
        # entropy bonus = mean entropy * scaling value, negated so higher entropy lowers the loss
        entropy_loss = -entropys.mean()*self.params.entropy_beta
        
        # backpropagation
        # zeroing gradient
        self.optimizer.zero_grad()
        # backpropagation
        (policy_loss + value_loss + entropy_loss).backward()
        # clipping gradient (TODO, see the sketch after this class)
        # optimizer step to apply the gradient
        self.optimizer.step()
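# The gradient-clipping TODO above can be filled with torch's built-in utility; a sketch assuming
# a hypothetical max_grad_norm hyperparameter in params.
import torch.nn as nn

def clipped_step(optimizer, parameters, max_grad_norm=0.5):
    # call after loss.backward(): clip the global gradient norm, then apply the update
    nn.utils.clip_grad_norm_(parameters, max_grad_norm)
    optimizer.step()

# usage inside evaluate_data, replacing the bare optimizer.step():
#     clipped_step(self.optimizer, self.net.parameters(), self.params.max_grad_norm)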
Example #8
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        vizz = Visdom(port=args.port)
        win = None
        winloss = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    # Initialize the backward (bw) model
    if args.bw:
        bw_model = bw_module(actor_critic, args, agent.optimizer,
                             envs.action_space, envs.observation_space)
    vis_timesteps = []
    vis_loss = []

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

            # Add stuff to the Buffer
            if args.bw:
                bw_model.step(rollouts.obs[step].detach().cpu().numpy(),
                              action.detach().cpu().numpy(),
                              reward.detach().cpu().numpy(), done,
                              obs.detach().cpu().numpy())

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # Do BW STEPS
        if args.bw and (j % args.n_a2c == 0):
            if not args.consistency:
                l_bw, l_imi = 0.0, 0.0
                for _ in range(args.n_bw):
                    l_bw += bw_model.train_bw_model(j)
                l_bw /= args.n_bw
                for _ in range(args.n_imi):
                    l_imi += bw_model.train_imitation(j)
                l_imi /= args.n_imi
            else:
                l_bw, l_fw = 0.0, 0.0
                for _ in range(args.n_bw):
                    l_bw_, l_fw_ = bw_model.train_bw_model(j)
                    l_bw += l_bw_
                    l_fw += l_fw_
                l_bw /= args.n_bw
                l_fw /= args.n_bw
                l_imi, l_cons = 0.0, 0.0
                for _ in range(args.n_imi):
                    l_imi_, l_cons_ = bw_model.train_imitation(j)
                    l_imi += l_imi_
                    l_cons += l_cons_
                l_imi /= args.n_imi
                l_cons /= args.n_imi

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                env_name = args.env_name
                if args.bw:
                    env_name += 'BW'
                win = visdom_plot(viz, win, args.log_dir, env_name, args.algo,
                                  args.num_frames)
            except IOError:
                pass

        # Save to Visdom Plots
        if args.vis and (j % args.vis_interval == 0):
            if args.bw and args.consistency:
                vis_loss.append(
                    [value_loss, action_loss, l_bw, l_imi, l_fw, l_cons])
                legend = [
                    'Value loss', 'Action loss', 'BW Loss', 'IMI loss',
                    'FW Loss', 'CONST loss'
                ]
                title = args.env_name + '-' + 'bw' + '-' + 'consistency' + args.title
            elif args.bw:
                vis_loss.append([value_loss, action_loss, l_bw, l_imi])
                legend = ['Value loss', 'Action loss', 'BW Loss', 'IMI loss']
                title = args.env_name + '-' + 'bw' + args.title
            else:
                vis_loss.append([value_loss, action_loss])
                legend = ['Value loss', 'Action loss']
                title = args.env_name + '-' + 'vanilla'
            vis_timesteps.append(
                (j + 1) * (args.num_processes * args.num_steps))
            # vis_rewards.append(final_rewards.mean())
            # vis_rewards.append(np.mean(reward_queue))

            # if win is None:
            #     win = vizz.line(Y=np.array(vis_rewards), X=np.array(vis_timesteps), opts=dict(title=title, xlabel='Timesteps',
            #                 ylabel='Avg Rewards'))
            # vizz.line(Y=np.array(vis_rewards), X=np.array(vis_timesteps), win=win, update='replace', opts=dict(title=title, xlabel='Timesteps',
            #                 ylabel='Avg Rewards'))
            if winloss is None:
                winloss = vizz.line(Y=np.array(vis_loss),
                                    X=np.array(vis_timesteps),
                                    opts=dict(title=title,
                                              xlabel='Timesteps',
                                              ylabel='Losses',
                                              legend=legend))
            vizz.line(Y=np.array(vis_loss),
                      X=np.array(vis_timesteps),
                      win=winloss,
                      update='replace',
                      opts=dict(title=title,
                                xlabel='Timesteps',
                                ylabel='Losses',
                                legend=legend))
Example #9
def main():
    torch.set_num_threads(1)
    envs = make_vec_envs(args_env_name, args_seed, args_num_processes)
    actor_critic = Policy(envs.observation_space.shape, envs.action_space)
    agent = PPO(actor_critic,
                args_clip_param,
                args_ppo_epoch,
                args_num_mini_batch,
                args_value_loss_coef,
                args_entropy_coef,
                lr=args_lr,
                eps=args_eps,
                max_grad_norm=args_max_grad_norm)
    rollouts = RolloutStorage(args_num_steps, args_num_processes,
                              envs.observation_space.shape, envs.action_space)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)

    num_updates = int(
        args_num_env_steps) // args_num_steps // args_num_processes

    episode_rewards = deque(maxlen=10)
    start = time.time()
    sum_re = torch.zeros(args_num_processes, 1)

    for j in range(num_updates):

        for step in range(args_num_steps):
            with torch.no_grad():
                value, action, action_log_prob\
                    = actor_critic.act(rollouts.obs[step])

            obs, reward, done, infos = envs.step(action)
            sum_re += reward

            if any(done):

                for i in range(len(done)):
                    if done[i]:
                        episode_rewards.append(sum_re[i].item())
                        sum_re[i] *= 0
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, action, action_log_prob, value, reward, masks,
                            bad_masks)
        with torch.no_grad():

            next_value = actor_critic.get_value(rollouts.obs[-1])

        rollouts.compute_returns(next_value, args_gamma)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        if j % args_log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            end = time.time()
            print(
                "E {}, N_steps {}, FPS {}"
                " mean/median {:.1f}/{:.1f}, min/max {:.1f}/{:.1f} Ent {:.4f},V {:.4f},A {:.4f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        np.mean(episode_rewards), np.median(episode_rewards),
                        np.min(episode_rewards), np.max(episode_rewards),
                        dist_entropy, value_loss, action_loss))
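# Example #9's compute_returns takes only gamma; a sketch of plain discounted returns with
# episode masks (masks[t] is 0.0 where the episode ended at step t), bootstrapping from next_value.
import torch

def compute_discounted_returns(rewards, masks, next_value, gamma=0.99):
    returns = torch.zeros_like(rewards)
    running = next_value
    for step in reversed(range(rewards.size(0))):
        running = rewards[step] + gamma * masks[step] * running
        returns[step] = running
    return returns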
Example #10
    #############################################
    test_pf = True
    action = np.full((parallel_env), 1)
    total_gym_time = 0
    total_vitis_time = 0
    total_openCL_time = 0

    test_data = RL_data()

    list_rewards = []
    list_episodes = []
    rew_list = []
    obs_list = []
    iteration = 0

    policy = Policy()
    policy.load_state_dict(torch.load('params_6_15.ckpt'))
    policy.eval()

    opt = torch.optim.Adam(policy.parameters(), lr=1e-3)

    w1nparray = policy.layers[0].weight.detach().numpy()[:, :]
    w1nparray_part = w1nparray.reshape((4, 128, 12000))
    w2nparray = policy.layers[2].weight.detach().numpy()

    obs = env.reset()
    # policy=policy.to(device)

    test_data.doneVec = np.full((parallel_env), False)
    rew = np.full(shape=(parallel_env), fill_value=0, dtype=float)
Example #11
File: main.py  Project: mklissa/phi_gcn
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    run_id = "alpha{}".format(args.gcn_alpha)
    if args.use_logger:
        from utils import Logger
        folder = "{}/{}".format(args.folder, run_id)
        logger = Logger(algo_name=args.algo,
                        environment_name=args.env_name,
                        folder=folder,
                        seed=args.seed)
        logger.save_args(args)

        print("---------------------------------------")
        print('Saving to', logger.save_folder)
        print("---------------------------------------")

    else:
        print("---------------------------------------")
        print('NOTE : NOT SAVING RESULTS')
        print("---------------------------------------")
    all_rewards = []

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          args.env_name,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size,
                              actor_critic.base.output_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    ############################
    # GCN Model and optimizer
    from pygcn.train import update_graph
    from pygcn.models import GCN
    gcn_model = GCN(nfeat=actor_critic.base.output_size, nhid=args.gcn_hidden)
    gcn_model.to(device)
    gcn_optimizer = optim.Adam(gcn_model.parameters(),
                               lr=args.gcn_lr,
                               weight_decay=args.gcn_weight_decay)
    gcn_loss = nn.NLLLoss()
    gcn_states = [[] for _ in range(args.num_processes)]
    Gs = [nx.Graph() for _ in range(args.num_processes)]
    node_ptrs = [0 for _ in range(args.num_processes)]
    rew_states = [[] for _ in range(args.num_processes)]
    ############################

    episode_rewards = deque(maxlen=100)
    avg_fwdloss = deque(maxlen=100)
    rew_rms = RunningMeanStd(shape=())
    delay_rew = torch.zeros([args.num_processes, 1])
    delay_step = torch.zeros([args.num_processes])

    start = time.time()
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob,\
                 recurrent_hidden_states, hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            delay_rew += reward
            delay_step += 1

            for idx, (info, hid,
                      eps_done) in enumerate(zip(infos, hidden_states, done)):

                if eps_done or delay_step[idx] == args.reward_freq:
                    reward[idx] = delay_rew[idx]
                    delay_rew[idx] = delay_step[idx] = 0
                else:
                    reward[idx] = 0

                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

                if args.gcn_alpha < 1.0:
                    gcn_states[idx].append(hid)
                    node_ptrs[idx] += 1
                    if not eps_done:
                        Gs[idx].add_edge(node_ptrs[idx] - 1, node_ptrs[idx])
                    if reward[idx] != 0. or eps_done:
                        rew_states[idx].append(
                            [node_ptrs[idx] - 1, reward[idx]])
                    if eps_done:
                        adj = nx.adjacency_matrix(Gs[idx]) if len(Gs[idx].nodes)\
                                        else sp.csr_matrix(np.eye(1,dtype='int64'))
                        update_graph(gcn_model, gcn_optimizer,
                                     torch.stack(gcn_states[idx]), adj,
                                     rew_states[idx], gcn_loss, args, envs)
                        gcn_states[idx] = []
                        Gs[idx] = nx.Graph()
                        node_ptrs[idx] = 0
                        rew_states[idx] = []

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks,
                            hidden_states)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau, gcn_model, args.gcn_alpha)
        agent.update(rollouts)
        rollouts.after_update()

        ####################### Saving and book-keeping #######################
        if (j % int(num_updates / 5.) == 0
                or j == num_updates - 1) and args.save_dir != "":
            print('Saving model')
            print()

            save_dir = "{}/{}/{}".format(args.save_dir, args.folder, run_id)
            save_path = os.path.join(save_dir, args.algo, 'seed' +
                                     str(args.seed)) + '_iter' + str(j)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            save_gcn = gcn_model
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
                save_gcn = copy.deepcopy(gcn_model).cpu()

            save_model = [
                save_gcn, save_model,
                hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + "ac.pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {}\
             training episodes: mean/median reward {:.2f}/{:.2f},\
              min/max reward {:.2f}/{:.2f}, success rate {:.2f}, avg fwdloss {:.2f}\n"
                  .format(
                      j,
                      total_num_steps,
                      int(total_num_steps / (end - start)),
                      len(episode_rewards),
                      np.mean(episode_rewards),
                      np.median(episode_rewards),
                      np.min(episode_rewards),
                      np.max(episode_rewards),
                      np.count_nonzero(np.greater(episode_rewards, 0)) /
                      len(episode_rewards),
                      np.mean(avg_fwdloss),
                  ))

            all_rewards.append(np.mean(episode_rewards))
            if args.use_logger:
                logger.save_task_results(all_rewards)
        ####################### Saving and book-keeping #######################

    envs.close()
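# update_graph above consumes a networkx adjacency matrix; a sketch of the symmetric normalization
# D^-1/2 (A + I) D^-1/2 commonly applied before a GCN forward pass (the pygcn preprocessing itself
# is not shown here and may differ).
import numpy as np
import scipy.sparse as sp

def normalize_adjacency(adj):
    adj = sp.coo_matrix(adj) + sp.eye(adj.shape[0])  # add self-loops
    deg = np.asarray(adj.sum(axis=1)).flatten()
    d_inv_sqrt = np.power(deg, -0.5)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.0
    d_mat = sp.diags(d_inv_sqrt)
    return d_mat @ adj @ d_mat  # symmetrically normalized adjacency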
Example #12
File: agent.py  Project: Kavka1/RL
class TD3_agent(object):
    def __init__(self, config: Dict) -> None:
        super(TD3_agent, self).__init__()

        self.lr = config['lr']
        self.gamma = config['gamma']
        self.tau = config['tau']
        self.noise_std = config['noise_std']
        self.noise_clip = config['noise_clip']
        self.a_max = config['a_max']
        self.a_min = config['a_min']
        self.batch_size = config['batch_size']
        self.update_delay = config['update_delay']
        self.device = torch.device(config['device'])

        self.policy = Policy(config).to(self.device)
        self.policy_target = Policy(config).to(self.device)

        self.twin_q = Twin_Q(config).to(self.device)
        self.twin_q_target = Twin_Q(config).to(self.device)

        self.optimizer_pi = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.optimizer_q = optim.Adam(self.twin_q.parameters(), lr=self.lr)

        self.policy_target.load_state_dict(self.policy.state_dict())
        self.twin_q_target.load_state_dict(self.twin_q.state_dict())

        self.memory = Memory(config['memory_size'])

    def choose_action(self, obs: np.ndarray, use_noise: bool = True) -> np.ndarray:
        obs = torch.from_numpy(obs).to(self.device).float()
        with torch.no_grad():
            action = self.policy(obs)
            if use_noise:
                noise = torch.randn_like(action,
                                         dtype=torch.float,
                                         device=self.device) * self.noise_std
                action = action + noise
        action = np.clip(action.cpu().numpy(), self.a_min, self.a_max)
        return action

    def get_target_action(self, next_obs_batch: torch.Tensor) -> torch.Tensor:
        target_action = self.policy_target(next_obs_batch)
        noise = (torch.randn_like(target_action) * self.noise_std).clamp(
            -self.noise_clip, self.noise_clip)
        return target_action + noise

    def update_critic(self, obs_batch: torch.Tensor, a_batch: torch.Tensor,
                      next_obs_batch: torch.Tensor, r_batch: torch.Tensor,
                      done_batch: torch.Tensor) -> float:
        next_action_target = self.get_target_action(next_obs_batch)
        Q1_next, Q2_next = self.twin_q_target(next_obs_batch,
                                              next_action_target)
        Q_target = r_batch + (1 - done_batch) * self.gamma * torch.min(
            Q1_next, Q2_next)
        Q1_predict, Q2_predict = self.twin_q(obs_batch, a_batch)

        loss_critic = F.mse_loss(Q1_predict, Q_target) + F.mse_loss(
            Q2_predict, Q_target)
        self.optimizer_q.zero_grad()
        loss_critic.backward()
        self.optimizer_q.step()

        return loss_critic.item()

    def update_actor(self, obs_batch: torch.Tensor) -> float:
        loss_actor = -self.twin_q.Q1_value(obs_batch,
                                           self.policy(obs_batch)).mean()
        self.optimizer_pi.zero_grad()
        loss_actor.backward()
        self.optimizer_pi.step()

        return loss_actor.item()

    def update_target(self) -> None:
        soft_update(self.policy, self.policy_target, self.tau)
        soft_update(self.twin_q, self.twin_q_target, self.tau)

    def update(self, step: int) -> Tuple[float, float]:
        batch = self.memory.sample(batch_size=self.batch_size)
        o, a, r, o_, done = batch
        o = torch.from_numpy(np.array(o)).to(self.device).float()
        a = torch.from_numpy(np.array(a)).to(self.device).float()
        r = torch.from_numpy(np.array(r)).to(self.device).float()
        o_ = torch.from_numpy(np.array(o_)).to(self.device).float()
        done = torch.from_numpy(np.array(done)).to(self.device).int()

        loss_critic = self.update_critic(o, a, o_, r, done)
        loss_actor = 0.

        if step % self.update_delay == 0:
            loss_actor = self.update_actor(o)
            self.update_target()

        return loss_actor, loss_critic

    def save_transition(self, transition: List) -> None:
        self.memory.save_trans(transition)
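update_target above delegates to a soft_update helper imported elsewhere in the project. Below is a minimal sketch consistent with the call soft_update(online, target, tau), assuming the usual Polyak averaging; this is an assumption, not the project's actual implementation.

import torch
import torch.nn as nn

def soft_update(source: nn.Module, target: nn.Module, tau: float) -> None:
    # Polyak averaging: target <- tau * source + (1 - tau) * target
    # (argument order inferred from the calls in update_target above)
    with torch.no_grad():
        for s_param, t_param in zip(source.parameters(), target.parameters()):
            t_param.data.mul_(1.0 - tau).add_(tau * s_param.data)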
Example #13
0
def main():
    import copy
    import glob
    import os
    import time
    import matplotlib.pyplot as plt

    import gym
    import numpy as np
    import torch
    torch.multiprocessing.set_start_method('spawn')

    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    from gym.spaces import Discrete

    from arguments import get_args
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from envs import make_env
    from img_env_corner import ImgEnv, IMG_ENVS
    from model import Policy
    from storage import RolloutStorage
    from utils import update_current_obs, eval_episode
    from torchvision import transforms
    from visdom import Visdom

    import algo

    viz = Visdom(port=8097)

    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    plot_rewards = []
    plot_policy_loss = []
    plot_value_loss = []
    # x = np.array([0])
    # y = np.array([0])
    # counter = 0
    # win = viz.line(
    #     X=x,
    #     Y=y,
    #     win="test1",
    #     name='Line1',
    #     opts=dict(
    #         title='Reward',
    #     )
    #     )
    # win2 = viz.line(
    #     X=x,
    #     Y=y,
    #     win="test2",
    #     name='Line2',
    #     opts=dict(
    #         title='Policy Loss',
    #     )
    #     )
    # win3 = viz.line(
    #     X=x,
    #     Y=y,
    #     win="test3",
    #     name='Line3',
    #     opts=dict(
    #         title='Value Loss',
    #     )
    #     )

    args = get_args()
    if args.no_cuda:
        args.cuda = False
    print(args)
    assert args.algo in ['a2c', 'ppo', 'acktr']
    if args.recurrent_policy:
        assert args.algo in ['a2c', 'ppo'], \
            'Recurrent policy is not implemented for ACKTR'

    num_updates = int(args.num_frames) // args.num_steps // args.num_processes

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    toprint = ['seed', 'lr', 'nat', 'resnet']
    if args.env_name in IMG_ENVS:
        toprint += ['window', 'max_steps']
    toprint.sort()
    name = args.tag
    args_param = vars(args)
    os.makedirs(os.path.join(args.out_dir, args.env_name), exist_ok=True)
    for arg in toprint:
        if arg in args_param and (args_param[arg] or arg in ['gamma', 'seed']):
            if args_param[arg] is True:
                name += '{}_'.format(arg)
            else:
                name += '{}{}_'.format(arg, args_param[arg])
    model_dir = os.path.join(args.out_dir, args.env_name, args.algo)
    os.makedirs(model_dir, exist_ok=True)

    results_dict = {
        'episodes': [],
        'rewards': [],
        'args': args
    }
    torch.set_num_threads(1)
    eval_env = make_env(args, 'cifar10', args.seed, 1, None,
            args.add_timestep, natural=args.nat, train=False)
    envs = make_env(args, 'cifar10', args.seed, 1, None,
            args.add_timestep, natural=args.nat, train=True)
                
    #print(envs)
    # envs = envs[0]
    

    # if args.num_processes > 1:
    #     envs = SubprocVecEnv(envs)
    # else:
    #     envs = DummyVecEnv(envs)
    # eval_env = DummyVecEnv(eval_env)
    # if len(envs.observation_space.shape) == 1:
    #     envs = VecNormalize(envs, gamma=args.gamma)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy,
                          dataset=args.env_name, resnet=args.resnet,
                          pretrained=args.pretrained)
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                               eps=args.eps,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    action_space = envs.action_space
    if args.env_name in IMG_ENVS:
        action_space = np.zeros(2)
    # obs_shape = envs.observation_space.shape
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    obs = envs.reset()
    update_current_obs(obs, current_obs, obs_shape, args.num_stack)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        # envs.display_original(j)
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                        rollouts.observations[step],
                        rollouts.states[step],
                        rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)

            # envs.display_step(step, j)

            # print("OBS", obs)

            # print("REWARD", reward)
            # print("DONE", done)
            # print("INFO", info)


            reward = torch.from_numpy(np.expand_dims(np.stack([reward]), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in [done]])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs, current_obs, obs_shape, args.num_stack)
            rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks)

            # print("envs.curr_img SHAPE: ", envs.curr_img.shape)
            #display_state = envs.curr_img
            # display_state[:, envs.pos[0]:envs.pos[0]+envs.window, envs.pos[1]:envs.pos[1]+envs.window] = 5
            # display_state = custom_replace(display_state, 1, 0)
            # display_state[:, envs.pos[0]:envs.pos[0]+envs.window, envs.pos[1]:envs.pos[1]+envs.window] = \
            #     envs.curr_img[:, envs.pos[0]:envs.pos[0]+envs.window, envs.pos[1]:envs.pos[1]+envs.window]
            # img = transforms.ToPILImage()(display_state)
            # img.save("state_cifar/"+"state"+str(j)+"_"+str(step)+".png")

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        

        if j % args.save_interval == 0:
            torch.save((actor_critic.state_dict(), results_dict), os.path.join(
                model_dir, name + 'cifar_model_ppo_ex1_corner.pt'))

        if j % args.log_interval == 0:
            end = time.time()
            total_reward = eval_episode(eval_env, actor_critic, args)

            

            results_dict['rewards'].append(total_reward)
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, reward {:.1f} entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       np.mean(results_dict['rewards'][-10:]), dist_entropy,
                       value_loss, action_loss))


            plot_rewards.append(np.mean(results_dict['rewards'][-10:]))
            plot_policy_loss.append(action_loss)
            plot_value_loss.append(value_loss)


    plt.plot(range(len(plot_rewards)), plot_rewards)
    plt.savefig("rewards_corner.png")
    plt.close()

    
    plt.plot(range(len(plot_policy_loss)), plot_policy_loss)
    plt.savefig("policyloss_corner.png")
    plt.close()

    
    plt.plot(range(len(plot_value_loss)), plot_value_loss)
    plt.savefig("valueloss_corner.png")
    plt.close()
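The save call above writes a (state_dict, results_dict) tuple rather than a full model object. A hedged sketch of how it could be read back (the file name and usage here are illustrative; the real file name is prefixed with the run tag):

import torch

ckpt_path = "cifar_model_ppo_ex1_corner.pt"  # placeholder path
state_dict, results_dict = torch.load(ckpt_path, map_location="cpu")
# The weights go into a Policy built with the same constructor arguments as above:
# actor_critic.load_state_dict(state_dict); actor_critic.eval()
print("saved args:", results_dict['args'])
print("evaluation rewards logged so far:", results_dict['rewards'])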
Example #14
0
def main():
    torch.set_num_threads(1)
    device = torch.device("cpu")

    # if args.vis:
    #     from visdom import Visdom
    #     viz = Visdom(port=args.port)
    #     win = None

    # envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
    #                     args.gamma, args.log_dir, args.add_timestep, device, False)

    observation_space = Box(low=0, high=10000, shape=(26,), dtype=np.float32)  # Box(84,84,4)
    action_space = Discrete(7)  # Discrete(4)

    actor_critic = Policy(observation_space.shape, action_space, base_kwargs={'recurrent': None})
    actor_critic.to(device)

    # if args.algo == 'a2c':
    #     agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
    #                            args.entropy_coef, lr=args.lr,
    #                            eps=args.eps, alpha=args.alpha,
    #                            max_grad_norm=args.max_grad_norm)
    # elif args.algo == 'ppo':
    #     agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
    #                      args.value_loss_coef, args.entropy_coef, lr=args.lr,
    #                            eps=args.eps,
    #                            max_grad_norm=args.max_grad_norm)
    # elif args.algo == 'acktr':
    agent = algo.A2C_ACKTR(actor_critic, value_loss_coef=0.1,
                           entropy_coef=0.01, acktr=True)

    rollouts = RolloutStorage(8000, 1, observation_space.shape, action_space, actor_critic.recurrent_hidden_state_size)

    obs = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
    rollouts.obs[0].copy_(torch.Tensor(obs))
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    f = open('poktr_rtmdp_20_2.txt', 'w')
    f.write("\noriginal loss(schedule 6 packets):")
    start = time.time()
    for j in range(num_updates):  # num_updates
        net = Net()
        node_list, path_list = net.read_graph(net.node_list, net.path_list)
        startnode = node_list[0]  # starting node
        net.get_data(startnode)
        count = 0
        remove_count = 0  # counts the number of dropped packets
        end_time = startnode.messages[0].end_time
        pre_action_item = random.randint(0, 6)
        pre_action_item_oh = convert_one_hot(pre_action_item, 7)
        s = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, end_time, pre_action_item_oh]
        states = [[0], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]  # stores the state of every node
        ep_r = 0
        ep_acc_r = 0
        obs[:] = s
        reward_ten = torch.Tensor(1, 1)

        pre_value = torch.FloatTensor([[0.1]])
        pre_action = torch.Tensor([[random.randint(0, 6)]])
        pre_action_log_prob = torch.FloatTensor([[-1.]])
        pre_recurrent_hidden_states = torch.FloatTensor([[0.]])
        pre_masks = torch.FloatTensor([[0.]])
        for step in range(8000):
            # Sample actions
            count += 1
            old_action_log_prob = torch.Tensor([[0]])
            # print(rollouts, rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step])
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])
                action_item = action.item()  # convert the Tensor to a Python int
                action_item_oh = convert_one_hot(action_item, 7)

            # Observe reward and next obs
            obs, reward, done, states, remove_count, acc_r, su_packets = net.schedule(pre_action_item, count, states, node_list, path_list,
                                                                            remove_count)

            ep_r += reward
            ep_acc_r += acc_r
            reward_ten[[0]] = reward
            # for info in infos:
            #     if 'episode' in info.keys():
            #         episode_rewards.append(info['episode']['r'])
            obs.extend(pre_action_item_oh)
            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done else [1.0]])
            # print((obs), recurrent_hidden_states, torch.Tensor(action), type(action_log_prob), type(value), type(reward), type(masks))
            rollouts.insert(torch.Tensor(obs), recurrent_hidden_states, action, action_log_prob, value, reward_ten, masks)
            # rollouts.insert(torch.Tensor(obs), pre_recurrent_hidden_states, pre_action, pre_action_log_prob, pre_value, reward_ten, pre_masks)

            pre_action = action
            pre_action_item = action_item
            pre_action_log_prob = action_log_prob
            pre_recurrent_hidden_states = recurrent_hidden_states
            pre_value = value
            pre_action_item_oh = convert_one_hot(pre_action_item, 7)

        f.write("\ntime:"+str(time.strftime('%H:%M:%S', time.localtime(time.time())))+"|"+str(j)+"|ep_r:"+str(ep_r)+"|pakcets:"+str(su_packets)+"|remove:"+str(remove_count)+"|ep_acc_r:"+str(ep_acc_r / 8000))
        f.flush()
        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, False, 0.99, 0.95)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        print("time:", time.strftime('%H:%M:%S', time.localtime(time.time())), "|", j, "|ep_r:", ep_r, "|pakcets:",
              su_packets, "|remove:", remove_count, "|ep_acc_r:", ep_acc_r / 8000, "|value_loss:", value_loss,
              "|action_loss:", action_loss, "|entropy:", dist_entropy)
        rollouts.after_update()
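convert_one_hot is defined elsewhere in this project; the sketch below is an assumption that matches how it is used above (an action index in [0, size) mapped to a length-size list with a single 1, which can then be extend-ed into the flat observation):

def convert_one_hot(index: int, size: int) -> list:
    # e.g. convert_one_hot(2, 7) -> [0, 0, 1, 0, 0, 0, 0]
    one_hot = [0] * size
    one_hot[index] = 1
    return one_hot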
Example #15
0
File: main.py  Project: Bruno-33/processor
def main():
    torch.set_num_threads(1)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    # Environment stuffs

    envs = []
    for i in range(args.num_processes):
        if args.scene_dir:
            scene_dir = os.path.join(args.scene_dir,
                                     "seed{}".format(args.seed + i))
            assert os.path.exists(scene_dir)
        else:
            scene_dir = None
        envs.append(
            make_env(args.env_name, args.seed, i, log_path, args.add_timestep,
                     scene_dir))

    # Hack information out of the gym environment
    tmp_env = envs[0]()
    sensor_type = tmp_env.unwrapped.hp_sensing_mode
    num_agent = tmp_env.unwrapped.hp_uav_n
    dim = tmp_env.unwrapped.hp_dim
    # Shape of o_env for each agent, required by the observation feature extraction module of the model
    if sensor_type == "lidar":
        atom_o_env_shape = tmp_env.unwrapped.hp_lidar_n + dim
    elif sensor_type == "pos":
        atom_o_env_shape = (dim + 1) * tmp_env.unwrapped.hp_n_nearest_obs
    else:
        raise Exception(
            "No implementation for sensing mode {}".format(sensor_type))

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        if not args.unordered:
            envs = VecNormalize(
                envs, gamma=args.gamma
            )  # Different observation normalization factors for different agents
        else:
            envs = VecNormalize(envs, gamma=args.gamma, num_agent=num_agent)

    num_subagents = num_agent if args.indep else 1  # The way you view the robot team (i.e., a virtual structure or many robots)
    obs_shape = envs.observation_space.shape
    atom_obs_shape = (obs_shape[0] // num_subagents * args.num_stack,
                      *obs_shape[1:])  # Shape for each logical agent

    action_shape = envs.action_space.shape
    atom_action_shape = (action_shape[0] // num_subagents, *action_shape[1:])

    # Agent stuffs (core elements of PPO)

    if args.load_dir:  # Resume from breakpoint
        print("Loading model parameters from: " + args.load_dir)
        actor_critic, ob_rms, ret_rms = torch.load(args.load_dir)
        assert envs.ob_rms.mean.shape == ob_rms.mean.shape, "Mismatched observation shape, which may be induced by wrong flags (e.g., --unordered / --num_stack)"
        envs.ob_rms = ob_rms
        envs.ret_rms = ret_rms
    else:
        actor_critic = Policy(atom_obs_shape, atom_action_shape, sensor_type,
                              atom_o_env_shape, dim, num_agent, args.unordered,
                              args.indep, args.sigmoid, args.share,
                              args.no_rnn)

    if args.cuda:
        actor_critic.cuda()

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    rollouts = [
        RolloutStorage(args.num_steps, args.num_processes, atom_obs_shape,
                       atom_action_shape, actor_critic.state_size)
        for _ in range(num_subagents)
    ]

    # Auxiliary stuffs

    current_obs = [
        torch.zeros(args.num_processes, *atom_obs_shape)
        for _ in range(num_subagents)
    ]

    # Stack sequent observations to get current_obs, using the trick of reshaping.
    #
    # current_obs
    # Index         |1           |2           |3
    # Observation   |a1 a2 a3    |b1 b2 b3    |c1 c2 c3
    def update_current_obs(obs, idx):
        nonlocal current_obs
        shape_dim0 = atom_obs_shape[0] // args.num_stack
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[idx][:, :-shape_dim0] = current_obs[idx][:,
                                                                 shape_dim0:]
        current_obs[idx][:, -shape_dim0:] = obs

    obs = envs.reset()
    for i in range(num_subagents):
        update_current_obs(
            obs[:, i * atom_obs_shape[0]:(i + 1) * atom_obs_shape[0]], i)
        rollouts[i].observations[0].copy_(current_obs[i])

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        for i in range(num_subagents):
            current_obs[i] = current_obs[i].cuda()
            rollouts[i].cuda()

    # Main loop

    train_start = datetime.datetime.now()
    print("Training starts at: {}".format(train_start))
    env_time = 0.  # time cost of interaction with environment
    env_compute_time = 0.
    env_step_time = 0.
    env_rollout_time = 0.
    update_time = 0.  # time cost of updating parameters
    log_time = 0.  # time cost of logging

    for j in range(num_updates):
        # Interact with the environment

        start_env_time = time.time()  # Timer

        for step in range(args.num_steps):
            start_env_compute_time = time.time()

            # Sample actions
            with torch.no_grad():
                l_value, l_action, l_action_log_prob, l_states = [], [], [], []
                for i in range(num_subagents):
                    value, action, action_log_prob, states = actor_critic.act(
                        rollouts[i].observations[step],
                        rollouts[i].states[step], rollouts[i].masks[step])
                    l_value.append(value)
                    l_action.append(action)
                    l_action_log_prob.append(action_log_prob)
                    l_states.append(states)
                action = torch.cat(l_action, dim=1)

            cpu_actions = action.squeeze(1).cpu().numpy()

            env_compute_time += time.time() - start_env_compute_time

            start_env_step_time = time.time()

            obs, reward, done, info = envs.step(cpu_actions)

            env_step_time += time.time() - start_env_step_time

            start_env_rollout_time = time.time()

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # final_rewards is the accumulated reward of the last trajectory; episode_rewards is an auxiliary variable.
            # The motivation is to enable logging at an arbitrary time step.
            final_rewards *= masks
            final_rewards += (
                1 - masks
            ) * episode_rewards  # If not done, mask=1, final_rewards doesn't change
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            for i in range(num_subagents):
                current_obs[i] *= masks  # Useful when args.num_stack > 1
                update_current_obs(
                    obs[:, i * atom_obs_shape[0]:(i + 1) * atom_obs_shape[0]],
                    i)
                rollouts[i].insert(current_obs[i], l_states[i], l_action[i],
                                   l_action_log_prob[i], l_value[i], reward,
                                   masks)

            env_rollout_time += time.time() - start_env_rollout_time

        env_time += time.time() - start_env_time

        # Update parameters

        start_update_time = time.time()  # Timer

        for i in range(num_subagents):
            with torch.no_grad():
                next_value = actor_critic.get_value(
                    rollouts[i].observations[-1], rollouts[i].states[-1],
                    rollouts[i].masks[-1]).detach()

            rollouts[i].compute_returns(next_value, args.use_gae, args.gamma,
                                        args.tau)

            value_loss, action_loss, dist_entropy = agent.update(rollouts[i])

            rollouts[i].after_update()

        update_time += time.time() - start_update_time

        # Logging

        start_log_time = time.time()  # Timer

        # Save models
        if j % args.save_interval == 0 or j == num_updates - 1:
            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None,
                hasattr(envs, 'ret_rms') and envs.ret_rms or None
            ]

            torch.save(save_model,
                       os.path.join(model_path, "model" + str(j) + ".pt"))

        # For logging training information
        if j % args.log_interval == 0 or j == num_updates - 1:
            log_env_time = []
            for i, info_i in enumerate(info):
                log_reset_i = "            Average reset time for env{}: {:.1f}ms = {:.1f}h / {}".format(
                    i, info_i['reset_time'] * 1000 / info_i['reset_num'],
                    info_i['reset_time'] / 3600, info_i['reset_num'])
                log_step_i = "            Average step time for env{}: {:.1f}ms = {:.1f}h / {}".format(
                    i, info_i['step_time'] * 1000 / info_i['step_num'],
                    info_i['step_time'] / 3600, info_i['step_num'])
                log_env_time.append(log_reset_i)
                log_env_time.append(log_step_i)
            log_env_time = '\n'.join(log_env_time)

            current_time = datetime.datetime.now()

            summary = '\n'.join([
                "Training starts at: {}".format(train_start),
                "Current time: {}".format(current_time),
                "Elapsed time: {}".format(current_time - train_start),
                "    Environment interaction: {:.1f}h".format(
                    env_time / 3600), "        Compute action: {:.1f}h".format(
                        env_compute_time / 3600),
                "        Rollout: {:.1f}h".format(env_rollout_time / 3600),
                "        Interaction with gym: {:.1f}h".format(
                    env_step_time / 3600), log_env_time,
                "    Parameters update: {:.1f}h".format(update_time / 3600),
                "    logging: {:.1f}h".format(log_time / 3600)
            ]) + '\n'

            # Write down summary of the training
            with open(os.path.join(root_path, "summary.txt"), 'w') as f:
                f.write(summary)

        # For Visdom visualization
        if args.vis and (j % args.vis_interval == 0 or j == num_updates - 1):
            # Sometimes monitor doesn't properly flush the outputs
            win = visdom_plot(viz,
                              win,
                              args.vis_env,
                              log_path,
                              title,
                              args.algo,
                              args.num_frames,
                              save_dir=root_path)
            viz.save([args.vis_env])

        log_time += time.time() - start_log_time

    print(summary)
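The comment table inside update_current_obs above describes the frame-stacking trick. The self-contained toy example below replays it with num_stack = 3, a 2-dimensional observation and a single process (all names here are illustrative, not project code):

import torch

num_stack, obs_dim = 3, 2
current = torch.zeros(1, num_stack * obs_dim)
for t, new_obs in enumerate([torch.tensor([[1., 1.]]), torch.tensor([[2., 2.]]),
                             torch.tensor([[3., 3.]]), torch.tensor([[4., 4.]])]):
    current[:, :-obs_dim] = current[:, obs_dim:].clone()  # shift the older frames left
    current[:, -obs_dim:] = new_obs                       # write the newest frame into the last slot
    print(t, current)
# After the fourth step the buffer holds the three most recent frames:
# tensor([[2., 2., 3., 3., 4., 4.]])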
Example #16
0
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    torch.set_num_threads(1)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step], rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(), dist_entropy,
                        value_loss, action_loss))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
Example #17
0
File: main.py  Project: liuwenhaha/gym-city
    def __init__(self):
        import random
        import gym_city
        import game_of_life

        self.fieldnames = self.get_fieldnames()

        args = get_args()
        args.log_dir = args.save_dir + '/logs'
        assert args.algo in ['a2c', 'ppo', 'acktr']
        if args.recurrent_policy:
            assert args.algo in ['a2c', 'ppo'], \
                    'Recurrent policy is not implemented for ACKTR'

        num_updates = int(args.num_frames) // args.num_steps // args.num_processes

        torch.manual_seed(args.seed)
        if args.cuda:
            print('CUDA ENABLED')
            torch.cuda.manual_seed(args.seed)

        graph_name = args.save_dir.split('trained_models/')[1].replace('/', ' ')
        self.graph_name = graph_name

        actor_critic = False
        agent = False
        past_steps = 0
        try:
            os.makedirs(args.log_dir)
        except OSError:
            files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
            for f in files:
                if args.overwrite:
                    os.remove(f)
                else:
                    pass
        torch.set_num_threads(1)
        device = torch.device("cuda:0" if args.cuda else "cpu")
        self.device = device

        if args.vis:
            from visdom import Visdom
            viz = Visdom(port=args.port)
            self.viz = viz
            win = None
            self.win = win
            win_eval = None
            self.win_eval = win_eval
        if 'GameOfLife' in args.env_name:
            print('env name: {}'.format(args.env_name))
            num_actions = 1
        envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                args.gamma, args.log_dir, args.add_timestep, device, False, None,
                args=args)

        if isinstance(envs.observation_space, gym.spaces.Discrete):
            num_inputs = envs.observation_space.n
        elif isinstance(envs.observation_space, gym.spaces.Box):
            if 'golmulti' in args.env_name.lower():
                multi_env = True
                observation_space_shape = envs.observation_space.shape[1:]
            else:
                multi_env = False
                observation_space_shape = envs.observation_space.shape
            self.multi_env = multi_env
            if len(observation_space_shape) == 3:
                in_w = observation_space_shape[1]
                in_h = observation_space_shape[2]
            else:
                in_w = 1
                in_h = 1
            num_inputs = observation_space_shape[0]
        if isinstance(envs.action_space, gym.spaces.Discrete) or\
            isinstance(envs.action_space, gym.spaces.Box):
            out_w = args.map_width
            out_h = args.map_width
            if 'Micropolis' in args.env_name: #otherwise it's set
                if args.power_puzzle:
                    num_actions = 1
                else:
                    num_actions = 19 # TODO: have this already from env
            elif 'GameOfLife' in args.env_name:
                num_actions = 1
            else:
                num_actions = envs.action_space.n
        elif isinstance(envs.action_space, gym.spaces.Box):
            if len(envs.action_space.shape) == 3:
                out_w = envs.action_space.shape[1]
                out_h = envs.action_space.shape[2]
            elif len(envs.action_space.shape) == 1:
                out_w = 1
                out_h = 1
            num_actions = envs.action_space.shape[-1]
        print('num actions {}'.format(num_actions))

        if args.auto_expand:
            args.n_recs -= 1
        actor_critic = Policy(envs.observation_space.shape, envs.action_space,
            base_kwargs={'map_width': args.map_width, 'num_actions': num_actions,
                'recurrent': args.recurrent_policy, 'prebuild': args.prebuild,
                'in_w': in_w, 'in_h': in_h, 'num_inputs': num_inputs,
                'out_w': out_w, 'out_h': out_h},
                         curiosity=args.curiosity, algo=args.algo,
                         model=args.model, args=args)
        if args.auto_expand:
            args.n_recs += 1

        evaluator = None
        self.evaluator = evaluator

        if not agent:
            agent = init_agent(actor_critic, args)

        vec_norm = get_vec_normalize(envs)
        self.vec_norm = vec_norm
       #saved_model = os.path.join(args.save_dir, args.env_name + '.pt')
        if args.load_dir:
            saved_model = os.path.join(args.load_dir, args.env_name + '.tar')
        else:
            saved_model = os.path.join(args.save_dir, args.env_name + '.tar')
        self.checkpoint = None
        if os.path.exists(saved_model) and not args.overwrite:
            checkpoint = torch.load(saved_model)
            self.checkpoint = checkpoint
            saved_args = checkpoint['args']
            actor_critic.load_state_dict(checkpoint['model_state_dict'])
           #for o, l in zip(agent.optimizer.state_dict, checkpoint['optimizer_state_dict']):
           #    print(o, l)
           #print(agent.optimizer.state_dict()['param_groups'])
           #print('\n')
           #print(checkpoint['model_state_dict'])
            actor_critic.to(self.device)
           #actor_critic.cuda()
           #agent = init_agent(actor_critic, saved_args)
            agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            if args.auto_expand:
                if not args.n_recs - saved_args.n_recs == 1:
                    print('can expand by 1 rec only from saved model, not {}'.format(args.n_recs - saved_args.n_recs))
                    raise Exception
                actor_critic.base.auto_expand()
                print('expanded net: \n{}'.format(actor_critic.base))
            past_steps = checkpoint['past_steps']
            ob_rms = checkpoint['ob_rms']
            past_steps = next(iter(agent.optimizer.state_dict()['state'].values()))['step']
            print('Resuming from step {}'.format(past_steps))

           #print(type(next(iter((torch.load(saved_model))))))
           #actor_critic, ob_rms = \
           #        torch.load(saved_model)
           #agent = \
           #    torch.load(os.path.join(args.save_dir, args.env_name + '_agent.pt'))
           #if not agent.optimizer.state_dict()['state'].values():
           #    past_steps = 0
           #else:

           #    raise Exception

            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = ob_rms
            saved_args.num_frames = args.num_frames
            saved_args.vis_interval = args.vis_interval
            saved_args.eval_interval = args.eval_interval
            saved_args.overwrite = args.overwrite
            saved_args.n_recs = args.n_recs
            saved_args.intra_shr = args.intra_shr
            saved_args.inter_shr = args.inter_shr
            saved_args.map_width = args.map_width
            saved_args.render = args.render
            saved_args.print_map = args.print_map
            saved_args.load_dir = args.load_dir
            saved_args.experiment_name = args.experiment_name
            saved_args.log_dir = args.log_dir
            saved_args.save_dir = args.save_dir
            saved_args.num_processes = args.num_processes
            saved_args.n_chan = args.n_chan
            saved_args.prebuild = args.prebuild
            args = saved_args
        actor_critic.to(device)

        if 'LSTM' in args.model:
            recurrent_hidden_state_size = actor_critic.base.get_recurrent_state_size()
        else:
            recurrent_hidden_state_size = actor_critic.recurrent_hidden_state_size
        if args.curiosity:
            rollouts = CuriosityRolloutStorage(args.num_steps, args.num_processes,
                                envs.observation_space.shape, envs.action_space,
                                recurrent_hidden_state_size, actor_critic.base.feature_state_size(), args=args)
        else:
            rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                envs.observation_space.shape, envs.action_space,
                                recurrent_hidden_state_size, args=args)

        obs = envs.reset()
        rollouts.obs[0].copy_(obs)
        rollouts.to(device)

        episode_rewards = deque(maxlen=10)

        start = time.time()
        self.model = model = actor_critic.base
        self.reset_eval = False
        plotter = None
        env_param_bounds = envs.get_param_bounds()
        # in case we want to change this dynamically in the future (e.g., we may
        # not know how much traffic the agent can possibly produce in Micropolis)
        envs.set_param_bounds(env_param_bounds) # start with default bounds

        if args.model == 'FractalNet' or args.model == 'fractal':
            n_cols = model.n_cols
            if args.rule == 'wide1' and args.n_recs > 3:
                col_step = 3
            else:
                col_step = 1
        else:
            n_cols = 0
            col_step = 1
        self.col_step = col_step
        self.past_steps = past_steps
        self.num_updates = num_updates
        self.envs = envs
        self.start = start
        self.rollouts = rollouts
        self.args = args
        self.actor_critic = actor_critic
        self.plotter = plotter
        self.agent = agent
        self.episode_rewards = episode_rewards
        self.n_cols = n_cols
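The resume branch above expects a .tar checkpoint dictionary with the keys it reads ('model_state_dict', 'optimizer_state_dict', 'past_steps', 'ob_rms', 'args'). The matching save call lives elsewhere in main.py; a hedged sketch of what it plausibly looks like, reusing names from the code above:

# Hedged sketch only; the project's actual save code may differ.
vec_norm = get_vec_normalize(envs)
torch.save({
    'past_steps': past_steps,
    'model_state_dict': actor_critic.state_dict(),
    'optimizer_state_dict': agent.optimizer.state_dict(),
    'ob_rms': vec_norm.ob_rms if vec_norm is not None else None,
    'args': args,
}, os.path.join(args.save_dir, args.env_name + '.tar'))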
Example #18
0
File: agent.py  Project: lyp741/Swarm-VDN
class Agent():
    def __init__(self, args):
        self.buffer_size = int(1e5)
        self.batch_size = 32
        self.num_agents = 0
        self.num_of_actions = 9
        self.model = []
        self.buffer = []
        self.time = 0
        self.gamma = 0.95
        self.episode_length = 10000
        self.args = args
        self.time_now = datetime.datetime.now().strftime('%Y-%m-%d')
        try:
            os.mkdir(self.time_now)
        except OSError:
            pass

    def init(self, obs):
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(self.device)
        self.num_agents = len(obs['image'])
        self.buffer = ReplayBuffer(self.buffer_size, self.batch_size, self.num_agents)
        self.model = Policy(self.num_of_actions).to(self.device)
        if self.args.model != 'None':
            self.load_model(self.args.model)
        # self.load_model('2019-07-09/30_160600')
        self.target = Policy(self.num_of_actions).to(self.device)
        self.update_target()
        self.optimizer = optim.Adam(self.model.parameters())
        self.last_state_cnn = np.zeros((self.num_agents,3,128,128))
        self.last_state_oth = np.zeros((self.num_agents, 11))
        self.last_action = np.zeros((self.num_agents, 1))

    def get_obs_cnn(self, obs):
        temp = []
        for i in range(len(obs["image"])):
            temp.append(np.r_[obs["image"][i]])
        temp = np.r_[temp]
        t = np.transpose(temp, (0,3,1,2))
        # t /= 255.0
        return t

    def get_obs_oth(self, obs):
        temp = []
        # change in another network structure
        for i in range(len(obs["ir"])):
            temp.append(np.r_[obs["ir"][i],
                              obs["gyro"][i],
                              obs["target"][i]])
        t = np.r_[temp]
        return t

    def get_new_cnn(self, t):
        t = np.concatenate((self.last_state_cnn, t), axis=1)
        return t

    def get_new_oth(self,t):
        t = np.concatenate((self.last_state_oth, t), axis=1)
        return t

    def update_target(self):
        self.target.load_state_dict(self.model.state_dict())

    def get_action(self, obs, epsilon, done):
        if self.num_agents == 0:
            self.init(obs)
        state_cnn = self.get_obs_cnn(obs)
        state_oth = self.get_obs_oth(obs)
        cat_cnn = self.get_new_cnn(state_cnn)
        cat_oth = self.get_new_oth(state_oth)
        q = self.model(cat_cnn,cat_oth)
        actions = q.max(1)[1]
        index_action = np.zeros((self.num_agents,), dtype=np.uint8)
        for i in range(self.num_agents):
            if random.random() > epsilon:
                index_action[i] = random.randint(0, self.num_of_actions - 1)
            else:
                index_action[i] = actions[i].item()

        if not done.item(0):
            # episode still running: cache the current state/action for the next step
            self.last_state_cnn = state_cnn
            self.last_state_oth = state_oth
            self.last_action = index_action
        else:
            # episode finished: reset the cached previous state/action
            self.last_state_cnn = np.zeros((self.num_agents, 3, 128, 128))
            self.last_state_oth = np.zeros((self.num_agents, 11))
            self.last_action = np.zeros((self.num_agents, 1))
        return index_action

    def learn(self):
        self.time += 1
        if len(self.buffer) < self.batch_size*self.num_agents:
            return

        state_cnn, state_oth, action, reward, next_cnn, next_oth, done = self.buffer.sample()

        # max_q = self.target(next_cnn, next_oth).max(1)[0].unsqueeze(1)
        pred_q = self.model(state_cnn, state_oth)
        pred_q = pred_q.gather(1, action.view(-1).unsqueeze(1).long())
        target_chosen_actions = self.model(next_cnn, next_oth).max(1)[1].unsqueeze(1)
        max_q = self.target(next_cnn, next_oth).gather(1, target_chosen_actions)
        reward = reward.view(-1,1)

        true_q = reward + (1 - done) * self.gamma * max_q.detach()
        criterion = nn.MSELoss()
        loss = criterion(pred_q, true_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.model.reset_noise()
        self.target.reset_noise()
        if self.time % 10 == 0:
            self.update_target()
        if self.time % 100 == 0:
            self.save_model(self.time_now + '/' + str(self.num_agents) + '_' + str(self.time))

    def store_experience(self, obs, action, reward, done):
        state_cnn = self.get_obs_cnn(obs)
        state_oth = self.get_obs_oth(obs)
        self.buffer.add(state_cnn, state_oth, action, reward, done)

    def save_model(self, filename):
        # filename = './' + str(self.num_agents)
        torch.save(self.model.state_dict(), filename)

    def load_model(self, filename):
        # filename = './' + self.num_agents
        self.model.load_state_dict(torch.load(filename))
        self.model.eval()
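learn() above computes a Double-DQN target: the online network selects the greedy next action and the target network evaluates it, y = r + gamma * (1 - done) * Q_target(s', argmax_a Q_online(s', a)). A tiny self-contained illustration with dummy Q-values (not project code):

import torch

gamma = 0.95
q_online_next = torch.tensor([[1.0, 3.0, 2.0]])  # online Q-values for the next state
q_target_next = torch.tensor([[0.5, 1.5, 4.0]])  # target-network Q-values for the next state
reward = torch.tensor([[1.0]])
done = torch.tensor([[0.0]])

greedy_a = q_online_next.max(1)[1].unsqueeze(1)                       # index 1, chosen by the online net
y = reward + (1 - done) * gamma * q_target_next.gather(1, greedy_a)   # evaluated by the target net
print(y)  # tensor([[2.4250]]) = 1 + 0.95 * 1.5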
Example #19
0
class PPOCarla(Agent):
    def __init__(self,
                 obs_converter,
                 action_converter,
                 clip_param,
                 ppo_epoch,
                 num_mini_batch,
                 value_loss_coef,
                 entropy_coef,
                 lr=None,
                 eps=None,
                 max_grad_norm=None,
                 use_clipped_value_loss=False):

        self.obs_converter = obs_converter
        self.action_converter = action_converter
        self.model = Policy(
            self.obs_converter.get_observation_space(),
            self.action_converter.get_action_space()).to("cuda:0")

        self.clip_param = clip_param
        self.ppo_epoch = ppo_epoch
        self.num_mini_batch = num_mini_batch

        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef

        self.max_grad_norm = max_grad_norm
        self.use_clipped_value_loss = use_clipped_value_loss

        self.optimizer = optim.Adam(self.model.parameters(), lr=lr, eps=eps)

    def update(self, rollouts):
        advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
        advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                         1e-5)

        value_loss_epoch = 0
        action_loss_epoch = 0
        dist_entropy_epoch = 0

        for e in range(self.ppo_epoch):
            if self.model.is_recurrent:
                data_generator = rollouts.recurrent_generator(
                    advantages, self.num_mini_batch)
            else:
                data_generator = rollouts.feed_forward_generator(
                    advantages, self.num_mini_batch)

            for sample in data_generator:
                obs_batch, recurrent_hidden_states_batch, actions_batch, \
                   value_preds_batch, return_batch, masks_batch, old_action_log_probs_batch, \
                        adv_targ = sample

                # Reshape to do in a single forward pass for all steps
                values, action_log_probs, dist_entropy, _ = self.model.evaluate_actions(
                    obs_batch['img'], obs_batch['v'],
                    recurrent_hidden_states_batch, masks_batch, actions_batch)

                ratio = torch.exp(action_log_probs -
                                  old_action_log_probs_batch)
                surr1 = ratio * adv_targ
                surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                    1.0 + self.clip_param) * adv_targ
                action_loss = -torch.min(surr1, surr2).mean()

                if self.use_clipped_value_loss:
                    value_pred_clipped = value_preds_batch + \
                        (values - value_preds_batch).clamp(-self.clip_param, self.clip_param)
                    value_losses = (values - return_batch).pow(2)
                    value_losses_clipped = (value_pred_clipped -
                                            return_batch).pow(2)
                    value_loss = .5 * torch.max(value_losses,
                                                value_losses_clipped).mean()
                else:
                    value_loss = 0.5 * F.mse_loss(return_batch, values)

                self.optimizer.zero_grad()
                (value_loss * self.value_loss_coef + action_loss -
                 dist_entropy * self.entropy_coef).backward()
                nn.utils.clip_grad_norm_(self.model.parameters(),
                                         self.max_grad_norm)
                self.optimizer.step()

                value_loss_epoch += value_loss.item()
                action_loss_epoch += action_loss.item()
                dist_entropy_epoch += dist_entropy.item()

        num_updates = self.ppo_epoch * self.num_mini_batch

        value_loss_epoch /= num_updates
        action_loss_epoch /= num_updates
        dist_entropy_epoch /= num_updates

        return value_loss_epoch, action_loss_epoch, dist_entropy_epoch

    def act(self, inputs, rnn_hxs, masks, deterministic=False):
        eps_curr = 0.  # TODO: Change if you want to implement epsilon-greedy
        return self.model.act(inputs['img'],
                              inputs['v'],
                              rnn_hxs,
                              masks,
                              eps_curr,
                              deterministic=deterministic)

    def get_value(self, inputs, rnn_hxs, masks):
        return self.model.get_value(inputs['img'], inputs['v'], rnn_hxs, masks)
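update() above is the standard PPO clipped surrogate. The short numeric illustration below (dummy values, not project code) shows how the clamp limits the contribution of large probability ratios:

import torch

clip_param = 0.2
ratio = torch.tensor([0.5, 1.0, 1.5])  # pi_new / pi_old for three sampled actions
adv = torch.tensor([1.0, 1.0, 1.0])    # positive advantages

surr1 = ratio * adv
surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
action_loss = -torch.min(surr1, surr2).mean()
print(surr2)        # tensor([0.8000, 1.0000, 1.2000]): the 1.5 ratio is clipped to 1.2
print(action_loss)  # tensor(-0.9000) = -(0.5 + 1.0 + 1.2) / 3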
Example #20
0
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    """
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
    """

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                        args.gamma, args.log_dir, args.add_timestep, device, False, args.ep_max_step)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.useNeural:
        #FLAGS = update_tf_wrapper_args(args,)
        tf_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        tf_config.gpu_options.allow_growth = True
        sess = tf.Session(config=tf_config)
        pixel_bonus = PixelBonus(FLAGS, sess)
        tf.initialize_all_variables().run(session=sess)
        if args.loadNeural is not None:
            pixel_bonus.load_model(args.loadNeural)

        #with tf.variable_scope('step'):
        #    self.step_op = tf.Variable(0, trainable=False, name='step')
        #    self.step_input = tf.placeholder('int32', None, name='step_input')
        #    self.step_assign_op = self.step_op.assign(self.step_input)


    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                               eps=args.eps,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                        envs.observation_space.shape, envs.action_space,
                        actor_critic.recurrent_hidden_state_size)


    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=100)

    steper = 0  # running step counter fed to pixel_bonus.bonus
    img_scale = 1
    psc_weight = float(args.pscWeight)  # weight of the exploration bonus
    psc_rollout = list()

    start = time.time()
    for j in range(num_updates):
        step_counter = 0
        psc_tot = list()  # per-step exploration bonuses for this rollout
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            psc_add = 0
            if args.useNeural:
                # Compute a pixel-based exploration bonus from each frame of the
                # first process' stacked observation, downscaled to 42x42.
                for i in obs[0]:
                    frame = imresize((i / img_scale).cpu().numpy(), (42, 42), order=1)
                    psc_add += pixel_bonus.bonus(frame, steper)
                    steper += 1

                # Average the bonus over the stacked frames.
                psc_add = psc_add / 12

            psc_tot.append(psc_add)


            """
            for info in infos:
                if 'episode' in info.keys():
                    print(reward)
                    episode_rewards.append(info['episode']['r'])
            """

            # FIXME: works only for environments with sparse rewards
            for idx, eps_done in enumerate(done):
                if eps_done:
                    episode_rewards.append(reward[idx])

            psc_add = torch.tensor(psc_add, requires_grad=True, dtype=torch.float)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, psc=psc_add)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts, psc_tot, psc_weight)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            print('Saving model')
            print()

            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model, getattr(envs.venv, 'ob_rms', None)]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n".
                format(
                    j, total_num_steps,
                    int(total_num_steps / (end - start)),
                    len(episode_rewards),
                    np.mean(episode_rewards),
                    np.median(episode_rewards),
                    np.min(episode_rewards),
                    np.max(episode_rewards),
                    np.count_nonzero(np.greater(episode_rewards, 0)) / len(episode_rewards)
                )
            )

        if args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0:
            eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes,
                                args.gamma, eval_log_dir, args.add_timestep, device, True)

            if eval_envs.venv.__class__.__name__ == "VecNormalize":
                eval_envs.venv.ob_rms = envs.venv.ob_rms

                # An ugly hack to remove updates
                def _obfilt(self, obs):
                    if self.ob_rms:
                        obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
                        return obs
                    else:
                        return obs

                eval_envs.venv._obfilt = types.MethodType(_obfilt, envs.venv)

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(args.num_processes,
                            actor_critic.recurrent_hidden_state_size, device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards),
                np.mean(eval_episode_rewards)
            ))
    if args.useNeural:
        pixel_bonus.save_model(str(args.nameDemonstrator) + "neural", step)
Example #21
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    """
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
    """

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                        args.gamma, args.log_dir, args.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)


    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                               eps=args.eps,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                        envs.observation_space.shape, envs.action_space,
                        actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=100)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            """
            for info in infos:
                if 'episode' in info.keys():
                    print(reward)
                    episode_rewards.append(info['episode']['r'])
            """

            # FIXME: works only for environments with sparse rewards
            for idx, eps_done in enumerate(done):
                if eps_done:
                    episode_rewards.append(reward[idx])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            print('Saving model')
            print()

            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model, getattr(envs.venv, 'ob_rms', None)]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n".
                format(
                    j, total_num_steps,
                    int(total_num_steps / (end - start)),
                    len(episode_rewards),
                    np.mean(episode_rewards),
                    np.median(episode_rewards),
                    np.min(episode_rewards),
                    np.max(episode_rewards),
                    np.count_nonzero(np.greater(episode_rewards, 0)) / len(episode_rewards)
                )
            )
            # NOTE: assumes save_path was defined above, i.e. args.save_dir is non-empty.
            with open(os.path.join(save_path, 'TrainingStats_file.csv'), mode='a') as train_file:
                train_writer = csv.writer(train_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

                train_writer.writerow([j, total_num_steps,
                    int(total_num_steps / (end - start)),
                    len(episode_rewards),
                    np.mean(episode_rewards),
                    np.median(episode_rewards),
                    np.min(episode_rewards),
                    np.max(episode_rewards),
                    np.count_nonzero(np.greater(episode_rewards, 0)) / len(episode_rewards)])

        if args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0:
            eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes,
                                args.gamma, eval_log_dir, args.add_timestep, device, True)

            if eval_envs.venv.__class__.__name__ == "VecNormalize":
                eval_envs.venv.ob_rms = envs.venv.ob_rms

                # An ugly hack to remove updates
                def _obfilt(self, obs):
                    if self.ob_rms:
                        obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
                        return obs
                    else:
                        return obs

                eval_envs.venv._obfilt = types.MethodType(_obfilt, envs.venv)

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(args.num_processes,
                            actor_critic.recurrent_hidden_state_size, device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards),
                np.mean(eval_episode_rewards)
            ))

        """
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
        """

    envs.close()
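
rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) is defined in RolloutStorage, which is not reproduced here. A minimal sketch of generalized advantage estimation under the same interface, purely for illustration (the tensor shapes and mask convention are assumptions):

import torch

def compute_gae_returns(rewards, values, masks, next_value, gamma=0.99, tau=0.95):
    # Shapes (assumed): rewards, values, masks -> [num_steps, num_processes, 1];
    # next_value -> [num_processes, 1]. masks[t] is 0.0 where the episode ended at step t.
    values = torch.cat([values, next_value.unsqueeze(0)], dim=0)
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for step in reversed(range(rewards.size(0))):
        # TD residual; the mask cuts the bootstrap across episode boundaries.
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns[step] = gae + values[step]
    return returns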
Example #22
    def __init__(self, args, im_log_dir):
        self.im_log_dir = im_log_dir
        self.log_dir = args.load_dir
        env_name = args.env_name
        if torch.cuda.is_available() and not args.no_cuda:
            args.cuda = True
            device = torch.device('cuda')
            map_location = torch.device('cuda')
        else:
            args.cuda = False
            device = torch.device('cpu')
            map_location = torch.device('cpu')
        try:
            checkpoint = torch.load(os.path.join(args.load_dir, env_name + '.tar'),
                                    map_location=map_location)
        except FileNotFoundError:
            # Fall back to the env name supplied on the command line.
            print('load-dir does not contain a checkpoint named after a valid gym '
                  'environment id; using the command-line env name instead')
            env_name = args.env_name
            checkpoint = torch.load(os.path.join(args.load_dir, env_name + '.tar'),
                                    map_location=map_location)
        saved_args = checkpoint['args']
        past_frames = checkpoint['n_frames']
        args.past_frames = past_frames
        env_name = saved_args.env_name

        if 'Micropolis' in env_name:
            args.power_puzzle = saved_args.power_puzzle

        if not args.evaluate and 'GoLMulti' not in env_name:
            # assume we just want to observe/interact w/ a single env.
            args.num_proc = 1
        dummy_args = args
        envs = make_vec_envs(env_name, args.seed + 1000, args.num_processes,
                            None, args.load_dir, args.add_timestep, device=device,
                            allow_early_resets=False,
                            args=dummy_args)
        print(args.load_dir)

        if isinstance(envs.observation_space, gym.spaces.Discrete):
            in_w = 1
            in_h = 1
            num_inputs = envs.observation_space.n
        elif isinstance(envs.observation_space, gym.spaces.Box):
            if len(envs.observation_space.shape) == 3:
                in_w = envs.observation_space.shape[1]
                in_h = envs.observation_space.shape[2]
            else:
                in_w = 1
                in_h = 1
            num_inputs = envs.observation_space.shape[0]

        if isinstance(envs.action_space, gym.spaces.Discrete):
            out_w = 1
            out_h = 1
            num_actions = int(envs.action_space.n // (in_w * in_h))
            #if 'Micropolis' in env_name:
            #    num_actions = env.venv.venv.envs[0].num_tools
            #elif 'GameOfLife' in env_name:
            #    num_actions = 1
            #else:
            #    num_actions = env.action_space.n
        elif isinstance(envs.action_space, gym.spaces.Box):
            out_w = envs.action_space.shape[0]
            out_h = envs.action_space.shape[1]
            num_actions = envs.action_space.shape[-1]
        # We need to use the same statistics for normalization as used in training
        #actor_critic, ob_rms = \
        #            torch.load(os.path.join(args.load_dir, args.env_name + ".pt"))

        if saved_args.model == 'fractal':
            saved_args.model = 'FractalNet'
        actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                base_kwargs={'map_width': args.map_width,
                             'recurrent': args.recurrent_policy,
                            'in_w': in_w, 'in_h': in_h, 'num_inputs': num_inputs,
                    'out_w': out_w, 'out_h': out_h },
                             curiosity=args.curiosity, algo=saved_args.algo,
                             model=saved_args.model, args=saved_args)
        actor_critic.to(device)
        torch.nn.Module.dump_patches = True
        actor_critic.load_state_dict(checkpoint['model_state_dict'])
        ob_rms = checkpoint['ob_rms']

        if 'fractal' in args.model.lower():
            new_recs = args.n_recs - saved_args.n_recs

            for nr in range(new_recs):
                actor_critic.base.auto_expand()
            print('expanded network:\n', actor_critic.base)

            if args.active_column is not None \
                    and hasattr(actor_critic.base, 'set_active_column'):
                actor_critic.base.set_active_column(args.active_column)
        vec_norm = get_vec_normalize(envs)

        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms
        self.actor_critic = actor_critic
        self.envs = envs
        self.args = args
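
The checkpoint layout used above (a .tar file holding 'model_state_dict', 'ob_rms', 'args' and 'n_frames') is specific to this project; assuming that layout, the loading and normalization-restoring pattern reduces to a short hedged sketch:

import torch

def load_checkpoint(path, actor_critic, vec_norm=None, use_cuda=False):
    # Map saved tensors onto whichever device is actually available.
    map_location = torch.device('cuda') if use_cuda and torch.cuda.is_available() else torch.device('cpu')
    checkpoint = torch.load(path, map_location=map_location)

    # Restore the policy weights.
    actor_critic.load_state_dict(checkpoint['model_state_dict'])

    # Reuse the observation-normalisation statistics gathered during training
    # and freeze them for evaluation.
    if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = checkpoint['ob_rms']
    return checkpoint['args'], checkpoint['n_frames']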
Example #23
def gen_frequencies():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, 1, args.gammas[-1], None,
                         args.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy,
                              'num_values': args.num_values,
                              'sum_values': args.sum_values
                          })

    state_dict = torch.load(args.log_dir + '/ppo/' + args.env_name + '.pt')
    actor_critic.load_state_dict(state_dict[0].state_dict())
    actor_critic.to(device)

    rollouts = RolloutStorage(1,
                              1,
                              envs.observation_space.shape,
                              envs.action_space,
                              actor_critic.recurrent_hidden_state_size,
                              tau=args.tau,
                              gammas=args.gammas,
                              use_delta_gamma=args.use_delta_gamma,
                              use_capped_bias=args.use_capped_bias)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = []
    values = []
    rewards = []
    NUM_STEPS = 10000
    total_num_rewards = 0
    for step in range(NUM_STEPS):

        with torch.no_grad():
            value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                obs, rollouts.recurrent_hidden_states[0], rollouts.masks[0])

            obs, reward, done, infos = envs.step(action)

            r = reward.item()
            if r != 0:  # count steps that produced a non-zero reward
                total_num_rewards += 1

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

    with open(
            'learned_frequencies/' + args.env_name[:-14] +
            '_learned_reward_frequency.pkl', 'wb') as handle:
        pickle.dump(total_num_rewards / NUM_STEPS, handle)
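
What gets pickled above is a single float: the fraction of the 10000 steps that produced a non-zero reward. Assuming the same naming convention, reading it back can be wrapped in a small hypothetical helper:

import pickle

def load_reward_frequency(env_name):
    # env_name[:-14] mirrors the slicing used when the file was written above;
    # this helper is illustrative and assumes the same directory layout.
    path = 'learned_frequencies/' + env_name[:-14] + '_learned_reward_frequency.pkl'
    with open(path, 'rb') as handle:
        return pickle.load(handle)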
Example #24
File: main.py  Project: yuerxin1997/ToHRE
            print(pr_y[i])
            break     
    print("test auc_local: ", auc)
    print("p_4", p_4)
    return auc, p_4, pr_x, pr_y, test_result


if __name__ == "__main__":
    conf = config.Config()
    os.environ['CUDA_VISIBLE_DEVICES'] = conf.gpu
    conf.load_train_data()
    conf.load_test_data()
    tree = Tree(conf)
    conf.global_num_classes = tree.n_class
    base_model = PCNN_ATT(conf)
    policy = Policy(conf, tree.n_class, base_model)
    policy.cuda()
    policy_optimizer = torch.optim.SGD(policy.parameters(), lr=conf.policy_lr, weight_decay=conf.policy_weight_decay)

    for name, parameters in policy.named_parameters():
        print(name, parameters.size())
    criterion = torch.nn.CrossEntropyLoss()
    if conf.is_training:
        train()
    else:
        test()