Example #1
def main():
    # env = PybulletPhonebotSubprocEnv()
    action_size = 8
    num_env = 4

    def get_env(index: int):
        # env = PybulletPhonebotEnv(sim_settings=PybulletSimulatorSettings(
        # render=False, random_orientation=True))
        env = PybulletPhonebotSubprocEnv(
            PybulletSimulatorSettings(render=False, random_orientation=True))
        env.set_seed(index)
        env.reset()
        return env

    env = MultiEnv(get_env, num_env)
    while True:
        print(env.sense())
        res = env.step([np.zeros(action_size) for _ in range(num_env)])
        print(res[0], res[1], res[2], res[3])
        time.sleep(0.1)
        break
Example #2
def train():
    args = parse_a2c_args()
    args2 = parse_a2c_args()
    output_dir = initialize_logging(args)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    num_updates = int(
        args.num_frames) // args.num_steps // args.num_environments
    # Create the train and test environments with Multiple processes
    train_envs = MultiEnv(args.simulator,
                          args.num_environments,
                          args,
                          is_train=True)

    # Create the test environments for the classic levels
    args2.scenario_dir = "scenarios_transfer_learning/mazes_classic_test/"
    args2.scenario = "custom_scenario_test{:003}.cfg"
    classic_test_envs = MultiEnv(args.simulator,
                                 args.num_environments,
                                 args2,
                                 is_train=False)
    # Create the test environments for the comb levels
    args2.scenario_dir = "scenarios_transfer_learning/little_combs_test/"
    little_combs_test_envs = MultiEnv(args.simulator,
                                      args.num_environments,
                                      args2,
                                      is_train=False)
    args2.scenario_dir = "scenarios_transfer_learning/medium_combs_test/"
    medium_combs_test_envs = MultiEnv(args.simulator,
                                      args.num_environments,
                                      args2,
                                      is_train=False)

    test_envs = MultiEnv(args.simulator,
                         args.num_environments,
                         args,
                         is_train=False)

    # Writer will output to ./runs/ directory by default
    writer = torch.utils.tensorboard.SummaryWriter()

    obs_shape = train_envs.obs_shape

    # The agent's policy network and training algorithm A2C
    policy = CNNPolicy(obs_shape, args).to(device)
    agent = A2CAgent(policy,
                     args.hidden_size,
                     value_weight=args.value_loss_coef,
                     entropy_weight=args.entropy_coef,
                     num_steps=args.num_steps,
                     num_parallel=args.num_environments,
                     gamma=args.gamma,
                     lr=args.learning_rate,
                     opt_alpha=args.alpha,
                     opt_momentum=args.momentum,
                     max_grad_norm=args.max_grad_norm)

    start_j = 0
    if args.reload_model:
        checkpoint_idx = args.reload_model.split(',')[1]
        checkpoint_filename = '{}models/base_line.pth.tar'.format(output_dir)
        agent.load_model(checkpoint_filename)
        start_j = 0  #(int(checkpoint_idx) // args.num_steps // args.num_environments) + 1

    obs = train_envs.reset()
    start = time.time()
    nb_of_saves = 0

    for j in range(start_j, num_updates):
        print("------", j / num_updates * 100, "-------")

        # Evaluate the model's performance
        if not args.skip_eval and j % args.eval_freq == 0:
            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            mean_rewards_classic, game_times_classic = agent.evaluate(
                classic_test_envs, j, total_num_steps)
            mean_rewards_little, game_times_little = agent.evaluate(
                little_combs_test_envs, j, total_num_steps)
            mean_rewards_medium, game_times_medium = agent.evaluate(
                medium_combs_test_envs, j, total_num_steps)

            # succes_classic = sum([1 if i!=525 else 0 for i in game_times_classic])/16
            #  succes_little = sum([1 if i!=525 else 0 for i in game_times_little])/16
            # succes_medium = sum([1 if i!=525 else 0 for i in game_times_medium])/16

            writer.add_scalar("Reward classic levels", mean_rewards_classic, j)
            writer.add_scalar("Reward little combs levels",
                              mean_rewards_little, j)
            writer.add_scalar("Reward medium combs levels",
                              mean_rewards_medium, j)
        # writer.add_scalar("Success rate classic levels", succes_classic, j)
        # writer.add_scalar("Success rate little combs levels", succes_little, j)
        # writer.add_scalar("Success rate medium combs levels", succes_medium, j)

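        # Collect a rollout of num_steps transitions from every parallel environment.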
        for step in range(args.num_steps):
            action = agent.get_action(obs, step)
            obs, reward, done, info = train_envs.step(action)
            agent.add_rewards_masks(reward, done, step)

        report = agent.update(obs)

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            save_num_steps = (start_j) * args.num_environments * args.num_steps
            FPS = int((total_num_steps - save_num_steps) / (end - start))

            logging.info(report.format(j, total_num_steps, FPS))

        if j % args.model_save_rate == 0:
            nb_of_saves += 1
            agent.save_policy2(nb_of_saves, args, output_dir)

    # cancel the env processes
    train_envs.cancel()
    test_envs.cancel()
Example #3
File: train.py  Project: agajews/deep
def train(model,
          optim,
          env_fn,
          num_envs,
          num_stack,
          num_steps,
          num_updates,
          gamma,
          value_loss_coef,
          entropy_coef,
          max_grad_norm,
          log_freq=10):
    envs = MultiEnv(env_fn, num_envs)

    model.cuda()

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * num_stack, obs_shape[1], obs_shape[2])

    states = torch.zeros(num_steps + 1, num_envs, *obs_shape)
    current_state = torch.zeros(num_envs, *obs_shape)

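    # Frame stacking: drop the oldest frame from current_state and append the newest observation.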
    def update_current_state(state):
        state = torch.from_numpy(np.stack(state)).float()
        current_state[:, :-1] = current_state[:, 1:]
        current_state[:, -1] = state

    state = envs.reset()
    update_current_state(state)

    rewards = torch.zeros(num_steps, num_envs, 1)
    value_preds = torch.zeros(num_steps + 1, num_envs, 1)
    old_log_probs = torch.zeros(num_steps, num_envs, envs.action_space.n)
    returns = torch.zeros(num_steps + 1, num_envs, 1)

    actions = torch.LongTensor(num_steps, num_envs)
    masks = torch.zeros(num_steps, num_envs, 1)

    # These variables are used to compute reward stats for all processes.
    episode_rewards = torch.zeros([num_envs, 1])
    final_rewards = torch.zeros([num_envs, 1])

    states = states.cuda()
    current_state = current_state.cuda()
    rewards = rewards.cuda()
    value_preds = value_preds.cuda()
    old_log_probs = old_log_probs.cuda()
    returns = returns.cuda()
    actions = actions.cuda()
    masks = masks.cuda()

    for j in range(num_updates):
        for step in range(num_steps):
            # Sample actions
            value, logits = model(Variable(states[step], volatile=True))
            probs = F.softmax(logits)
            log_probs = F.log_softmax(logits).data
            actions[step] = probs.multinomial().data

            cpu_actions = actions[step].cpu()
            cpu_actions = cpu_actions.numpy()

            # Observe reward and next state
            state, reward, done, info = envs.step(cpu_actions)

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            np_masks = np.array([0.0 if done_ else 1.0 for done_ in done])

            # If done then clean the history of observations.
            pt_masks = torch.from_numpy(
                np_masks.reshape(np_masks.shape[0], 1, 1, 1)).float()
            pt_masks = pt_masks.cuda()
            current_state *= pt_masks

            update_current_state(state)
            states[step + 1].copy_(current_state)
            value_preds[step].copy_(value.data)
            old_log_probs[step].copy_(log_probs)
            rewards[step].copy_(reward)
            masks[step].copy_(torch.from_numpy(np_masks).unsqueeze(1))

            final_rewards *= masks[step].cpu()
            final_rewards += (1 - masks[step].cpu()) * episode_rewards

            episode_rewards *= masks[step].cpu()

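        # Bootstrap from the value of the final state, then accumulate discounted returns backwards through the rollout.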
        returns[-1] = model(Variable(states[-1], volatile=True))[0].data
        for step in reversed(range(num_steps)):
            returns[step] = returns[step + 1] * \
                gamma * masks[step] + rewards[step]

        # Reshape to do in a single forward pass for all steps
        values, logits = model(
            Variable(states[:-1].view(-1,
                                      *states.size()[-3:])))
        log_probs = F.log_softmax(logits)

        # Unreshape
        logits_size = (num_steps, num_envs, logits.size(-1))

        log_probs = F.log_softmax(logits).view(logits_size)
        probs = F.softmax(logits).view(logits_size)

        values = values.view(num_steps, num_envs, 1)
        logits = logits.view(logits_size)

        action_log_probs = log_probs.gather(2, Variable(actions.unsqueeze(2)))

        dist_entropy = -(log_probs * probs).sum(-1).mean()

        advantages = Variable(returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        loss = value_loss * value_loss_coef + action_loss - dist_entropy * entropy_coef
        optim.zero_grad()
        loss.backward()

        nn.utils.clip_grad_norm(model.parameters(), max_grad_norm)
        optim.step()

        states[0].copy_(states[-1])

        if j % log_freq == 0:
            print(
                "Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, j * num_envs * num_steps, final_rewards.mean(),
                        final_rewards.median(), final_rewards.min(),
                        final_rewards.max(), -dist_entropy.data[0],
                        value_loss.data[0], action_loss.data[0]))
Example #4
def train(model, create_env, num_envs, optimizer, gamma, num_updates,
          max_episode_length, steps_per_update):
    # torch.manual_seed(args.seed)

    # env.seed(args.seed)

    model.train()

    env = MultiEnv(create_env, num_envs)
    state = env.reset()  # list of states for each concurrent env
    state = torch.from_numpy(state)
    episode_done = True

    episode_length = 0
    update = 0
    while update < num_updates:
        episode_length += 1

        values = []
        log_action_probs = []
        rewards = []
        entropies = []

        for step in range(steps_per_update):
            # list of values and action logits for each concurrent env
            value, action_logit = model(Variable(state))
            action_prob = F.softmax(action_logit)
            log_action_prob = F.log_softmax(action_logit)
            entropy = -(log_action_prob * action_prob).sum(1)
            entropies.append(entropy)

            action = action_prob.multinomial().data
            log_action_prob = log_action_prob.gather(1, Variable(action))

            state, reward, episode_done, _ = env.step(action.numpy())
            if episode_length >= max_episode_length:
                episode_done = True
            reward = max(min(reward, 1), -1)

            state = torch.from_numpy(state)
            values.append(value)
            log_action_probs.append(log_action_prob)
            rewards.append(reward)

            if episode_done:
                episode_length = 0
                state = env.reset()
                break

        R = torch.zeros(1, 1)
        if not episode_done:
            value, _ = model(Variable(state))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        advantage = torch.zeros(1, 1)
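        # Walk the rollout backwards, accumulating the discounted return for the value loss and an advantage estimate for the policy loss.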
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            advantage = Variable(advantage * gamma + rewards[i] +
                                 gamma * values[i + 1].data - values[i].data)

            policy_loss = policy_loss - log_action_probs[
                i] * advantage - 0.01 * entropies[i]

        loss = policy_loss + 0.5 * value_loss

        optimizer.zero_grad()

        loss.backward()
        nn.utils.clip_grad_norm(model.parameters(), 40)

        optimizer.step()
        update += 1
Example #5
File: ppo_v2.py  Project: yycho0108/PPO
def train(opts: Settings):
    # === INSTANTIATE ENVIRONMENT ===
    # gym.make() but with imports configured as specified in arg.
    _gym_make = partial(gym_make, opts.imports)
    subproc_gym_make = subproc(_gym_make)

    # If `opts.subproc==True`, invoke gym.make() in a subprocess,
    # and treat the resultant instance as a `gym.Env`.
    make_env = subproc_gym_make if opts.subproc else _gym_make

    def get_env(index: int):
        env = make_env(opts.env_id)
        env.seed(index)
        env.reset()
        return env

    env = MultiEnv(get_env, opts.num_envs)
    entry_type = [
        ('state', env.observation_space.dtype, env.observation_space.shape),
        ('action', env.action_space.dtype, env.action_space.shape),
        ('reward', np.float32, (1, )),
        # ('state1', env.observation_space.dtype, env.observation_space.shape),
        ('done', np.bool, (1, )),
        ('value', np.float32, (1, )),
        ('log_prob', np.float32, env.action_space.shape)
    ]

    # === NORMALIZERS FOR INPUTS ===
    reward_normalizer = ExponentialMovingGaussian(
        alpha=opts.reward_normalizer_alpha)
    state_normalizer = ExponentialMovingGaussian(
        alpha=opts.state_normalizer_alpha)

    # === INSTANTIATE MEMORY ===
    memory = ContiguousRingBuffer(capacity=opts.update_steps,
                                  dims=(opts.num_envs, ),
                                  dtype=entry_type)

    # === INSTANTIATE POLICY ===
    # FIXME(ycho): Instead of assuming 1D box spaces,
    # explicitly wrap envs with flatten()...
    device = th.device(opts.device)
    policy = AC(env.observation_space.shape[0], env.action_space.shape[0],
                opts.ac).to(device)

    # === INSTANTIATE AGENT ===
    ppo = PPO(policy, memory, device, opts.ppo)

    # === TRAIN ===
    states = env.reset()
    dones = np.full((opts.num_envs, 1), False, dtype=np.bool)
    returns = np.zeros(opts.num_envs, dtype=np.float32)

    # === LOGGER ===
    # TODO(ycho): Configure logger
    writer = SummaryWriter()
    writer.add_graph(policy, th.as_tensor(states).float().to(device))

    # === CALLBACKS ===
    save_cb = SaveCallback(
        opts.save_steps, opts.ckpt_path, lambda: {
            'settings': opts,
            'state_dict': policy.state_dict(),
            'reward_normalizer': reward_normalizer.params(),
            'state_normalizer': state_normalizer.params()
        })

    # === VARIABLES FOR DEBUGGING / LOG TRACKING ===
    reset_count = 0
    start_time = time.time()

    # === START TRAINING ===
    step = 0
    while step < opts.max_steps:
        # Reset any env that has reached termination.
        # FIXME(ycho): assumes isinstance(env, MultiEnv), of course.
        for i in range(opts.num_envs):
            if not dones[i]:
                continue
            states[i][:] = env.envs[i].reset()
            returns[i] = 0.0
            reset_count += 1

        # NOTE(ycho): Workaround for the current limitation of `MultiEnv`.
        # action = [env.action_space.sample() for _ in range(opts.num_envs)]
        # sanitize `states` arg.
        states = np.asarray(states).astype(np.float32)

        # Add states to stats for normalization.
        for s in states:
            state_normalizer.add(s)

        # Normalize states in-place.
        states = state_normalizer.normalize(states)
        states = np.clip(states, -10.0, 10.0)  # clip to +-10 stddev

        with th.no_grad():
            action, value, log_prob = ppo.act(states, True)

        # NOTE(ycho): Clip action within valid domain...
        clipped_action = np.clip(action, env.action_space.low,
                                 env.action_space.high)

        # Step according to above action.
        out = env.step(clipped_action)

        # Format entry.
        nxt_states, rewards, dones, _ = out

        # Add rewards to stats for normalization.
        # returns[np.asarray(dones).reshape(-1).astype(np.bool)] = 0.0
        returns = returns * opts.gae.gamma + np.reshape(rewards, -1)
        # NOTE(ycho): collect stats on `returns` instead of `rewards`.
        # for r in rewards:
        #    reward_normalizer.add(r)
        for r in returns:
            reward_normalizer.add(r)

        # Train if buffer full ...
        if memory.is_full:
            writer.add_scalar('reward_mean',
                              reward_normalizer.mean,
                              global_step=step)
            writer.add_scalar('reward_var',
                              reward_normalizer.var,
                              global_step=step)
            writer.add_scalar('log_std',
                              policy.log_std.detach().cpu().numpy()[0],
                              global_step=step)
            writer.add_scalar('fps',
                              step / (time.time() - start_time),
                              global_step=step)

            # NOTE(ycho): Don't rely on printed reward stats for tracking
            # training progress ... use tensorboard instead.
            print('== step {} =='.format(step))
            # Log reward before overwriting with normalized values.
            print('rew = mean {} min {} max {} std {}'.format(
                memory['reward'].mean(), memory['reward'].min(),
                memory['reward'].max(), memory['reward'].std()))
            # print('rm {} rv {}'.format(reward_normalizer.mean,
            #                           reward_normalizer.var))

            # NOTE(ycho): States have already been normalized,
            # since those states were utilized as input for PPO action.
            # After that, the normalized states were inserted in memory.
            # memory['state'] = state_normalizer.normalize(memory['state'])

            # NOTE(ycho): I think it's fine to delay reward normalization to this point.
            # memory['reward'] = reward_normalizer.normalize(memory['reward'])
            # NOTE(ycho): maybe the proper thing to do is:
            # memory['reward'] = (memory['reward'] - reward_normalizer.mean) / np.sqrt(return_normalizer.var)
            memory['reward'] /= np.sqrt(reward_normalizer.var)
            memory['reward'] = np.clip(memory['reward'], -10.0, 10.0)

            # Create training data slices from memory ...
            dones = np.asarray(dones).reshape(opts.num_envs, 1)
            advs, rets = gae(memory, value, dones, opts.gae)
            # print('std = {}'.format(ppo.policy.log_std.exp()))

            ucount = 0
            info = None
            for _ in range(opts.num_epochs):
                for i in range(0, len(memory), opts.batch_size):
                    # Prepare current minibatch dataset ...
                    exp = memory[i:i + opts.batch_size]
                    act = exp['action']
                    obs = exp['state']
                    old_lp = exp['log_prob']
                    # old_v = exp['value'] # NOTE(ycho): unused
                    adv = advs[i:i + opts.batch_size]
                    ret = rets[i:i + opts.batch_size]

                    # Evaluate what had been done ...
                    # NOTE(ycho): wouldn't new_v == old_v
                    # and new_lp == old_lp for the very first one in the batch??
                    # hmm ....
                    new_v, new_lp, entropy = ppo.evaluate(
                        obs.copy(), act.copy())

                    info_i = {}
                    loss = ppo.compute_loss(obs.copy(), act.copy(),
                                            old_lp.copy(), new_v, new_lp,
                                            entropy, adv, ret, info_i)

                    # NOTE(ycho): Below, only required for logging
                    if True:
                        with th.no_grad():
                            if info is None:
                                info = info_i
                            else:
                                for k in info.keys():
                                    info[k] += info_i[k]
                        ucount += 1

                    # Optimization step
                    ppo.optimizer.zero_grad()
                    loss.backward()
                    # Clip grad norm
                    th.nn.utils.clip_grad_norm_(ppo.policy.parameters(),
                                                opts.ppo.max_grad_norm)
                    ppo.optimizer.step()

            for k, v in info.items():
                writer.add_scalar(k,
                                  v.detach().cpu().numpy() / ucount,
                                  global_step=step)

            # Empty the memory !
            memory.reset()

        # Append to memory.
        entry = list(
            zip(*(
                states,
                action,
                rewards,
                # nxt_states,
                dones,
                value,
                log_prob)))
        memory.append(entry)

        # Cache `states`, update steps and continue.
        states = nxt_states
        step += opts.num_envs

        save_cb.on_step(step)

    writer.close()

    # Save ...
    th.save(
        {
            'settings': opts,
            'state_dict': policy.state_dict(),
            'reward_normalizer': reward_normalizer.params(),
            'state_normalizer': state_normalizer.params()
        }, opts.model_path)
Example #6
def train():
    args = parse_a2c_args()
    output_dir = initialize_logging(args)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    num_updates = int(args.num_frames) // args.num_steps // args.num_environments
    # Create the train and test environments with Multiple processes
    train_envs = MultiEnv(args.simulator, args.num_environments, args, is_train=True)
    test_envs = MultiEnv(args.simulator, args.num_environments, args, is_train=False)
    
    obs_shape = train_envs.obs_shape
    
    # The agent's policy network and training algorithm A2C
    policy = CNNPolicy(obs_shape, args).to(device)
    agent = A2CAgent(policy, 
                     args.hidden_size,
                     value_weight=args.value_loss_coef, 
                     entropy_weight=args.entropy_coef, 
                     num_steps=args.num_steps, 
                     num_parallel=args.num_environments,
                     gamma=args.gamma,
                     lr=args.learning_rate,
                     opt_alpha=args.alpha,
                     opt_momentum=args.momentum,
                     max_grad_norm=args.max_grad_norm)
    
    start_j = 0
    if args.reload_model:
        checkpoint_idx = args.reload_model.split(',')[1]
        checkpoint_filename = '{}models/checkpoint_{}.pth.tar'.format(output_dir, checkpoint_idx)        
        agent.load_model(checkpoint_filename)
        start_j = (int(checkpoint_idx) // args.num_steps // args.num_environments) + 1
        
    obs = train_envs.reset()
    start = time.time()
    
    for j in range(start_j, num_updates):
        if not args.skip_eval and j % args.eval_freq == 0:
            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            mean_rewards, game_times = agent.evaluate(test_envs, j, total_num_steps)
            logging.info(mean_rewards)
            logging.info(game_times)
            
        for step in range(args.num_steps): 
            action = agent.get_action(obs, step)
            obs, reward, done, info = train_envs.step(action)
            agent.add_rewards_masks(reward, done, step)
            
        report = agent.update(obs)
        
        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            save_num_steps = (start_j) * args.num_environments * args.num_steps
            FPS = int((total_num_steps - save_num_steps) / (end - start))
            
            logging.info(report.format(j, total_num_steps, FPS))  
        
        if j % args.model_save_rate == 0:
            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            agent.save_policy(total_num_steps, args, output_dir)
        
    # cancel the env processes    
    train_envs.cancel()
    test_envs.cancel()
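
All six examples rely on the same MultiEnv contract: a constructor that fans environments out across workers, a reset() that returns one observation per worker, and a step() that takes one action per worker and returns batched (obs, reward, done, info). The sketch below restates the factory-based form used in Examples #1 and #5 with a toy environment; DummyEnv, its sizes, and the multi_env import path are assumptions for illustration only, so adapt them to whichever MultiEnv implementation you are using.

# A minimal sketch of the factory-based MultiEnv pattern from Examples #1 and #5.
# `DummyEnv` and the `multi_env` import path are placeholders, not part of any
# project above.
import numpy as np

from multi_env import MultiEnv  # assumption: adjust to your project's module


class DummyEnv:
    """Hypothetical stand-in for a gym-style environment."""

    def __init__(self, obs_size=4, action_size=2):
        self.obs_size = obs_size
        self.action_size = action_size
        self._rng = np.random.default_rng()

    def seed(self, value):
        self._rng = np.random.default_rng(value)

    def reset(self):
        return self._rng.normal(size=self.obs_size)

    def step(self, action):
        obs = self._rng.normal(size=self.obs_size)
        reward, done, info = 0.0, False, {}
        return obs, reward, done, info


def get_env(index):
    # Per-worker factory: give each copy its own seed, as in Examples #1 and #5.
    env = DummyEnv()
    env.seed(index)
    env.reset()
    return env


if __name__ == '__main__':
    num_envs = 4
    envs = MultiEnv(get_env, num_envs)
    obs = envs.reset()
    # One batched step: one action per worker in, one batch each of
    # observations, rewards, dones, and infos out.
    obs, rewards, dones, infos = envs.step(
        [np.zeros(2) for _ in range(num_envs)])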