Example #1
    def Play(self, f_path, deterministic):
        # Autodetect CUDA
        use_cuda = torch.cuda.is_available()
        device = torch.device("cuda" if use_cuda else "cpu")

        env = RemoteVecEnv(2)

        num_inputs = env.observation_space.shape[0]
        num_outputs = env.action_space.shape[0]
        model = ActorCritic(num_inputs, num_outputs, HIDDEN_SIZE).to(device)
        check_point = torch.load(f_path)
        model.load_state_dict(check_point['state_dict'])

        state = env.reset(True)
        done = False
        total_steps = 0
        total_reward = 0
        while not done:
            # env.render()
            state = torch.FloatTensor(state).unsqueeze(0).to(device)
            dist, _ = model(state)
            action = dist.mean.detach().cpu().numpy()[0] if deterministic \
                else dist.sample().cpu().numpy()[0]
            next_state, reward, done, _ = env.step(action, True)
            state = next_state
            total_reward += reward
            total_steps += 1
        # env.env.close()
        self.logQueue.put(
            pprint.pformat("In %d steps we got %.3f reward" %
                           (total_steps, total_reward)))
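
In the playback loop above, the deterministic branch takes the mean of the policy distribution, while the stochastic branch samples from it. A minimal, self-contained sketch of that action-selection step, assuming the dist returned by the network is a torch.distributions.Normal over a continuous action space (the actual ActorCritic head is not shown in this snippet):

    import torch
    from torch.distributions import Normal

    # hypothetical policy output for a 3-dimensional continuous action space
    mean = torch.tensor([[0.1, -0.4, 0.7]])
    std = torch.tensor([[0.2, 0.2, 0.2]])
    dist = Normal(mean, std)

    deterministic = True
    action = dist.mean.detach().cpu().numpy()[0] if deterministic \
        else dist.sample().cpu().numpy()[0]
    print(action)  # the mean action when deterministic, a noisy sample otherwise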
Example #2
    writer = SummaryWriter(comment="ppo_" + args.name)
    
    # Autodetect CUDA
    use_cuda = torch.cuda.is_available()
    device   = torch.device("cuda" if use_cuda else "cpu")
    print('Device:', device)
    
    # Prepare environments
    envs = [make_env() for i in range(NUM_ENVS)]   # make multiple envs (ENV_ID) for training
    envs = SubprocVecEnv(envs)   # run each env in its own subprocess and step them as a batch
    env = gym.make(ENV_ID)       # make env for testing
    num_inputs  = envs.observation_space.shape[0]
    num_outputs = envs.action_space.shape[0]

    # A simple Actor-Critic network using FC neural net
    model = ActorCritic(num_inputs, num_outputs, HIDDEN_SIZE).to(device)
    print(model)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    frame_idx  = 0
    train_epoch = 0
    best_reward = None

    state = envs.reset()
    early_stop = False  # variable used to stop the while loop (when target reward reached)

    while not early_stop:  # Loop until target reward reached

        log_probs = []
        values    = []
        states    = []
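
SubprocVecEnv expects a list of zero-argument callables, each of which builds a fresh environment inside its own worker process. make_env is not shown in this snippet; a plausible factory (ENV_ID below is only a placeholder for the one defined elsewhere in the script) would look roughly like this:

    import gym

    ENV_ID = "Pendulum-v1"  # placeholder; the real ENV_ID is defined elsewhere

    def make_env():
        # return a thunk so each subprocess constructs its own env instance
        def _thunk():
            return gym.make(ENV_ID)
        return _thunk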
Example #3
                        "--deterministic",
                        default=True,
                        action="store_true",
                        help="enable deterministic actions")
    args = parser.parse_args()

    # Autodetect CUDA
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    env = gym.make(args.env)

    print("OKAY")
    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.shape[0]
    model = ActorCritic(num_inputs, num_outputs, HIDDEN_SIZE).to(device)
    model = torch.load(args.model)

    # Training: actions are sampled semi-randomly from the probability distribution output by the network, which gives exploration
    # Testing: actions are chosen deterministically, not randomly
    for i in range(20):  # number of videos to record
        state = env.reset()
        done = False
        total_steps = 0
        total_reward = 0

        while not done:
            state = torch.FloatTensor(state).unsqueeze(0).to(device)
            dist, _ = model(state)
            # continuous action space: instead of sampling from the Normal(mean, std) policy, we use the mean
            # (for a discrete action space we would take the argmax of the probabilities)
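
Note that this example first builds an ActorCritic and then immediately rebinds model to torch.load(args.model), which only works if the checkpoint stores the whole pickled module. A hedged sketch of the two common loading styles, reusing the names defined above (the checkpoint file names here are hypothetical):

    import torch

    # 1) the checkpoint contains the entire pickled nn.Module (what the rebinding above assumes)
    model = torch.load("ppo_model_full.pt", map_location=device)
    model.eval()

    # 2) the checkpoint contains only the weights; load them into a freshly built network
    model = ActorCritic(num_inputs, num_outputs, HIDDEN_SIZE).to(device)
    model.load_state_dict(torch.load("ppo_model_state_dict.pt", map_location=device))
    model.eval()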
Example #4
    use_cuda = torch.cuda.is_available()
    device   = torch.device("cuda" if use_cuda else "cpu")
    device = torch.device("cpu")  # force CPU here (overrides the autodetected device above)
    print('Device:', device)
    # ss('here')
    # Prepare environments
    envs = [make_env() for i in range(NUM_ENVS)]
    envs = SubprocVecEnv(envs)
    env = gym.make(ENV_ID)
    num_inputs  = envs.observation_space.shape[0]
    # print(envs.observation_space)
    # print(envs.action_space.n)
    num_outputs = envs.action_space.n
    # num_outputs = 2
    # ss('here')
    model = ActorCritic(num_inputs, num_outputs).to(device)
    # ss('model define')
    # print(model)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    frame_idx  = 0
    train_epoch = 0
    best_reward = None

    state = envs.reset()
    early_stop = False

    while not early_stop:

        log_probs = []
        values    = []
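
Unlike the continuous-control examples, this one reads envs.action_space.n, so its ActorCritic presumably ends in a discrete (Categorical) policy head. The network itself is not shown; a minimal sketch of what such a model could look like (layer sizes and structure are assumptions, not the author's code):

    import torch.nn as nn
    from torch.distributions import Categorical

    class ActorCritic(nn.Module):
        """Actor-critic with a Categorical head for a discrete action space."""
        def __init__(self, num_inputs, num_outputs, hidden_size=64):
            super().__init__()
            self.critic = nn.Sequential(
                nn.Linear(num_inputs, hidden_size), nn.ReLU(),
                nn.Linear(hidden_size, 1))
            self.actor = nn.Sequential(
                nn.Linear(num_inputs, hidden_size), nn.ReLU(),
                nn.Linear(hidden_size, num_outputs))

        def forward(self, x):
            value = self.critic(x)                    # state-value estimate
            dist = Categorical(logits=self.actor(x))  # action distribution
            return dist, value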
Example #5
        "-r",
        "--record",
        help="If specified, sets the recording dir, default=Disabled")
    args = parser.parse_args()

    # Autodetect CUDA
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    env = RemoteVecEnv(2)
    # if args.record:
    #     env = gym.wrappers.Monitor(env, args.record, force=True)

    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.shape[0]
    model = ActorCritic(num_inputs, num_outputs, HIDDEN_SIZE).to(device)
    model.load_state_dict(torch.load(args.model))

    state = env.reset(True)
    done = False
    total_steps = 0
    total_reward = 0
    while not done:
        # env.render()
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(state)
        action = dist.mean.detach().cpu().numpy()[0] if args.deterministic \
            else dist.sample().cpu().numpy()[0]
        next_state, reward, done, _ = env.step(action, True)
        state = next_state
        total_reward += reward
Example #6
    # writer = SummaryWriter(comment="ppo_" + args.name)
    writer = SummaryWriter("logs-{}".format(args.name))

    # Autodetect CUDA
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print('Device:', device)

    # Prepare parallel environments
    envs = [make_env() for i in range(NUM_ENVS)]
    envs = SubprocVecEnv(envs)
    env = gym.make(ENV_ID)
    num_inputs = envs.observation_space.shape[0]
    num_outputs = envs.action_space.shape[0]

    model = ActorCritic(num_inputs, num_outputs, HIDDEN_SIZE).to(device)
    print(model)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    print("parameters:", model.parameters())

    frame_idx = 0
    train_epoch = 0  # one complete update cycle
    best_reward = None

    state = envs.reset()  # one initial state per parallel environment (NUM_ENVS states)
    early_stop = False

    while not early_stop:
        # storage for the rollout's training data
        log_probs = []
        values = []
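
Both SummaryWriter constructions seen in these examples are valid but write to different places: comment= appends a suffix to an auto-generated runs/<timestamp>_<host> directory, while passing a path logs directly into that directory. A small sketch (the run name my-run is a placeholder):

    from torch.utils.tensorboard import SummaryWriter

    # auto-generated directory, e.g. runs/May04_12-00-00_hostname_ppo_my-run
    writer_a = SummaryWriter(comment="ppo_" + "my-run")

    # explicit directory: everything goes under ./logs-my-run
    writer_b = SummaryWriter("logs-my-run")

    writer_b.add_scalar("test_rewards", 123.4, 1000)  # tag, scalar value, global step
    writer_a.close()
    writer_b.close()
    # inspect either location with: tensorboard --logdir <dir>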
Example #7
class PPO_Train:
    def __init__(self, logQueue):
        self.settings = {
            'NUM_ENVS': 8,
            'ENV_NAME': 'HopperRobot',
            'HIDDEN_SIZE': 256,
            'LEARNING_RATE': 1e-4,
            'GAMMA': 0.99,
            'GAE_LAMBDA': 0.95,
            'PPO_EPSILON': 0.2,
            'CRITIC_DISCOUNT': 0.5,
            'ENTROPY_BETA': 0.001,
            'PPO_STEPS': 128,
            'MINI_BATCH_SIZE': 256,
            'PPO_EPOCHS': 30,
            'TEST_EPOCHS': 10,
            'NUM_TESTS': 10,
            'TARGET_REWARD': 50000
        }
        self.logQueue = logQueue

    def test_env(self, env, model, device, deterministic=True):
        state = env.reset(True)
        done = False
        total_reward = 0
        while not done:
            # env.render()
            state = torch.FloatTensor(state).unsqueeze(0).to(device)
            dist, _ = model(state)
            action = dist.mean.detach().cpu().numpy()[0] if deterministic \
                else dist.sample().cpu().numpy()[0]
            next_state, reward, done, _ = env.step(action, True)
            state = next_state
            total_reward += reward
            time.sleep(0.05)
        return total_reward

    def normalize(self, x):
        x -= x.mean()
        x /= (x.std() + 1e-8)
        return x

    def compute_gae(self,
                    next_value,
                    rewards,
                    masks,
                    values,
                    gamma=GAMMA,
                    lam=GAE_LAMBDA):
        values = values + [next_value]
        gae = 0
        returns = []
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * values[
                step + 1] * masks[step] - values[step]
            gae = delta + gamma * lam * masks[step] * gae
            # prepend to get correct order back
            returns.insert(0, gae + values[step])
        return returns
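
    # GAE recap: for each step t (mask_t is 0 at an episode boundary),
    #   delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
    #   A_t     = delta_t + gamma * lam * mask_t * A_{t+1}
    # compute_gae returns returns_t = A_t + V(s_t), so the later
    # advantage = returns - values recovers the GAE advantages.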

    def ppo_iter(self, states, actions, log_probs, returns, advantage):
        batch_size = states.size(0)
        ids = np.random.permutation(batch_size)
        ids = np.split(ids[:batch_size // MINI_BATCH_SIZE * MINI_BATCH_SIZE],
                       batch_size // MINI_BATCH_SIZE)
        for i in range(len(ids)):
            yield states[ids[i], :], actions[ids[i], :], log_probs[
                ids[i], :], returns[ids[i], :], advantage[ids[i], :]
        # generates random mini-batches until we have covered the full batch
        # for _ in range(batch_size // MINI_BATCH_SIZE):
        #     rand_ids = np.random.randint(0, batch_size, MINI_BATCH_SIZE)
        #     yield states[rand_ids, :], actions[rand_ids, :], log_probs[rand_ids, :], returns[rand_ids, :], advantage[
        #                                                                                                    rand_ids, :]

    def ppo_update(self,
                   frame_idx,
                   states,
                   actions,
                   log_probs,
                   returns,
                   advantages,
                   clip_param=PPO_EPSILON):
        count_steps = 0
        sum_returns = 0.0
        sum_advantage = 0.0
        sum_loss_actor = 0.0
        sum_loss_critic = 0.0
        sum_entropy = 0.0
        sum_loss_total = 0.0

        # PPO EPOCHS is the number of times we will go through ALL the training data to make updates
        for _ in range(PPO_EPOCHS):
            # grabs random mini-batches several times until we have covered all data
            for state, action, old_log_probs, return_, advantage in self.ppo_iter(
                    states, actions, log_probs, returns, advantages):
                dist, value = self.model(state)
                entropy = dist.entropy().mean()
                new_log_probs = dist.log_prob(action)

                ratio = (new_log_probs - old_log_probs).exp()
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - clip_param,
                                    1.0 + clip_param) * advantage

                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = (return_ - value).pow(2).mean()

                loss = CRITIC_DISCOUNT * critic_loss + actor_loss - ENTROPY_BETA * entropy

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                # track statistics
                sum_returns += return_.mean()
                sum_advantage += advantage.mean()
                sum_loss_actor += actor_loss
                sum_loss_critic += critic_loss
                sum_loss_total += loss
                sum_entropy += entropy

                count_steps += 1

        self.writer.add_scalar("returns", sum_returns / count_steps, frame_idx)
        self.writer.add_scalar("advantage", sum_advantage / count_steps,
                               frame_idx)
        self.writer.add_scalar("loss_actor", sum_loss_actor / count_steps,
                               frame_idx)
        self.writer.add_scalar("loss_critic", sum_loss_critic / count_steps,
                               frame_idx)
        self.writer.add_scalar("entropy", sum_entropy / count_steps, frame_idx)
        self.writer.add_scalar("loss_total", sum_loss_total / count_steps,
                               frame_idx)

    def save_ckp(self, state, fname):
        torch.save(state, fname)

    def Start(self, debugOutputQueue, pauseQueue, fromSavedModel=''):
        mkdir('.', 'checkpoints')
        parser = argparse.ArgumentParser()
        parser.add_argument("-n",
                            "--name",
                            default=self.settings['ENV_NAME'],
                            help="Name of the run")
        args = parser.parse_args()

        # Autodetect CUDA
        use_cuda = torch.cuda.is_available()
        device = torch.device("cuda" if use_cuda else "cpu")
        self.logQueue.put(pprint.pformat('Device:' + device.type))

        # Prepare environments
        envs = RemoteVecEnv(NUM_ENVS)
        num_inputs = envs.observation_space.shape[0]
        num_outputs = envs.action_space.shape[0]

        frame_idx = 0
        train_epoch = 0
        best_reward = None

        self.model = ActorCritic(num_inputs, num_outputs,
                                 HIDDEN_SIZE).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=LEARNING_RATE)
        self.writer = SummaryWriter(comment="ppo_" + args.name)

        if fromSavedModel == '':
            self.logQueue.put('Successfully created %d remote environments' % NUM_ENVS)
            self.logQueue.put(pprint.pformat(self.model))
        else:
            check_point = torch.load(fromSavedModel)
            self.model.load_state_dict(check_point['state_dict'])
            self.optimizer.load_state_dict(check_point['optimizer'])
            train_epoch = check_point['epoch']
            frame_idx = check_point['frame_idx']
            self.logQueue.put('Successfully loaded model from ' + fromSavedModel)

        state = envs.reset()
        early_stop = False
        save_count = 0

        while not early_stop:
            log_probs = []
            values = []
            states = []
            actions = []
            rewards = []
            masks = []

            for _ in range(PPO_STEPS):
                state = torch.FloatTensor(state).to(device)
                dist, value = self.model(state)

                action = dist.sample()
                # each state, reward, done is a list of results from each parallel environment
                action_exp = action.cpu().numpy()
                action_exp = np.clip(action_exp, -10, 10)
                next_state, reward, done, _ = envs.step(action_exp)
                log_prob = dist.log_prob(action)

                log_probs.append(log_prob)
                values.append(value)
                rewards.append(
                    torch.FloatTensor(reward).unsqueeze(1).to(device))
                masks.append(
                    torch.FloatTensor(1 - done).unsqueeze(1).to(device))

                states.append(state)
                actions.append(action)

                state = next_state
                frame_idx += 1

                debugData = (next_state, reward, done, action_exp)
                debugOutputQueue.put(debugData)

                # pause control: a single item sitting in pauseQueue means "stay paused";
                # a second item is the resume signal, so drain the queue and continue
                while pauseQueue.qsize() > 0:
                    if pauseQueue.qsize() == 1:
                        time.sleep(1)
                    else:
                        while not pauseQueue.empty():
                            pauseQueue.get()

            next_state = torch.FloatTensor(next_state).to(device)
            _, next_value = self.model(next_state)
            returns = self.compute_gae(next_value, rewards, masks, values)

            returns = torch.cat(returns).detach()
            log_probs = torch.cat(log_probs).detach()
            values = torch.cat(values).detach()
            states = torch.cat(states)
            actions = torch.cat(actions)
            advantage = returns - values
            advantage = self.normalize(advantage)

            self.ppo_update(frame_idx, states, actions, log_probs, returns,
                            advantage)
            train_epoch += 1

            if train_epoch % TEST_EPOCHS == 0:
                test_reward = np.mean([
                    self.test_env(envs, self.model, device)
                    for _ in range(NUM_TESTS)
                ])
                self.writer.add_scalar("test_rewards", test_reward, frame_idx)
                self.logQueue.put(
                    pprint.pformat('Frame %s. reward: %s' %
                                   (frame_idx, test_reward)))
                # Save a checkpoint whenever the best reward improves (the first recorded best is not saved)
                if best_reward is None or best_reward < test_reward:
                    if best_reward is not None:
                        self.logQueue.put(
                            pprint.pformat(
                                "Best reward updated: %.3f -> %.3f" %
                                (best_reward, test_reward)))
                        name = "%s_best_%+.3f_%d.dat" % (
                            args.name, test_reward, frame_idx)
                        fname = os.path.join('.', 'checkpoints', name)
                        check_point = {
                            'epoch': train_epoch,
                            'state_dict': self.model.state_dict(),
                            'optimizer': self.optimizer.state_dict(),
                            'frame_idx': frame_idx,
                        }
                        # self.save_ckp(check_point, fname)
                        # torch.save(self.model.state_dict(), fname)
                        torch.save(check_point, fname)
                    best_reward = test_reward
                if test_reward > TARGET_REWARD: early_stop = True

                save_count += 1
                if save_count >= 15:
                    self.logQueue.put(
                        pprint.pformat('Saving checkpoint for frame: ' +
                                       str(frame_idx)))
                    name = "%s_frame_%d.dat" % (args.name, frame_idx)
                    fname = os.path.join('.', 'checkpoints', name)
                    check_point = {
                        'epoch': train_epoch,
                        'state_dict': self.model.state_dict(),
                        'optimizer': self.optimizer.state_dict(),
                        'frame_idx': frame_idx,
                    }
                    torch.save(check_point, fname)
                    save_count = 0

    def Play(self, f_path, deterministic):
        # Autodetect CUDA
        use_cuda = torch.cuda.is_available()
        device = torch.device("cuda" if use_cuda else "cpu")

        env = RemoteVecEnv(2)

        num_inputs = env.observation_space.shape[0]
        num_outputs = env.action_space.shape[0]
        model = ActorCritic(num_inputs, num_outputs, HIDDEN_SIZE).to(device)
        check_point = torch.load(f_path)
        model.load_state_dict(check_point['state_dict'])

        state = env.reset(True)
        done = False
        total_steps = 0
        total_reward = 0
        while not done:
            # env.render()
            state = torch.FloatTensor(state).unsqueeze(0).to(device)
            dist, _ = model(state)
            action = dist.mean.detach().cpu().numpy()[0] if deterministic \
                else dist.sample().cpu().numpy()[0]
            next_state, reward, done, _ = env.step(action, True)
            state = next_state
            total_reward += reward
            total_steps += 1
        # env.env.close()
        self.logQueue.put(
            pprint.pformat("In %d steps we got %.3f reward" %
                           (total_steps, total_reward)))
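
Under the settings above (assuming the module-level constants PPO_STEPS, NUM_ENVS, MINI_BATCH_SIZE and PPO_EPOCHS mirror the settings dict), each update cycle handles the following amounts of data; a quick arithmetic sketch:

    NUM_ENVS = 8            # parallel environments
    PPO_STEPS = 128         # rollout length per environment
    MINI_BATCH_SIZE = 256
    PPO_EPOCHS = 30

    batch_size = NUM_ENVS * PPO_STEPS                # 1024 transitions per rollout
    num_minibatches = batch_size // MINI_BATCH_SIZE  # 4 mini-batches per pass of ppo_iter
    grad_steps = PPO_EPOCHS * num_minibatches        # 120 optimizer steps per ppo_update call
    print(batch_size, num_minibatches, grad_steps)   # -> 1024 4 120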