Example #1
    def __init__(self, state_dim, action_dim, agentParam):
        self.state_dim = state_dim  #400#env.observation_space.shape[0]
        self.action_dim = action_dim  # 8#env.action_space.n
        self.gamma = agentParam["gamma"]
        # init N Monte Carlo transitions in one game
        self.saved_log_probs = []
        self.use_cuda = torch.cuda.is_available()
        self.FloatTensor = torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor
        self.rewards = []
        self.device = agentParam["device"]
        # init network parameters
        if agentParam["ifload"]:
            self.policy = torch.load(agentParam["filename"] + "pg" +
                                     agentParam["id"] + ".pth",
                                     map_location=self.device)
        else:
            self.policy = Policy(state_dim=self.state_dim,
                                 action_dim=self.action_dim).to(self.device)

        self.optimizer = optim.Adam(self.policy.parameters(),
                                    lr=agentParam["LR"])
        self.eps = np.finfo(np.float32).eps.item()

        # init some parameters
        self.time_step = 0
Example #2
def test_policy():
    tf.reset_default_graph()
    tf.set_random_seed(0)
    policy = Policy('global', policy_spec={
        "input size": 2,
        "hidden layer size": 2,
        "number of actions": 2})

    print("Policy Tests: ")
    print("-------------------------------------------------")
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        results = sess.run([policy.action, policy.policy_fn, policy.value_fn], feed_dict={
            policy.input: np.array([[1, 2]]),
            policy.exploration_rate: 1,
        })

    with shelve.open(os.path.join(os.path.dirname(__file__), 'data/network_tests')) as db:
        if 'policy' not in db:
            print(results)
            db['policy'] = results
        elif not np.array([np.all(r == t) for r, t in zip(results, db['policy'])]).all():
            print(results)
            print(db['policy'])
            if input("test_policy: Results didn't match. Update results? ") == "yes":
                db['policy'] = results
            else:
                print("test_policy: Test failed!")
                exit()
        else:
            print("test_policy: Test passed!")
    print()
Example #3
    def __init__(self, args):
        tmp_env = make_env(args.env)
        self.obs_shape = tmp_env.observation_space.shape
        self.num_actions = tmp_env.action_space.n
        self.c_in = self.obs_shape[0]
        del tmp_env

        self.horizon = args.horizon
        self.eta = args.eta
        self.epoch = args.epoch
        self.batch_size = args.batch * args.actors
        self.gamma = args.gamma
        self.lam = args.lam
        self.num_actors = args.actors
        self.eps = args.eps
        self.num_iter = (
            args.epoch * args.actors * args.horizon
        ) // self.batch_size  # how many times to run SGD on the buffer

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.queues = [Queue() for i in range(self.num_actors)]
        self.barrier = Queue(
        )  # This is used as a waiting mechanism, to wait for all the agents to env.step()
        self.score_channel = Queue()

        # these are shmem np.arrays
        self.state, self.reward, self.finished = self.init_shared()

        self.workers = [
            Worker(i, args.env, self.queues[i], self.barrier, self.state,
                   self.reward, self.finished, self.score_channel)
            for i in range(self.num_actors)
        ]
        self.start_workers()

        self.model = Policy(self.c_in, self.num_actions).to(self.device)
        self.optim = torch.optim.Adam(self.model.parameters(), lr=self.eta)

        # used for logging and graphing
        self.stat = {
            'scores': [],
            'steps': [],
            'clip_losses': [],
            'value_losses': [],
            'entropies': []
        }
Example #4
class PGagent():
    def __init__(self, state_dim, action_dim, agentParam):
        self.state_dim = state_dim  #400#env.observation_space.shape[0]
        self.action_dim = action_dim  # 8#env.action_space.n
        self.gamma = agentParam["gamma"]
        # init N Monte Carlo transitions in one game
        self.saved_log_probs = []
        self.use_cuda = torch.cuda.is_available()
        self.FloatTensor = torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor
        self.rewards = []
        self.device = agentParam["device"]
        # init network parameters
        if agentParam["ifload"]:
            self.policy = torch.load(agentParam["filename"] + "pg" +
                                     agentParam["id"] + ".pth",
                                     map_location=self.device)
        else:
            self.policy = Policy(state_dim=self.state_dim,
                                 action_dim=self.action_dim).to(self.device)

        self.optimizer = optim.Adam(self.policy.parameters(),
                                    lr=agentParam["LR"])
        self.eps = np.finfo(np.float32).eps.item()

        # init some parameters
        self.time_step = 0

    def select_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        probs = self.policy(state.to(self.device))
        m = Categorical(probs)
        action = m.sample()
        self.saved_log_probs.append(m.log_prob(action).to(self.device))
        return action.item()

    def update(self):
        R = 0
        policy_loss = []
        returns = []
        for r in self.rewards[::-1]:
            R = r + self.gamma * R
            returns.insert(0, R)
        returns = torch.tensor(returns).to(self.device).type(self.FloatTensor)
        returns = (returns - returns.mean()) / (returns.std() + self.eps)
        for log_prob, R in zip(self.saved_log_probs, returns):
            policy_loss.append(-log_prob * R)
        self.optimizer.zero_grad()
        policy_loss = torch.cat(policy_loss).sum()
        policy_loss.backward()
        self.optimizer.step()
        del self.rewards[:]
        del self.saved_log_probs[:]
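A minimal driver for the PGagent in Example #4 might look as follows. This is only a sketch: it assumes a Gym environment with a discrete action space and the old (pre-0.26) reset/step API used elsewhere in these examples; the environment id, hyperparameter values, and file paths are illustrative, while the agentParam keys mirror the ones read in __init__.

import gym
import torch

env = gym.make("CartPole-v1")  # hypothetical choice of environment
agentParam = {
    "gamma": 0.99,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "ifload": False,           # set True to load a previously saved policy
    "filename": "./models/",   # only used when ifload is True
    "id": "0",
    "LR": 1e-3,
}
agent = PGagent(env.observation_space.shape[0], env.action_space.n, agentParam)

for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        action = agent.select_action(state)      # samples an action, stores its log-prob
        state, reward, done, _ = env.step(action)
        agent.rewards.append(reward)             # collected for the Monte Carlo return
    agent.update()                               # one REINFORCE update per finished episode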
Example #5
    def __init__(self, env_name, batch_size, gamma, use_random_features):

        self.random = use_random_features
        self.batch_size = batch_size  # batch_size == number of envs

        self.queues = [Queue() for i in range(batch_size)]
        self.barrier = Queue(
        )  # use to block Trainer until all envs finish updating
        self.channel = Queue(
        )  # envs send their total scores after each episode

        tmp_env = make_env(env_name)
        self.c_in = tmp_env.observation_space.shape[0]
        self.num_actions = tmp_env.action_space.n
        mean, std = self.mean_std_from_random_agent(tmp_env, 10000)

        # sh_state is shared between processes
        self.sh_state = self.init_shared(tmp_env.observation_space.shape)

        self.workers = [
            Worker(i, env_name, self.queues[i], self.barrier, self.channel,
                   self.sh_state, mean, std) for i in range(batch_size)
        ]
        self.start_workers()

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.gamma = gamma  # reward discounting factor

        self.model = Policy(self.c_in, self.num_actions).to(self.device)
        self.icm = IntrinsicCuriosityModule(self.c_in, self.num_actions,
                                            self.random).to(self.device)

        self.optim = torch.optim.Adam(list(self.model.parameters()) +
                                      list(self.icm.parameters()),
                                      lr=1e-3)
        self.cross_entropy = torch.nn.CrossEntropyLoss()
Example #6
class Trainer:
    def __init__(self, env_name, batch_size, gamma, use_random_features):

        self.random = use_random_features
        self.batch_size = batch_size  # batch_size == number of envs

        self.queues = [Queue() for i in range(batch_size)]
        self.barrier = Queue(
        )  # use to block Trainer until all envs finish updating
        self.channel = Queue(
        )  # envs send their total scores after each episode

        tmp_env = make_env(env_name)
        self.c_in = tmp_env.observation_space.shape[0]
        self.num_actions = tmp_env.action_space.n
        mean, std = self.mean_std_from_random_agent(tmp_env, 10000)

        # sh_state is shared between processes
        self.sh_state = self.init_shared(tmp_env.observation_space.shape)

        self.workers = [
            Worker(i, env_name, self.queues[i], self.barrier, self.channel,
                   self.sh_state, mean, std) for i in range(batch_size)
        ]
        self.start_workers()

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.gamma = gamma  # reward discounting factor

        self.model = Policy(self.c_in, self.num_actions).to(self.device)
        self.icm = IntrinsicCuriosityModule(self.c_in, self.num_actions,
                                            self.random).to(self.device)

        self.optim = torch.optim.Adam(list(self.model.parameters()) +
                                      list(self.icm.parameters()),
                                      lr=1e-3)
        self.cross_entropy = torch.nn.CrossEntropyLoss()

    def reset_workers(self):
        for q in self.queues:
            q.put(-1)

    def broadcast_actions(self, actions):
        for i in range(self.batch_size):
            self.queues[i].put(actions[i].item())

    def start_workers(self):
        for worker in self.workers:
            worker.start()

    def stop_workers(self):
        for q in self.queues:
            q.put(None)

    def wait_for_workers(self):
        for i in range(self.batch_size):
            self.barrier.get()

    def init_shared(self, obs_shape):
        shape = (self.batch_size, ) + obs_shape

        state = np.zeros(shape, dtype=np.float32)
        state = RawArray(c_float, state.reshape(-1))
        state = np.frombuffer(state, c_float).reshape(shape)

        return state

    @staticmethod
    def mean_std_from_random_agent(env, steps):
        obs = np.empty((steps, ) + env.observation_space.shape,
                       dtype=np.float32)

        env.reset()
        for i in range(steps):
            state, _, done, _ = env.step(env.action_space.sample())
            obs[i] = np.array(state)
            if done:
                env.reset()
        mean = np.mean(obs, 0)
        std = np.std(obs, 0).mean()
        return mean, std

    def train(self, T_max, graph_name=None):
        step = 0
        self.num_lookahead = 5

        self.reset_workers()
        self.wait_for_workers()

        stat = {
            'ploss': [],
            'vloss': [],
            'score': [],
            'int_reward': [],
            'entropy': [],
            'fwd_kl_div': [],
            'running_loss': 0
        }

        reward_tracker = RunningMeanStd()
        reward_buffer = np.empty((self.batch_size, self.num_lookahead),
                                 dtype=np.float32)
        while step < T_max:

            # these will keep tensors, which we'll use later for backpropagation
            values = []
            log_probs = []
            rewards = []
            entropies = []

            actions = []
            actions_pred = []
            features = []
            features_pred = []

            state = torch.from_numpy(self.sh_state).to(self.device)

            for i in range(self.num_lookahead):
                step += self.batch_size

                logit, value = self.model(state)
                prob = torch.softmax(logit, dim=1)
                log_prob = torch.log_softmax(logit, dim=1)
                entropy = -(prob * log_prob).sum(1, keepdim=True)

                action = prob.multinomial(1)
                sampled_lp = log_prob.gather(1, action)

                # one-hot action
                oh_action = torch.zeros(self.batch_size,
                                        self.num_actions,
                                        device=self.device).scatter_(
                                            1, action, 1)

                self.broadcast_actions(action)
                self.wait_for_workers()

                next_state = torch.from_numpy(self.sh_state).to(self.device)
                s1, s1_pred, action_pred = self.icm(state, oh_action,
                                                    next_state)

                with torch.no_grad():
                    int_reward = 0.5 * (s1 - s1_pred).pow(2).sum(dim=1,
                                                                 keepdim=True)
                reward_buffer[:, i] = int_reward.cpu().numpy().ravel()

                state = next_state

                # save variables for gradient descent
                values.append(value)
                log_probs.append(sampled_lp)
                rewards.append(int_reward)
                entropies.append(entropy)

                if not self.random:
                    actions.append(action.flatten())
                    actions_pred.append(action_pred)
                features.append(s1)
                features_pred.append(s1_pred)

                stat['entropy'].append(entropy.sum(dim=1).mean().item())
                stat['fwd_kl_div'].append(
                    torch.kl_div(s1_pred, s1).mean().item())

            # may have to update reward_buffer with gamma first
            reward_mean, reward_std, count = mpi_moments(reward_buffer.ravel())
            reward_tracker.update_from_moments(reward_mean, reward_std**2,
                                               count)
            std = np.sqrt(reward_tracker.var)
            rewards = [rwd / std for rwd in rewards]
            for rwd in rewards:
                stat['int_reward'].append(rwd.mean().item())

            state = torch.from_numpy(self.sh_state.astype(np.float32)).to(
                self.device)
            with torch.no_grad():
                _, R = self.model(state)  # R is the estimated return

            values.append(R)

            ploss = 0
            vloss = 0
            fwd_loss = 0
            inv_loss = 0

            delta = torch.zeros((self.batch_size, 1),
                                dtype=torch.float,
                                device=self.device)
            for i in reversed(range(self.num_lookahead)):
                R = rewards[i] + self.gamma * R
                advantage = R - values[i]
                vloss += (0.5 * advantage.pow(2)).mean()

                delta = rewards[i] + self.gamma * values[
                    i + 1].detach() - values[i].detach()
                ploss += -(log_probs[i] * delta +
                           0.01 * entropies[i]).mean()  # beta = 0.01

                fwd_loss += 0.5 * (features[i] -
                                   features_pred[i]).pow(2).sum(dim=1).mean()
                if not self.random:
                    inv_loss += self.cross_entropy(actions_pred[i], actions[i])

            self.optim.zero_grad()

            # inv_loss is 0 if using random features
            loss = ploss + vloss + fwd_loss + inv_loss  # 2018 Large scale curiosity paper simply sums them (no lambda and beta anymore)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(
                list(self.model.parameters()) + list(self.icm.parameters()),
                40)
            self.optim.step()

            while not self.channel.empty():
                score = self.channel.get()
                stat['score'].append(score)

            stat['ploss'].append(ploss.item() / self.num_lookahead)
            stat['vloss'].append(vloss.item() / self.num_lookahead)
            stat['running_loss'] = 0.99 * stat[
                'running_loss'] + 0.01 * loss.item() / self.num_lookahead

            if len(stat['score']) > 20 and step % (self.batch_size *
                                                   1000) == 0:
                now = datetime.datetime.now().strftime("%H:%M")
                print(
                    f"Step {step: <10} | Running loss: {stat['running_loss']:.4f} | Running score: {np.mean(stat['score'][-10:]):.2f} | Time: {now}"
                )
                if graph_name is not None and step % (self.batch_size *
                                                      10000) == 0:
                    plot(step,
                         stat['score'],
                         stat['int_reward'],
                         stat['ploss'],
                         stat['vloss'],
                         stat['entropy'],
                         name=graph_name)
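To exercise the curiosity Trainer in Example #6, a launcher along these lines would work. It is a sketch only: the environment id, batch size, and step budget are assumptions, and the __main__ guard matters because the workers run as separate processes.

if __name__ == '__main__':
    trainer = Trainer(env_name='PongNoFrameskip-v4',   # illustrative Atari id for make_env
                      batch_size=8,                    # number of parallel environments
                      gamma=0.99,
                      use_random_features=True)        # random-feature variant of the ICM
    try:
        trainer.train(T_max=1_000_000, graph_name='pong_curiosity')
    finally:
        trainer.stop_workers()                         # signal the worker processes to exit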
Example #7
from tqdm import tqdm
from datetime import datetime
from time import sleep
from loss import PerfPolicy, PerfValue
from math import sqrt
import sys

#params
gamma = 0.998
limit = 5e3
path_to_chkpt = 'weights.tar'
cpu = torch.device('cpu')  #pylint: disable=no-member
gpu = torch.device('cuda:0')  #pylint: disable=no-member

#networks
P = Policy()
V = Value()
need_pretrained = not os.path.isfile(path_to_chkpt)
gym = EggnoggGym(need_pretrained, gpu)  #network in gym.observation

#performance measures
Perf_p = PerfPolicy()
Perf_v = PerfValue()

#info
episode = 1
episode_len = []

#init save upon new start
if need_pretrained:
    """print('Initializing weights...')
Example #8
def test(model_name, goal_pos=1, EWC_flag=True):

    episode_len = 50  # Length of each game.
    obs_size = 7 * 7  # MiniGrid uses a 7x7 window of visibility.
    act_size = 7  # Seven possible actions (turn left, right, forward, pickup, drop, etc.)
    inner_size = 64  # Number of neurons in two hidden layers.
    avg_reward = 0.0  # For tracking average reward per episode.
    env_name = 'MiniGrid-Empty-8x8-v0'  # Size of the grid

    test_avg_reward = open(
        "data-{model}/test_avg_rewards.txt".format(model=model_name), 'a+')

    # Setup OpenAI Gym environment for guessing game.
    env = gym.make(env_name)
    if goal_pos == 2:
        env.set_posX(2)
        env.set_posY(5)
    elif goal_pos == 3:
        env.set_posX(5)
        env.set_posY(2)

    # Check the model directory
    last_checkpoint = utils.search_last_model('torch_models/', model_name)

    # Instantiate a policy network
    policy = Policy(obs_size=obs_size,
                    act_size=act_size,
                    inner_size=inner_size)

    policy.load_state_dict(
        torch.load("torch_models/{model}/{model}-{step}.pth".format(
            model=model_name, step=last_checkpoint)))
    if EWC_flag:
        try:
            with open("data-{model}/FIM.dat".format(model=model_name),
                      'rb') as f:
                FIM = pickle.load(f)
            policy.set_FIM(FIM)
        except FileNotFoundError:
            with open("data-{model}/nonD_FIM.dat".format(model=model_name),
                      'rb') as f:
                FIM = pickle.load(f)
            policy.set_FIM(FIM)
    print("Loaded previous checkpoint at step {step}.".format(
        step=last_checkpoint))

    # Run for a fixed number of episodes.
    episodes = 1001
    for step in range(episodes):
        # MiniGrid has a QT5 renderer which is pretty cool.
        env.render('human')
        time.sleep(0.01)

        # Run an episode.
        (states, actions,
         discounted_rewards) = network.run_episode(env, policy, episode_len)
        avg_reward += np.mean(discounted_rewards)

        if step % 100 == 0:
            print('Average reward @ episode {}: {}'.format(
                step + int(last_checkpoint), avg_reward / 100))
            if step != 0:
                test_avg_reward.write(str(avg_reward / 100) + "\n")
            avg_reward = 0.0
Example #9
def main():
    args = parser.parse_args()
    env_name = args.env_name
    input_file = args.input_file
    checkpoint_file = args.resume
    test_only = args.test_only
    seed = args.seed
    no_gpu = args.no_gpu
    dir_name = args.dir_name
    visualize = args.visualize
    n_test_steps = args.n_test_steps
    log_perf_file = args.log_perf_file
    min_distance = args.min_distance
    max_distance = args.max_distance
    threshold = args.threshold
    y_range = args.y_range
    n_training_samples = args.n_training_samples
    start_index = args.start_index
    exp_name = args.exp_name
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    n_epochs = args.n_epochs

    # Specific to Humanoid - Pybullet
    if visualize and env_name == 'HumanoidBulletEnv-v0':
        spec = gym.envs.registry.env_specs[env_name]
        class_ = gym.envs.registration.load(spec._entry_point)
        env = class_(**{**spec._kwargs}, **{'render': True})
    else:
        env = gym.make(env_name)

    set_global_seed(seed)
    env.seed(seed)

    input_shape = env.observation_space.shape[0] + 3
    output_shape = env.action_space.shape[0]
    net = Policy(input_shape, output_shape)
    if not no_gpu:
        net = net.cuda()
    optimizer = Adam(net.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()
    epochs = 0

    if checkpoint_file:
        epochs, net, optimizer = load_checkpoint(checkpoint_file, net,
                                                 optimizer)

    if not checkpoint_file and test_only:
        print('ERROR: You have not entered a checkpoint file.')
        return

    if not test_only:
        if not os.path.isfile(input_file):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    input_file)

        training_file = open(input_file, 'rb')
        old_states = []
        norms = []
        goals = []
        actions = []
        n_samples = -1

        while n_samples - start_index < n_training_samples:
            try:
                old_s, old_g, new_s, new_g, action = pickle.load(training_file)
                n_samples += 1

                if n_samples < start_index:
                    continue

                old_states.append(np.squeeze(np.array(old_s)))
                norms.append(
                    find_norm(np.squeeze(np.array(new_g) - np.array(old_g))))
                goals.append(
                    preprocess_goal(
                        np.squeeze(np.array(new_g) - np.array(old_g))))
                actions.append(np.squeeze(np.array(action)))
            except (EOFError, ValueError):
                break

        old_states = np.array(old_states)
        norms = np.array(norms)
        goals = np.array(goals)
        actions = np.array(actions)

        normalization_factors = {
            'state': [old_states.mean(axis=0),
                      old_states.std(axis=0)],
            'distance_per_step': [norms.mean(axis=0),
                                  norms.std(axis=0)]
        }
        n_file = open(env_name + '_normalization_factors.pkl', 'wb')
        pickle.dump(normalization_factors, n_file)
        n_file.close()

        old_states = normalize(old_states,
                               env_name + '_normalization_factors.pkl',
                               'state')

        # Summary writer for tensorboardX
        writer = {}
        writer['writer'] = SummaryWriter()

        # Split data into training and validation
        indices = np.arange(old_states.shape[0])
        shuffle(indices)
        val_data = np.concatenate(
            (old_states[indices[:int(old_states.shape[0] / 5)]],
             goals[indices[:int(old_states.shape[0] / 5)]]),
            axis=1)
        val_labels = actions[indices[:int(old_states.shape[0] / 5)]]
        training_data = np.concatenate(
            (old_states[indices[int(old_states.shape[0] / 5):]],
             goals[indices[int(old_states.shape[0] / 5):]]),
            axis=1)
        training_labels = actions[indices[int(old_states.shape[0] / 5):]]
        del old_states, norms, goals, actions, indices

        checkpoint_dir = os.path.join(env_name, 'naive_gcp_checkpoints')
        if dir_name:
            checkpoint_dir = os.path.join(checkpoint_dir, dir_name)
        prepare_dir(checkpoint_dir)

        for e in range(epochs, n_epochs):
            ep_loss = []
            # Train network
            for i in range(int(len(training_data) / batch_size) + 1):
                inp = training_data[batch_size * i:batch_size * (i + 1)]
                out = net(
                    convert_to_variable(inp, grad=False, gpu=(not no_gpu)))
                target = training_labels[batch_size * i:batch_size * (i + 1)]
                target = convert_to_variable(np.array(target),
                                             grad=False,
                                             gpu=(not no_gpu))
                loss = criterion(out, target)
                optimizer.zero_grad()
                ep_loss.append(loss.item())
                loss.backward()
                optimizer.step()

            # Validation
            val_loss = []
            for i in range(int(len(val_data) / batch_size) + 1):
                inp = val_data[batch_size * i:batch_size * (i + 1)]
                out = net(
                    convert_to_variable(inp, grad=False, gpu=(not no_gpu)))
                target = val_labels[batch_size * i:batch_size * (i + 1)]
                target = convert_to_variable(np.array(target),
                                             grad=False,
                                             gpu=(not no_gpu))
                loss = criterion(out, target)
                val_loss.append(loss.item())

            writer['iter'] = e + 1
            writer['writer'].add_scalar('data/val_loss',
                                        np.array(val_loss).mean(), e + 1)
            writer['writer'].add_scalar('data/training_loss',
                                        np.array(ep_loss).mean(), e + 1)

            save_checkpoint(
                {
                    'epochs': (e + 1),
                    'state_dict': net.state_dict(),
                    'optimizer': optimizer.state_dict()
                },
                filename=os.path.join(checkpoint_dir,
                                      str(e + 1) + '.pth.tar'))

            print('Epoch:', e + 1)
            print('Training loss:', np.array(ep_loss).mean())
            print('Val loss:', np.array(val_loss).mean())
            print('')

    # Now we use the trained net to see how the agent reaches a different
    # waypoint from the current one.

    success = 0
    failure = 0

    closest_distances = []
    time_to_closest_distances = []

    f = open(env_name + '_normalization_factors.pkl', 'rb')
    normalization_factors = pickle.load(f)
    average_distance = normalization_factors['distance_per_step'][0]

    for i in range(n_test_steps):
        state = env.reset()
        if env_name == 'Ant-v2':
            obs = env.unwrapped.get_body_com('torso')
            target_obs = [
                obs[0] + np.random.uniform(min_distance, max_distance),
                obs[1] + np.random.uniform(-y_range, y_range), obs[2]
            ]
            target_obs = rotate_point(target_obs, env.unwrapped.angle)
            env.unwrapped.sim.model.body_pos[-1] = target_obs
        elif env_name == 'MinitaurBulletEnv-v0':
            obs = env.unwrapped.get_minitaur_position()
            target_obs = [
                obs[0] + np.random.uniform(min_distance, max_distance),
                obs[1] + np.random.uniform(-y_range, y_range), obs[2]
            ]
            target_obs = rotate_point(
                target_obs, env.unwrapped.get_minitaur_rotation_angle())
            env.unwrapped.set_target_position(target_obs)
        elif env_name == 'HumanoidBulletEnv-v0':
            obs = env.unwrapped.robot.get_robot_position()
            target_obs = [
                obs[0] + np.random.uniform(min_distance, max_distance),
                obs[1] + np.random.uniform(-y_range, y_range), obs[2]
            ]
            target_obs = rotate_point(target_obs, env.unwrapped.robot.yaw)
            env.unwrapped.robot.set_target_position(target_obs[0],
                                                    target_obs[1])
        steps = 0
        done = False
        closest_d = distance(obs, target_obs)
        closest_t = 0
        while distance(obs, target_obs) > threshold and not done:
            goal = preprocess_goal(target_obs - obs)
            state = normalize(np.array(state),
                              env_name + '_normalization_factors.pkl')
            inp = np.concatenate([np.squeeze(state), goal])
            inp = convert_to_variable(inp, grad=False, gpu=(not no_gpu))
            action = net(inp).cpu().detach().numpy()
            state, _, done, _ = env.step(action)
            steps += 1
            if env_name == 'MinitaurBulletEnv-v0':
                obs = env.unwrapped.get_minitaur_position()
            elif env_name == 'HumanoidBulletEnv-v0':
                obs = env.unwrapped.robot.get_robot_position()
            if distance(obs, target_obs) < closest_d:
                closest_d = distance(obs, target_obs)
                closest_t = steps
            if visualize:
                env.render()

        if distance(obs, target_obs) <= threshold:
            success += 1
        elif done:
            failure += 1

        if visualize:
            time.sleep(2)

        closest_distances.append(closest_d)
        time_to_closest_distances.append(closest_t)

    print('Successes: %d, Failures: %d, '
          'Closest distance: %f, Time to closest distance: %d' %
          (success, failure, np.mean(closest_distances),
           np.mean(time_to_closest_distances)))

    if log_perf_file:
        f = open(log_perf_file, 'a+')
        f.write(exp_name + ':Seed-' + str(seed) + ',Success-' + str(success) +
                ',Failure-' + str(failure) + ',Closest_distance-' +
                str(closest_distances) + ',Time_to_closest_distance-' +
                str(time_to_closest_distances) + '\n')
        f.close()
Example #10
def run(episodes=1600,
        episode_len=50,
        inner_size=64,
        lr=0.001,
        env_name='MiniGrid-Empty-8x8-v0',
        training=False,
        goal_pos=1):

    obs_size = 7 * 7  # MiniGrid uses a 7x7 window of visibility.
    act_size = 7  # Seven possible actions (turn left, right, forward, pickup, drop, etc.)
    avg_reward = 0.0  # For tracking average reward per episode.
    first_write_flag = True  # Need this due to a weird behavior of the library
    need_diag_FIM = True  # Compute the diagonal FIM only when required
    need_nondiag_FIM = False  # Same as above, but for the non-diagonal FIM
    model_name = "EWC_model_diag_FIM_3_tasks"  # Retrieve the correct model if it exists
    EWC_flag = True  # If true, uses ewc_loss

    if not EWC_flag:
        need_nondiag_FIM = False
        need_diag_FIM = False
    # Check whether the data directory exists and, if not, create it with all the necessary stuff.
    if not os.path.exists("data-{model}/".format(model=model_name)):
        print("Task 2 data directory created.")
        os.makedirs("data-{model}/".format(model=model_name))

    output_reward = open("data-{model}/reward.txt".format(model=model_name),
                         'a+')
    output_avg = open("data-{model}/avg_reward.txt".format(model=model_name),
                      'a+')
    output_loss = open("data-{model}/loss.txt".format(model=model_name), 'a+')

    # Setup OpenAI Gym environment for guessing game.
    env = gym.make(env_name)
    if goal_pos == 2:
        env.set_posX(2)
        env.set_posY(5)
    elif goal_pos == 3:
        env.set_posX(5)
        env.set_posY(2)

    # Check the model directory
    last_checkpoint = utils.search_last_model('torch_models/', model_name)

    # Instantiate a policy network
    policy = Policy(obs_size=obs_size,
                    act_size=act_size,
                    inner_size=inner_size)

    # If there's a previous checkpoint, load this instead of using a new one.
    if os.listdir('torch_models/{model}/'.format(model=model_name)):
        policy.load_state_dict(
            torch.load("torch_models/{model}/{model}-{step}.pth".format(
                model=model_name, step=last_checkpoint)))
        if need_diag_FIM and EWC_flag:
            with open("data-{model}/FIM.dat".format(model=model_name),
                      'rb') as f:
                FIM = pickle.load(f)
                policy.set_FIM(FIM)
        elif need_nondiag_FIM and EWC_flag:
            with open("data-{model}/nonD_FIM.dat".format(model=model_name),
                      'rb') as f:
                FIM = pickle.load(f)
                policy.set_FIM(FIM)
        print("Loaded previous checkpoint at step {step}.".format(
            step=last_checkpoint))

    else:
        print("Created new policy agent.")

    # Use the Adam optimizer.
    optimizer = torch.optim.Adam(params=policy.parameters(), lr=lr)

    try:
        for step in range(episodes):
            # MiniGrid has a QT5 renderer which is pretty cool.
            env.render('human')
            time.sleep(0.01)

            # Run an episode.
            (states, actions,
             discounted_rewards) = network.run_episode(env, policy,
                                                       episode_len)

            # From list to np.array, then save every element in the array
            discounted_rewards_np = np.asarray(discounted_rewards)
            if step % 100 == 0 and training:
                output_reward.write(str(discounted_rewards_np) + "\n")
            avg_reward += np.mean(discounted_rewards)

            if step % 100 == 0:
                print('Average reward @ episode {}: {}'.format(
                    step + int(last_checkpoint), avg_reward / 100))
                if not first_write_flag and training:
                    output_avg.write(str(avg_reward / 100) + "\n")
                else:
                    first_write_flag = False
                avg_reward = 0.0

            # Save the model every 500 steps
            if step % 500 == 0 and training:
                torch.save(
                    policy.state_dict(),
                    'torch_models/{model}/{model}-{step}.pth'.format(
                        model=model_name, step=step + int(last_checkpoint)))
                print("Checkpoint saved.")

            # Repeat each action, and backpropagate discounted
            # rewards. This can probably be batched for efficiency with a
            # memoryless agent...
            if training:
                optimizer.zero_grad()
            episode_loss = []
            # Use a separate index so the outer episode counter `step` is not shadowed.
            for t, act in enumerate(actions):
                logits = policy(states[t])
                dist = Categorical(logits=logits)
                if EWC_flag:
                    loss = -dist.log_prob(act) * discounted_rewards[t] \
                        + ewc.ewc_loss(policy, 2)
                else:
                    loss = -dist.log_prob(act) * discounted_rewards[t]
                loss.backward()
                episode_loss.append(loss.item())
            current_loss = sum(episode_loss) / episode_len
            if training:
                optimizer.step()
                output_loss.write(str(float(current_loss)) + "\n")

    except KeyboardInterrupt:
        if training:
            print("Training ended.")
        else:
            print("Simulation ended.")

    # Now estimate the diagonal FIM.
    if need_diag_FIM:
        utils.diagonal_FIM(policy, env, episode_len, model_name)
    elif need_nondiag_FIM:
        utils.non_diagonal_FIM(policy, env, episode_len, model_name)
Example #11
class PPOTrainer:
    def __init__(self, args):
        tmp_env = make_env(args.env)
        self.obs_shape = tmp_env.observation_space.shape
        self.num_actions = tmp_env.action_space.n
        self.c_in = self.obs_shape[0]
        del tmp_env

        self.horizon = args.horizon
        self.eta = args.eta
        self.epoch = args.epoch
        self.batch_size = args.batch * args.actors
        self.gamma = args.gamma
        self.lam = args.lam
        self.num_actors = args.actors
        self.eps = args.eps
        self.num_iter = (
            args.epoch * args.actors * args.horizon
        ) // self.batch_size  # how many times to run SGD on the buffer

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.queues = [Queue() for i in range(self.num_actors)]
        self.barrier = Queue(
        )  # This is used as a waiting mechanism, to wait for all the agents to env.step()
        self.score_channel = Queue()

        # these are shmem np.arrays
        self.state, self.reward, self.finished = self.init_shared()

        self.workers = [
            Worker(i, args.env, self.queues[i], self.barrier, self.state,
                   self.reward, self.finished, self.score_channel)
            for i in range(self.num_actors)
        ]
        self.start_workers()

        self.model = Policy(self.c_in, self.num_actions).to(self.device)
        self.optim = torch.optim.Adam(self.model.parameters(), lr=self.eta)

        # used for logging and graphing
        self.stat = {
            'scores': [],
            'steps': [],
            'clip_losses': [],
            'value_losses': [],
            'entropies': []
        }

    def init_shared(self):
        state_shape = (self.num_actors, *self.obs_shape)
        scalar_shape = (self.num_actors, 1)

        state = np.empty(state_shape, dtype=np.float32)
        state = RawArray(c_float, state.reshape(-1))
        state = np.frombuffer(state, c_float).reshape(state_shape)

        reward = np.empty(scalar_shape, dtype=np.float32)
        reward = RawArray(c_float, reward.reshape(-1))
        reward = np.frombuffer(reward, c_float).reshape(scalar_shape)

        finished = np.empty(scalar_shape, dtype=np.float32)
        finished = RawArray(c_float, finished.reshape(-1))
        finished = np.frombuffer(finished, c_float).reshape(scalar_shape)

        return state, reward, finished

    def start_workers(self):
        for worker in self.workers:
            worker.start()

    def initialize_state(self):
        for i in range(self.num_actors):
            self.queues[i].put(-1)
        self.wait_for_agents()

    @timing_wrapper
    def broadcast_actions(self, actions):
        actions = actions.cpu().numpy()
        for i in range(self.num_actors):
            self.queues[i].put(actions[i])
        self.wait_for_agents()

        next_state = torch.tensor(self.state).to(self.device)
        reward = torch.tensor(self.reward).to(self.device)
        done = torch.tensor(self.finished).to(self.device)
        return next_state, reward, done

    def wait_for_agents(self):
        for i in range(self.num_actors):
            self.barrier.get()

    def setup_scheduler(self, T_max):
        num_steps = T_max // (self.horizon * self.num_actors)
        self.scheduler = torch.optim.lr_scheduler.LambdaLR(
            self.optim, lambda x: max(1 - x / num_steps, 0))

    @timing_wrapper
    def train(self, T_max, graph_name=None):
        self.setup_scheduler(T_max)

        global_step = 0

        self.initialize_state()
        state = torch.tensor(self.state).to(self.device)
        while global_step < T_max:

            states = []
            actions = []
            rewards = []
            finished = []
            sampled_lps = []  # sampled log probabilities
            values = []

            time_start = time.time()
            duration_fwd = 0
            with torch.no_grad():
                for t in range(self.horizon):
                    global_step += self.num_actors

                    logit, value = self.model(state)
                    prob = torch.softmax(logit, dim=1)
                    log_prob = torch.log_softmax(logit, dim=1)

                    action = prob.multinomial(1)
                    sampled_lp = log_prob.gather(1, action)

                    (next_state, reward,
                     done), duration_brdcst = self.broadcast_actions(action)

                    # appending to buffer
                    states.append(state)
                    actions.append(action)
                    rewards.append(reward)
                    finished.append(done)
                    sampled_lps.append(sampled_lp)
                    values.append(value)

                    state = next_state

                    duration_fwd += duration_brdcst

                _, V = self.model(next_state)
                values.append(V)

            time_forward = time.time()

            # GAE estimation
            GAEs, duration_GAE = self.compute_GAE(rewards, finished, values)

            duration_backward = self.run_gradient_descent(
                states, actions, sampled_lps, values, GAEs)

            time_end = time.time()

            total_duration = time_end - time_start
            percent_broadcast = duration_fwd / total_duration * 100
            percent_forward = (time_forward -
                               time_start) / total_duration * 100
            percent_GAE = duration_GAE / total_duration * 100
            percent_backward = duration_backward / total_duration * 100

            # print(f"<Time> Total: {total_duration:.2f} | forward: {percent_forward:.2f}% (broadcast {percent_broadcast:.2f}%) | GAE: {percent_GAE:.2f}% | backward: {percent_backward:.2f}%")
            if global_step % (self.num_actors * self.horizon * 30) == 0:
                while not self.score_channel.empty():
                    score, step = self.score_channel.get()
                    self.stat['scores'].append(score)
                    self.stat['steps'].append(step)
                now = datetime.datetime.now().strftime("%H:%M")
                print(
                    f"Step {global_step} | Mean of last 10 scores: {np.mean(self.stat['scores'][-10:]):.2f} | Time: {now}"
                )
                if graph_name is not None:
                    plot(global_step, self.stat, graph_name)
        # Finish
        plot(global_step, self.stat, graph_name)

    @timing_wrapper
    def compute_GAE(self, rewards, finished, values):
        GAEs = []
        advantage = 0
        for i in reversed(range(self.horizon)):
            td_error = rewards[i] + (
                1 - finished[i]) * self.gamma * values[i + 1] - values[i]
            advantage = td_error + (
                1 - finished[i]) * self.gamma * self.lam * advantage
            GAEs.append(advantage)
        GAEs = torch.cat(GAEs[::-1]).to(self.device)

        # NOTE: Below is currently not in use because I don't know how to take the 'finished' tensor into account
        # NOTE: This version is much, much faster than the python-looped version above
        # NOTE: But in terms of the total time taken, it doesn't make much of a difference. (~2% compared to ~0.05%)
        # rewards = torch.stack(rewards)
        # finished = torch.stack(finished)
        # values = torch.stack(values)

        # td_error = rewards + (1 - finished) * self.gamma * values[1:] - values[:-1]
        # td_error = td_error.cpu()

        # GAEs = scipy.signal.lfilter([1], [1, -self.gamma * self.lam], td_error.flip(dims=(0,)), axis=0)
        # GAEs = np.flip(GAEs, axis=0)  # flip it back again
        # GAEs = GAEs.reshape(-1, GAEs.shape[-1])  # (horizon, num_actors, 1) --> (horizon * num_actors, 1)
        # GAEs = torch.tensor(GAEs).float().to(self.device)

        return GAEs

    @timing_wrapper
    def run_gradient_descent(self, states, actions, sampled_lps, values, GAEs):

        states = torch.cat(states)
        actions = torch.cat(actions)
        sampled_lps = torch.cat(sampled_lps)
        values = torch.cat(values[:-1])
        targets = GAEs + values

        self.scheduler.step()
        # Running SGD for K epochs
        for it in range(self.num_iter):
            # Batch indices
            idx = np.random.randint(0, self.horizon * self.num_actors,
                                    self.batch_size)

            state = states[idx]
            action = actions[idx]
            sampled_lp = sampled_lps[idx]
            GAE = GAEs[idx]
            value = values[idx]
            target = targets[idx]

            # Normalize advantages
            GAE = (GAE - GAE.mean()) / (GAE.std() + 1e-8)

            logit_new, value_new = self.model(state)
            # Clipped values are needed because sometimes values can unexpectedly get really big
            clipped_value_new = value + torch.clamp(value_new - value,
                                                    -self.eps, self.eps)

            # Calculating policy loss
            prob_new = torch.softmax(logit_new, dim=1)
            lp_new = torch.log_softmax(logit_new, dim=1)
            entropy = -(prob_new * lp_new).sum(1).mean()

            sampled_lp_new = lp_new.gather(1, action)

            ratio = torch.exp(sampled_lp_new - sampled_lp)
            surr1 = ratio * GAE
            surr2 = torch.clamp(ratio, 1 - self.eps, 1 + self.eps) * GAE
            clip_loss = torch.min(surr1, surr2).mean()

            # Calculating value loss
            value_loss1 = (value_new - target).pow(2)
            value_loss2 = (clipped_value_new - target).pow(2)
            value_loss = 0.5 * torch.max(value_loss1, value_loss2).mean()

            final_loss = -clip_loss + value_loss - 0.01 * entropy

            self.optim.zero_grad()
            final_loss.backward()

            # total_norm = 0
            # for p in self.model.parameters():
            #     param_norm = p.grad.data.norm(2)
            #     total_norm += param_norm.item() ** 2
            # total_norm = total_norm ** (1. / 2)
            # print(total_norm)

            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
            self.optim.step()

            # graphing
            self.stat['clip_losses'].append(clip_loss.item())
            self.stat['value_losses'].append(value_loss.item())
            self.stat['entropies'].append(entropy.item())
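For completeness, one hypothetical way to launch the PPOTrainer above. The argument names mirror the fields read in __init__ (env, horizon, eta, epoch, batch, gamma, lam, actors, eps); the concrete default values and the graph name are only an illustration.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--horizon', type=int, default=128)    # steps collected per rollout
    parser.add_argument('--eta', type=float, default=2.5e-4)   # Adam learning rate
    parser.add_argument('--epoch', type=int, default=3)        # SGD epochs over each buffer
    parser.add_argument('--batch', type=int, default=32)       # minibatch size per actor
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)     # GAE lambda
    parser.add_argument('--actors', type=int, default=8)       # parallel worker processes
    parser.add_argument('--eps', type=float, default=0.1)      # PPO clipping range
    args = parser.parse_args()

    trainer = PPOTrainer(args)
    trainer.train(T_max=10_000_000, graph_name='breakout_ppo')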