Example #1
def generate_expert_trajectorys(args):

    env = launch_env()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    expert = PurePursuitExpert(env=env)

    for episode in range(0, args.episodes):
        print("Starting episode", episode)
        observations = []
        actions = []
        for steps in range(0, args.steps):
            # use our 'expert' to predict the next action.
            action = expert.predict(None)
            observation, reward, done, info = env.step(action)
            observations.append(observation)
            actions.append(action)
            # env.render()
        env.reset()
        torch.save(actions, '{}/data_a_{}.pt'.format(args.data_directory,
                                                     episode))
        torch.save(observations,
                   '{}/data_o_{}.pt'.format(args.data_directory, episode))

    env.close()
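
The loop above writes one pair of files per episode. Below is a minimal sketch (not part of the original listing) of reading those trajectories back into flat arrays for imitation learning; it assumes the same 'data_a_{episode}.pt' / 'data_o_{episode}.pt' layout, and the helper name is hypothetical.

import numpy as np
import torch

def load_expert_trajectories(data_directory, episodes):
    # Concatenate the per-episode files written by generate_expert_trajectorys().
    observations, actions = [], []
    for episode in range(episodes):
        actions.extend(torch.load('{}/data_a_{}.pt'.format(data_directory, episode)))
        observations.extend(torch.load('{}/data_o_{}.pt'.format(data_directory, episode)))
    return np.array(observations), np.array(actions)
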
Example #2
def _enjoy():
    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env) # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    policy = DDPG(state_dim, action_dim, max_action, net_type="cnn")
    policy.load(filename='ddpg', directory='reinforcement/pytorch/models/')

    obs = env.reset()
    done = False

    while True:
        while not done:
            action = policy.predict(np.array(obs))
            # Perform action
            obs, reward, done, _ = env.step(action)
            env.render()
        done = False
        obs = env.reset()        
Example #3
def _enjoy(args):

    from learning.utils.env import launch_env
    from learning.utils.wrappers import NormalizeWrapper, ImgWrapper, \
        DtRewardWrapper, ActionWrapper, ResizeWrapper
    from learning.utils.teacher import PurePursuitExpert
    # model = Model(action_dim=2, max_action=1.)
    model = Generator(action_dim=2)

    try:
        # state_dict = torch.load('models/imitate.pt', map_location=device)
        state_dict = torch.load('models/G{}.pt'.format(args.enjoy_tag),
                                map_location=device)

        model.load_state_dict(state_dict)
    except:
        print("Unexpected error:", sys.exc_info()[0])
        print('failed to load model')
        exit()

    model.eval().to(device)

    env = launch_env()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)

    obs = env.reset()

    # max_count = 0
    while True:
        obs = torch.from_numpy(obs).float().to(device).unsqueeze(0)

        action = model(obs)
        action = action.squeeze().data.cpu().numpy()
        print("\nAction taken::", action, "\n")
        obs, reward, done, info = env.step(action)
        env.render()

        # if max_count > 50:
        #     max_count = 0
        #     obs = env.reset()

        if done:
            if reward < 0:
                print('*** FAILED ***')
                time.sleep(0.7)
            # max_count += 1
            obs = env.reset()
            env.render()
Example #4
def _train(args):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)

    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    # Set seeds
    seed(args.seed)

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    policy = DDPG(state_dim, action_dim, max_action, net_type="cnn")
    replay_buffer = ReplayBuffer(args.replay_buffer_max_size)
    file_name = 'ddpg'  # assumed checkpoint name; referenced below but not defined in this snippet
    print("Initialized DDPG")

    # Evaluate untrained policy
    evaluations = [evaluate_policy(env, policy)]

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    episode_reward = None
    env_counter = 0
    reward = 0
    print("Starting training")
    while total_timesteps < args.max_timesteps:

        print("timestep: {} | reward: {}".format(total_timesteps, reward))

        if done:
            if total_timesteps != 0:
                print(
                    ("Total T: %d Episode Num: %d Episode T: %d Reward: %f") %
                    (total_timesteps, episode_num, episode_timesteps,
                     episode_reward))
                policy.train(replay_buffer, episode_timesteps, args.batch_size,
                             args.discount, args.tau)

                # Evaluate episode
                if timesteps_since_eval >= args.eval_freq:
                    timesteps_since_eval %= args.eval_freq
                    evaluations.append(evaluate_policy(env, policy))
                    print("rewards at time {}: {}".format(
                        total_timesteps, evaluations[-1]))

                    if args.save_models:
                        policy.save(file_name, directory=args.model_dir)
                    np.savez("./results/{}.npz".format(file_name), evaluations)

            # Reset environment
            env_counter += 1
            obs = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Select action randomly or according to policy
        if total_timesteps < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = policy.predict(np.array(obs))
            if args.expl_noise != 0:
                action = (action + np.random.normal(
                    0, args.expl_noise, size=env.action_space.shape[0])).clip(
                        env.action_space.low, env.action_space.high)

        # Perform action
        new_obs, reward, done, _ = env.step(action)

        if episode_timesteps >= args.env_timesteps:
            done = True

        done_bool = 0 if episode_timesteps + 1 == args.env_timesteps else float(
            done)
        episode_reward += reward

        # Store data in replay buffer
        replay_buffer.add(obs, new_obs, action, reward, done_bool)

        obs = new_obs

        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    print("Training done, about to save..")
    policy.save(filename='ddpg', directory=args.model_dir)
    print("Finished saving..should return now!")
Example #5
class Worker(mp.Process):
    def __init__(self, global_net, optimizer, args, info, identifier, logger):
        super(Worker, self).__init__()
        self.global_net = global_net
        self.optimizer = optimizer
        self.args = args
        self.info = info
        self.identifier = identifier
        self.name = f'worker-{identifier}'
        self.total_step = 0
        self.ckpt_dir, self.ckpt_path, self.log_dir = logger.get_log_dirs()

    def calc_loss(self, args, values, log_probs, actions, rewards):
        np_values = values.view(-1).data.numpy()

        # Actor loss: Generalized Advantage Estimation A = R(lambda) - V(s), Schulman
        # Paper: High-Dimensional Continuous Control Using Generalized Advantage Estimation
        delta_t = np.asarray(
            rewards) + args.gamma * np_values[1:] - np_values[:-1]
        advantage = discount(delta_t, args.gamma)

        # Select log probabilities of the actions the agent executed
        action_log_probabilities = log_probs.gather(
            1,
            torch.tensor(actions).view(-1, 1))
        policy_loss = -(action_log_probabilities.view(-1) *
                        torch.FloatTensor(advantage.copy())).sum()

        # Critic loss: l2 loss over value estimator
        rewards[-1] += args.gamma * np_values[-1]
        discounted_reward = discount(np.asarray(rewards), args.gamma)
        discounted_reward = torch.tensor(discounted_reward.copy(),
                                         dtype=torch.float32)
        value_loss = .5 * (discounted_reward - values[:-1, 0]).pow(2).sum()

        # Entropy - Used for regularization
        # Entropy is a metric for the distribution of probabilities
        # -> We want to maximize entropy to encourage exploration
        entropy_loss = (-log_probs * torch.exp(log_probs)).sum()
        return policy_loss + 0.5 * value_loss - 0.01 * entropy_loss

    def run(self):
        from learning.utils.env import launch_env
        from learning.utils.wrappers import NormalizeWrapper, ImgWrapper, \
            DtRewardWrapper, ActionWrapper, ResizeWrapper, DiscreteWrapper_a6

        # We have to initialize the gym environment here, otherwise multiprocessing will crash
        self.env = launch_env()
        # self.env = ResizeWrapper(self.env)
        # self.env = NormalizeWrapper(self.env)
        self.env = ImgWrapper(
            self.env)  # to convert the images from 160x120x3 into 3x160x120
        # self.env = ActionWrapper(self.env)
        self.env = DtRewardWrapper(self.env)
        self.env = DiscreteWrapper_a6(self.env)

        # Set seeds so we can reproduce our results
        self.env.seed(self.args.seed + self.identifier)
        torch.manual_seed(self.args.seed + self.identifier)

        self.local_net = Net(1, self.env.action_space.n)  # local network
        state = torch.tensor(preprocess_state(self.env.reset()))

        # bookkeeping
        start_time = last_disp_time = time.time()
        episode_length, epr, eploss, done = 0, 0, 0, True

        render_this_episode = False

        while self.info['frames'][0] <= self.args.max_steps:
            render_this_episode = self.args.graphical_output and (
                render_this_episode or
                (self.info['episodes'] % 10 == 0 and self.identifier == 0))

            # Sync parameters from global net
            self.local_net.load_state_dict(self.global_net.state_dict())

            # Reset hidden state of GRU cell / Remove hidden state from computational graph
            hx = torch.zeros(1, 256) if done else hx.detach()

            # Values used to compute gradients
            values, log_probs, actions, rewards = [], [], [], []

            for step in range(self.args.steps_until_sync):
                episode_length += 1

                # Inference
                value, logit, hx = self.local_net.forward(
                    (state.view(-1, 1, 80, 80), hx))
                action_log_probs = F.log_softmax(logit, dim=-1)

                # Sample an action from the distribution
                action = torch.exp(action_log_probs).multinomial(
                    num_samples=1).data[0]
                np_action = action.numpy()[0]

                done = False
                reward = 0
                # Repeat the chosen action for a few simulator steps and
                # accumulate the reward collected along the way.
                for x in range(self.args.action_update_steps):
                    if not done:
                        state, step_reward, done, _ = self.env.step(np_action)
                        reward += step_reward

                state = torch.tensor(preprocess_state(state))
                epr += reward
                # reward = np.clip(reward, -1, 1)
                done = done or episode_length >= self.args.max_episode_steps

                if render_this_episode:
                    self.env.render()

                self.info['frames'].add_(1)
                num_frames = int(self.info['frames'].item())

                elapsed = time.time() - start_time

                if done:  # Update statistics and save model frequently
                    self.info['episodes'] += 1

                    # Moving average statistics:
                    # Linear interpolation between the current average and the new value
                    # Allows us to better estimate quality of results with high variance
                    interp_factor = 1 if self.info['episodes'][
                        0] == 1 else 1 - 0.99
                    self.info['run_epr'].mul_(1 - interp_factor).add_(
                        interp_factor * epr)
                    self.info['run_loss'].mul_(1 - interp_factor).add_(
                        interp_factor * eploss)

                    # Save the model every args.save_frequency episodes
                    if self.args.save_models and self.info['episodes'][
                            0] % self.args.save_frequency == 0:
                        with open(
                                f"{self.log_dir}/performance-{self.name}.txt",
                                "a") as myfile:
                            myfile.write(
                                f"{self.info['episodes'].item():.0f} {num_frames} {epr} "
                                f"{self.info['run_loss'].item()} {elapsed}\n")

                        torch.save(
                            {
                                'model_state_dict':
                                self.global_net.state_dict(),
                                'optimizer_state_dict':
                                self.optimizer.state_dict(),
                                'info': self.info
                            },
                            f"{self.ckpt_dir}/model-{self.name}-{int(self.info['episodes'].item())}.pth"
                        )

                        print(
                            "Saved model to:",
                            f"{self.ckpt_dir}/model-{self.name}-{self.info['episodes'].item()}"
                        )

                # print training info every minute
                if self.identifier == 0 and time.time() - last_disp_time > 60:
                    elapsed = time.strftime(
                        "%Hh %Mm %Ss", time.gmtime(time.time() - start_time))
                    print(
                        f"[time]: {elapsed}, [episodes]: {self.info['episodes'].item(): .0f},"
                        +
                        f" [frames]: {num_frames: .0f}, [mean epr]:{self.info['run_epr'].item():.2f},"
                        + f" [run loss]: {self.info['run_loss'].item(): .2f}")
                    last_disp_time = time.time()

                # reset buffers / environment
                if done:
                    episode_length, epr, eploss = 0, 0, 0
                state = torch.tensor(preprocess_state(self.env.reset()))

                values.append(value)
                log_probs.append(action_log_probs)
                actions.append(action)
                rewards.append(reward)

            # Reached the sync step -> we need a terminal value.
            # If the episode did not end, bootstrap with the current estimate of V(s).
            next_value = torch.zeros(1, 1) if done else self.local_net.forward(
                (state.unsqueeze(0), hx))[0]
            values.append(next_value.detach())

            # Calculate loss
            loss = self.calc_loss(self.args, torch.cat(values),
                                  torch.cat(log_probs), torch.cat(actions),
                                  np.asarray(rewards))
            eploss += loss.item()

            # Calculate gradient
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.local_net.parameters(), 40)

            # sync gradients with global network
            for param, shared_param in zip(self.local_net.parameters(),
                                           self.global_net.parameters()):
                if shared_param.grad is None:
                    shared_param._grad = param.grad

            # Backpropagation
            self.optimizer.step()
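
calc_loss() above relies on a discount() helper that the listing does not define. A minimal sketch, assuming the usual A3C-style implementation of discounted cumulative sums via scipy.signal.lfilter:

import numpy as np
from scipy.signal import lfilter

def discount(x, gamma):
    # Returns y with y[t] = x[t] + gamma * x[t+1] + gamma^2 * x[t+2] + ...
    x = np.asarray(x, dtype=np.float64)
    return lfilter([1], [1, -gamma], x[::-1])[::-1]
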
Example #7
def _main(args):
    ############## Hyperparameters ##############
    # env_name = "BipedalWalker-v2"
    env_name = 'Duckietown-loop_empty-v0'
    render = False

    lr = 0.0003  # parameters for Adam optimizer
    betas = (0.9, 0.999)

    random_seed = None

    print(args)
    #############################################

    # creating environment
    env = launch_env()
    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")
    # state_dim = env.observation_space.shape[0]
    state_dim = env.observation_space.shape
    state_dim = functools.reduce(operator.mul, state_dim, 1)
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    if random_seed:
        print("Random Seed: {}".format(random_seed))
        torch.manual_seed(random_seed)
        env.seed(random_seed)
        np.random.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, args.action_std, lr, betas, args.gamma,
              args.K_epochs, args.eps_clip, max_action, args.batch_size)
    print(lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    time_step = 0
    episode_reward = 0
    # stats = pd.DataFrame(columns = ["Episode", "Length", "Reward"])
    stats = []
    with open("PPO_stats.csv", 'w') as statsfile:
        statsfile.write("Epoch, Timesteps, Reward\n")
    # training loop
    for i_episode in range(1, args.max_episodes + 1):
        state = env.reset()
        for t in range(args.max_timesteps):
            time_step += 1
            # Running policy_old:
            action = ppo.select_action(state, memory)
            state, reward, done, _ = env.step(action)

            # Saving reward and is_terminals:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)

            # update if it's time
            if time_step % args.update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                time_step = 0
            episode_reward += reward
            if render:
                env.render()
            if done:
                break

        avg_length += t
        # stats = stats.append({"Episode" : i_episode, "Length" : t, "Reward" : episode_reward}, ignore_index=True)
        stats.append((i_episode, t, episode_reward))
        running_reward += episode_reward
        episode_reward = 0

        if i_episode % args.store_interval == 0:
            torch.save(ppo.policy.state_dict(),
                       './PPO_continuous_{}.pth'.format(env_name))
            # stats.to_csv("PPO_stats.csv", index=False) #This line does not work on Google Colab!
            with open("PPO_stats.csv", 'a') as statsfile:
                for eps, ts, rwd in stats:
                    statsfile.write("%d, %d, %f\n" % (eps, ts, rwd))
            stats = []

        # logging
        if i_episode % args.log_interval == 0:
            avg_length = int(avg_length / args.log_interval)
            running_reward = int((running_reward / args.log_interval))

            print('Episode {} \t Avg length: {} \t Avg reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0
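
A minimal sketch (not in the original listing) of reading back the PPO_stats.csv written above, assuming the "Epoch, Timesteps, Reward" header and the "%d, %d, %f" row format used in the loop:

import csv

def load_ppo_stats(path='PPO_stats.csv'):
    # Returns a list of (episode, episode_length, episode_reward) tuples.
    with open(path) as statsfile:
        reader = csv.DictReader(statsfile, skipinitialspace=True)
        return [(int(row['Epoch']), int(row['Timesteps']), float(row['Reward']))
                for row in reader]
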
Example #8
def test():
    ############## Hyperparameters ##############
    # env_name = "BipedalWalker-v2"
    env_name = 'Duckietown-loop_empty-v0'
    # env = gym.make(env_name)
    # creating environment
    env = launch_env()
    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")
    # state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    state_dim = env.observation_space.shape
    state_dim = functools.reduce(operator.mul, state_dim, 1)
    max_action = float(env.action_space.high[0])

    n_episodes = 3  # num of episodes to run
    max_timesteps = 500  # max timesteps in one episode
    render = True  # render the environment
    save_gif = False  # png images are saved in gif folder

    # filename and directory to load model from
    filename = "PPO_continuous_" + env_name + ".pth"
    directory = "./preTrained/"

    action_std = 0.05  # constant std for action distribution (Multivariate Normal)
    K_epochs = 80  # update policy for K epochs
    eps_clip = 0.2  # clip parameter for PPO
    gamma = 0.99  # discount factor

    lr = 0.0003  # parameters for Adam optimizer
    betas = (0.9, 0.999)
    #############################################

    memory = Memory()
    ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs,
              eps_clip, max_action, 32)
    ppo.policy_old.load_state_dict(
        torch.load(directory + filename, map_location=torch.device('cpu')))

    for ep in range(1, n_episodes + 1):
        ep_reward = 0
        state = env.reset()
        for t in range(max_timesteps):
            action = ppo.select_action(state, memory)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.render()
            if save_gif:
                img = env.render(mode='rgb_array')
                img = Image.fromarray(img)
                img.save('./gif/{}.jpg'.format(t))
            if done:
                break

        print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
        ep_reward = 0

    env.close()
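
With save_gif enabled, the loop above only writes individual frames to ./gif/. A minimal sketch (an assumption, not shown in the listing) of stitching them into an animated GIF with Pillow, which the snippet already uses via Image.fromarray:

import glob
import os
from PIL import Image

def frames_to_gif(pattern='./gif/*.jpg', out_path='rollout.gif', frame_ms=40):
    # Sort frames numerically by the timestep embedded in the filename.
    paths = sorted(glob.glob(pattern),
                   key=lambda p: int(os.path.splitext(os.path.basename(p))[0]))
    frames = [Image.open(p) for p in paths]
    frames[0].save(out_path, save_all=True, append_images=frames[1:],
                   duration=frame_ms, loop=0)
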
Example #9
def _train(args):
    print("Running Expert for {} Episodes of {} Steps".format(
        args.episodes, args.steps))
    print("Training Learning for {} Epochs with Batch Size of {}".format(
        args.epochs, args.batch_size))

    env = launch_env()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    observation_shape = (None, ) + env.observation_space.shape
    action_shape = (None, ) + env.action_space.shape

    # Create an imperfect demonstrator
    expert = PurePursuitExpert(env=env)

    observations = []
    actions = []

    # let's collect our samples
    for episode in range(0, args.episodes):
        print("Starting episode", episode)
        for steps in range(0, args.steps):
            # use our 'expert' to predict the next action.
            action = expert.predict(None)
            observation, reward, done, info = env.step(action)
            observations.append(observation)
            actions.append(action)

        env.reset()

    env.close()

    actions = np.array(actions)
    observations = np.array(observations)

    model = TensorflowModel(
        observation_shape=observation_shape,  # from the logs we've got
        action_shape=action_shape,  # same
        graph_location=args.model_directory,  # where we want to store our trained models
        seed=args.seed  # to seed all random operations in the model (e.g., dropout)
    )

    for i in range(args.epochs):
        # we defined the batch size above; adjust it according to your computing resources
        loss = None
        for batch in range(0, len(observations), args.batch_size):
            print("Training batch", batch)
            loss = model.train(observations=observations[batch:batch +
                                                         args.batch_size],
                               actions=actions[batch:batch + args.batch_size])

        # every 10 epochs, we store the model we have
        if i % 10 == 0:
            model.commit()

    print("Training complete!")
Example #10
def _train(args):
    # Ensure that multiprocessing works properly without deadlock...
    if sys.version_info[0] > 2:
        mp.set_start_method('spawn')

    env = launch_env()
    # env = ResizeWrapper(env)
    # env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    # env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    env = DiscreteWrapper_a6(env)

    # Set seeds
    seed(args.seed)

    logger = Logger("models")
    ckpt_dir, ckpt_path, log_dir = logger.get_log_dirs()

    shape_obs_space = env.observation_space.shape  # (3, 120, 160)
    shape_action_space = env.action_space.n  # 3

    print("Initializing Global Network")
    global_net = a3c.Net(channels=1, num_actions=shape_action_space)
    global_net.share_memory()  # share the global parameters in multiprocessing
    optimizer = CustomOptimizer.SharedAdam(global_net.parameters(),
                                           lr=args.learning_rate)
    info = {
        k: torch.DoubleTensor([0]).share_memory_()
        for k in ['run_epr', 'run_loss', 'episodes', 'frames']
    }

    if args.model_file is not None:
        cwd = os.getcwd()
        filepath = os.path.join(cwd, args.model_dir, args.model_file)
        checkpoint = torch.load(filepath)
        global_net.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        info = checkpoint['info']
        print('Loaded model:', args.model_file)

    print("Instantiating %i workers" % args.num_workers)

    workers = [
        a3c.Worker(global_net,
                   optimizer,
                   args,
                   info,
                   identifier=i,
                   logger=logger) for i in range(args.num_workers)
    ]

    print("Start training...")

    interrupted = False

    for w in workers:
        w.daemon = True
        w.start()

    try:
        [w.join() for w in workers]
    except KeyboardInterrupt:
        [w.terminate() for w in workers]
        interrupted = True

    if not interrupted or args.save_on_interrupt:
        print("Finished training.")

        if args.save_models:

            path = os.path.join(ckpt_dir, 'model-final.pth')

            torch.save(
                {
                    'model_state_dict': global_net.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'info': info
                }, path)

            print("Saved model to:", f"{ckpt_dir}/model-final")
Example #11
def _train(args):
    env = launch_env()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    observation_shape = (None, ) + env.observation_space.shape
    action_shape = (None, ) + env.action_space.shape

    # Create an imperfect demonstrator
    expert = PurePursuitExpert(env=env)

    observations = []
    actions = []

    # let's collect our samples
    for episode in range(0, args.episodes):
        print("Starting episode", episode)
        for steps in range(0, args.steps):
            # use our 'expert' to predict the next action.
            action = expert.predict(None)
            observation, reward, done, info = env.step(action)
            observations.append(observation)
            actions.append(action)
        env.reset()
    env.close()

    actions = np.array(actions)
    observations = np.array(observations)

    # model = Model(action_dim=2, max_action=1.)
    model = Generator(action_dim=2)
    # state_dict = torch.load('models/G_imitate_2.pt', map_location=device)
    # model.load_state_dict(state_dict)
    model.train().to(device)

    # weight_decay is L2 regularization, helps avoid overfitting
    optimizer = optim.SGD(model.parameters(), lr=0.0004, weight_decay=1e-3)

    avg_loss = 0
    for epoch in range(args.epochs):
        optimizer.zero_grad()

        batch_indices = np.random.randint(0, observations.shape[0],
                                          (args.batch_size))
        obs_batch = torch.from_numpy(
            observations[batch_indices]).float().to(device)
        act_batch = torch.from_numpy(actions[batch_indices]).float().to(device)

        model_actions = model(obs_batch)

        loss = (model_actions - act_batch).norm(2).mean()
        loss.backward()
        optimizer.step()

        loss = loss.data.item()
        avg_loss = avg_loss * 0.995 + loss * 0.005

        print('epoch %d, loss=%.3f' % (epoch, avg_loss))

        # Periodically save the trained model
        if epoch % 200 == 0:
            torch.save(model.state_dict(),
                       '{}/G_imitate.pt'.format(args.model_directory))