Example #1
File: test.py Project: nutorbit/SAC
def main(path):
    env = gym.make('CarRacing-v0')
    model = SAC(env.observation_space, env.action_space)

    actor, critic = torch.load(path)

    model.load_model(actor, critic)

    while True:
        obs = env.reset(random_position=False)
        done = False
        rews = []
        while not done:
            act = model.select_action(obs, evaluate=True)
            obs, rew, done, _ = env.step(act)
            rews.append(rew)
        print(np.sum(rews))
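
A minimal command-line entry point for the evaluation loop above could look like the sketch below. The gym/numpy/torch imports and the `from sac import SAC` path are assumptions based on the names used in the snippet; the actual module layout of nutorbit/SAC may differ.

import argparse

import gym
import numpy as np
import torch

from sac import SAC  # hypothetical import path for the project's SAC agent


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Evaluate a saved SAC agent on CarRacing-v0")
    parser.add_argument("path",
                        help="checkpoint created with torch.save((actor, critic), path)")
    cli_args = parser.parse_args()
    main(cli_args.path)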
Example #2
def main(args=None):
    if args is None:
        args = readParser()

    # Initial environment
    env = gym.make(args.env_name)

    job_name = 'MBPO_{}_{}_{}'.format(args.env_name, args.model_type,
                                      args.seed)
    writer = SummaryWriter("tensorboard/{}".format(job_name))
    writer.add_text(
        'hyperparameters', "|param|value|\n|-|-|\n%s" %
        ('\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()])))

    # Set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initial agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    else:
        env_model = construct_model(obs_dim=state_size,
                                    act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    # Predict environments
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    env_pool = ReplayMemory(args.replay_size)
    # Initial pool for model
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)

    # Sampler of environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool, writer)
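
main() depends on a readParser() helper that is not shown in this example. The argparse sketch below is a hypothetical reconstruction: the flag names mirror the attributes accessed above, but the default values are placeholders, and SAC(...) almost certainly reads further fields (learning rate, gamma, etc.) that are omitted here.

import argparse


def readParser():
    # Hypothetical reconstruction covering only the attributes used in main().
    parser = argparse.ArgumentParser(description='MBPO-style training arguments')
    parser.add_argument('--env_name', default='Hopper-v2')
    parser.add_argument('--seed', type=int, default=123456)
    parser.add_argument('--model_type', default='pytorch',
                        choices=['pytorch', 'tensorflow'])
    parser.add_argument('--num_networks', type=int, default=7)
    parser.add_argument('--num_elites', type=int, default=5)
    parser.add_argument('--reward_size', type=int, default=1)
    parser.add_argument('--pred_hidden_size', type=int, default=200)
    parser.add_argument('--use_decay', action='store_true')
    parser.add_argument('--replay_size', type=int, default=1000000)
    parser.add_argument('--rollout_batch_size', type=int, default=100000)
    parser.add_argument('--epoch_length', type=int, default=1000)
    parser.add_argument('--model_train_freq', type=int, default=250)
    parser.add_argument('--model_retain_epochs', type=int, default=1)
    return parser.parse_args()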
Example #3
def test_policy_dependent_models(args, env, state_size, action_size,
                                 env_sampler, writer):
    save_freq = args.save_model_freq
    checkpoint_epochs = np.arange(0, args.num_epoch, save_freq)
    model_policy_return_dict = {}
    for model_epoch in checkpoint_epochs:
        dynamics_model_checkpoint = torch.load(args.save_model_path +
                                               'EnsembleDynamicsModel_' +
                                               str(int(model_epoch)) + '.pt')
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
        env_model.ensemble_model.load_state_dict(
            dynamics_model_checkpoint['dynamics_model_state_dict'])
        for policy_epoch in checkpoint_epochs:
            policy_network_checkpoint = torch.load(args.save_policy_path +
                                                   'PolicyNetwork_' +
                                                   str(int(policy_epoch)) +
                                                   '.pt')
            agent = SAC(env.observation_space.shape[0], env.action_space, args)
            agent.policy.load_state_dict(
                policy_network_checkpoint['policy_model_state_dict'])
            avg_episode_reward = []
            for i in range(args.num_eval_episode):
                env_sampler.current_state = None
                sum_reward = 0
                done = False
                counter = 0
                while not done:
                    cur_state, action, next_state, reward, done, info = env_sampler.sample(
                        agent, eval_t=True)
                    sum_reward += reward
                    counter += 1
                logging.info(
                    'Policy epoch {} | DynamicsModel epoch {} | number of steps: {} | inner eval num: {} | sum reward: {}'
                    .format(policy_epoch, model_epoch, counter, i, sum_reward))
                avg_episode_reward.append(sum_reward)
                writer.add_scalar(
                    'returns/mean_eval_return_model_{}_policy_{}'.format(
                        model_epoch, policy_epoch), sum_reward, i)
            mean_episode_reward = torch.mean(torch.tensor(avg_episode_reward))
            std_episode_reward = torch.std(torch.tensor(avg_episode_reward))
            model_policy_return_dict['model_{}_policy_{}'.format(
                model_epoch, policy_epoch)] = [
                    mean_episode_reward.item(),
                    std_episode_reward.item()
                ]
    with open('test_policy_dependent_results_2/mean_std_evaluated_policy.json',
              'w') as f:
        json.dump(model_policy_return_dict, f)
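
The saved statistics can be read back for a quick comparison across checkpoints; this is a small sketch that only assumes the file layout written above (one [mean, std] pair per model/policy key).

import json

with open('test_policy_dependent_results_2/mean_std_evaluated_policy.json') as f:
    results = json.load(f)

# Print each model/policy combination with its mean return and spread.
for key, (mean_reward, std_reward) in sorted(results.items()):
    print('{}: {:.1f} +/- {:.1f}'.format(key, mean_reward, std_reward))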
Example #4
def main():
    logging.basicConfig(filename=time.strftime("%Y%m%d-%H%M%S") + '_train.log',
                        level=logging.INFO)

    args = readParser()

    # Initial environment
    env = gym.make(args.env_name)

    # Set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initial agents ensemble
    agents = []
    for _ in range(args.num_agents):
        agent = SAC(env.observation_space.shape[0], env.action_space, args)
        agents.append(agent)

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = Ensemble_Model(args.num_networks, args.num_elites,
                                   state_size, action_size, args.reward_size,
                                   args.pred_hidden_size)
    else:
        env_model = construct_model(obs_dim=state_size,
                                    act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    # Predict environments
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    env_pool = ModelReplayMemory(args.replay_size)
    # Initial pool for model
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ModelReplayMemory(new_pool_size)

    # Sampler of environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agents, env_pool, model_pool)
Example #5
def main(args=None):
    if args is None:
        args = readParser()

    # Initial environment
    env = gym.make(args.env_name)

    # Set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initial agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    else:
        env_model = construct_model(obs_dim=state_size,
                                    act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    # Predict environments
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    env_pool = ReplayMemory(args.replay_size)
    # Initial pool for model
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)

    # Sampler of environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool)
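
The model pool sizing used in these scripts is easier to follow with concrete numbers. The values below are illustrative assumptions (typical MBPO-style settings), not necessarily this repository's defaults.

# Illustrative numbers only; the real values come from readParser().
rollout_batch_size = 100000   # model rollouts started every model_train_freq env steps
epoch_length = 1000           # real env steps per epoch
model_train_freq = 250        # env steps between model retraining/rollout rounds
model_retain_epochs = 1       # how many epochs of model-generated data to keep

rollouts_per_epoch = rollout_batch_size * epoch_length / model_train_freq  # 400000.0
model_steps_per_epoch = int(1 * rollouts_per_epoch)                        # 400000
new_pool_size = model_retain_epochs * model_steps_per_epoch                # 400000
print(new_pool_size)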
Example #6
def train(
        seed: int = 69,
        batch_size: int = 256,
        num_steps: int = 5000000,
        updates_per_step: int = 1,
        start_steps: int = 100000,
        replay_size: int = 1000000,
        eval: bool = True,
        eval_interval: int = 50,
        accelerated_exploration: bool = True,
        save_models: bool = True,
        load_models: bool = True,
        save_memory: bool = True,
        load_memory: bool = False,
        path_to_actor: str = "./models/sac_actor_carracer_klein_6_24_18.pt",
        path_to_critic: str = "./models/sac_critic_carracer_klein_6_24_18.pt",
        path_to_buffer: str = "./memory/buffer_klein_6_24_18.pkl"):
    """
    ## The train function consists of:  
    
    - Setting up the environment, agent and replay buffer  
    - Logging hyperparameters and training results  
    - Loading previously saved actor and critic models  
    - Training loop  
    - Evaluation (every *eval_interval* episodes)  
    - Saving actor and critic models  
        
    ## Parameters:  
    
    - **seed** *(int)*: Seed value to generate random numbers.  
    - **batch_size** *(int)*: Number of samples that will be propagated through the Q, V, and policy network.  
    - **num_steps** *(int)*: Number of steps that the agent takes in the environment. Determines the training duration.   
    - **updates_per_step** *(int)*: Number of network parameter updates per step in the environment.  
    - **start_steps** *(int)*:  Number of steps for which a random action is sampled. After reaching *start_steps* an action
    according to the learned policy is chosen.
    - **replay_size** *(int)*: Size of the replay buffer.  
    - **eval** *(bool)*: If *True*, the trained policy is evaluated every *eval_interval* episodes.  
    - **eval_interval** *(int)*: Interval of episodes after which to evaluate the trained policy.  
    - **accelerated_exploration** *(bool)*: If *True*, an action with an acceleration bias is sampled.  
    - **save_memory** *(bool)*: If *True*, the experience replay buffer is saved to the hard drive.  
    - **save_models** *(bool)*: If *True*, actor and critic models are saved to the hard drive.  
    - **load_models** *(bool)*: If *True*, actor and critic models are loaded from *path_to_actor* and *path_to_critic*.  
    - **load_memory** *(bool)*: If *True*, the replay buffer is loaded from *path_to_buffer*.  
    - **path_to_actor** *(str)*: Path to the actor model.  
    - **path_to_critic** *(str)*: Path to the critic model.  
    - **path_to_buffer** *(str)*: Path to the saved replay buffer.  
    
    """
    # Environment
    env = gym.make("CarRacing-v0")
    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # NOTE: ALWAYS CHECK PARAMETERS BEFORE TRAINING
    agent = SAC(env.action_space,
                policy="Gaussian",
                gamma=0.99,
                tau=0.005,
                lr=0.0003,
                alpha=0.2,
                automatic_temperature_tuning=True,
                batch_size=batch_size,
                hidden_size=512,
                target_update_interval=2,
                input_dim=32)

    # Memory
    memory = ReplayMemory(replay_size)
    if load_memory:
        # load memory and deactivate random exploration
        memory.load(path_to_buffer)

    if load_memory or load_models:
        start_steps = 0

    # Training Loop
    total_numsteps = 0
    updates = 0

    # Log Settings and training results
    date = datetime.now()
    log_dir = Path(f"runs/{date.year}_SAC_{date.month}_{date.day}_{date.hour}")

    writer = SummaryWriter(log_dir=log_dir)

    settings_msg = (
        f"Training SAC for {num_steps} steps"
        "\n\nTRAINING SETTINGS:\n"
        f"Seed={seed}, Batch size: {batch_size}, Updates per step: {updates_per_step}\n"
        f"Accelerated exploration: {accelerated_exploration}, Start steps: {start_steps}, Replay size: {replay_size}"
        "\n\nALGORITHM SETTINGS:\n"
        f"Policy: {agent.policy_type}, Automatic temperature tuning: {agent.automatic_temperature_tuning}\n"
        f"Gamma: {agent.gamma}, Tau: {agent.tau}, Alpha: {agent.alpha}, LR: {agent.lr}\n"
        f"Target update interval: {agent.target_update_interval}, Latent dim: {agent.input_dim}, Hidden size: {agent.hidden_size}"
    )
    with open(log_dir / "settings.txt", "w") as file:
        file.write(settings_msg)

    if load_models:
        try:
            agent.load_model(path_to_actor, path_to_critic)
        except FileNotFoundError:
            warnings.warn(
                "Couldn't locate models in the specified paths. Training from scratch.",
                RuntimeWarning)

    for i_episode in itertools.count(1):
        episode_reward = 0
        episode_steps = 0
        done = False
        state = env.reset()
        state = process_observation(state)
        state = encoder.sample(state)
        # choose random starting position for the car
        position = np.random.randint(len(env.track))
        env.car = Car(env.world, *env.track[position][1:4])

        if accelerated_exploration:
            # choose random starting position for the car
            # position = np.random.randint(len(env.track))
            # env.car = Car(env.world, *env.track[position][1:4])
            # Sample random action
            action = env.action_space.sample()

        while not done:
            if total_numsteps < start_steps and not load_models:
                # sample action with acceleration bias if accelerated_exploration is True
                if accelerated_exploration:
                    action = generate_action(action)
                else:
                    action = env.action_space.sample()
            else:
                action = agent.select_action(state)

            if len(memory) > batch_size:
                # Number of updates per step in environment
                for _ in range(updates_per_step):
                    # Update parameters of all the networks
                    critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(
                        memory, batch_size, updates)
                    writer.add_scalar('loss/critic_1', critic_1_loss, updates)
                    writer.add_scalar('loss/critic_2', critic_2_loss, updates)
                    writer.add_scalar('loss/policy', policy_loss, updates)
                    writer.add_scalar('loss/entropy_loss', ent_loss, updates)
                    writer.add_scalar('entropy_temperature/alpha', alpha,
                                      updates)
                    updates += 1

            next_state, reward, done, _ = env.step(action)  # Step
            next_state = process_observation(next_state)
            next_state = encoder.sample(next_state)
            episode_steps += 1
            total_numsteps += 1
            episode_reward += reward

            # Ignore the "done" signal if it comes from hitting the time horizon.
            # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
            mask = 1 if episode_steps == env._max_episode_steps else float(
                not done)

            memory.push(state, action, reward, next_state,
                        mask)  # Append transition to memory

            state = next_state

        if total_numsteps > num_steps:
            break

        writer.add_scalar('reward/train', episode_reward, i_episode)

        print(
            f"Episode: {i_episode}, total numsteps: {total_numsteps}, episode steps: {episode_steps}, reward: {round(episode_reward, 2)}"
        )

        if i_episode % eval_interval == 0 and eval:
            avg_reward = 0.
            episodes = 10

            if save_models:
                agent.save_model(
                    "carracer",
                    f"{getuser()}_{date.month}_{date.day}_{date.hour}")

            for _ in range(episodes):
                state = env.reset()
                state = process_observation(state)
                state = encoder.sample(state)

                episode_reward = 0
                done = False
                while not done:
                    action = agent.select_action(state, eval=True)

                    next_state, reward, done, _ = env.step(action)
                    next_state = process_observation(next_state)
                    next_state = encoder.sample(next_state)
                    episode_reward += reward

                    state = next_state
                avg_reward += episode_reward
            avg_reward /= episodes

            if save_models:
                agent.save_model(
                    "carracer",
                    f"{getuser()}_{date.month}_{date.day}_{date.hour}")
            if save_memory:
                memory.save(
                    f"buffer_{getuser()}_{date.month}_{date.day}_{date.hour}")

            writer.add_scalar("avg_reward/test", avg_reward, i_episode)

            print("-" * 40)
            print(
                f"Test Episodes: {episodes}, Avg. Reward: {round(avg_reward, 2)}"
            )
            print("-" * 40)

    env.close()
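
A short, explicit call of the function above; the reduced num_steps and the disabled loading/saving flags are illustrative choices for a quick smoke run, not recommended training settings.

if __name__ == "__main__":
    # Quick smoke run: train from scratch, keep nothing on disk.
    train(seed=42,
          num_steps=50000,
          load_models=False,
          load_memory=False,
          save_models=False,
          save_memory=False)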
Example #7
def main(args=None):
    if args is None:
        args = readParser()

    save_model_dir = os.path.join(args.save_dir, args.env_name,
                                  'dynamics_model')
    save_policy_dir = os.path.join(args.save_dir, args.env_name,
                                   'policy_network')
    save_env_buffer_dir = os.path.join(args.save_dir, args.env_name,
                                       'env_buffer')
    save_dynamics_buffer_dir = os.path.join(args.save_dir, args.env_name,
                                            'dynamics_buffer')
    if not os.path.exists(save_model_dir):
        os.makedirs(save_model_dir)
    if not os.path.exists(save_policy_dir):
        os.makedirs(save_policy_dir)
    if not os.path.exists(save_env_buffer_dir):
        os.makedirs(save_env_buffer_dir)
    if not os.path.exists(save_dynamics_buffer_dir):
        os.makedirs(save_dynamics_buffer_dir)

    # Initial environment
    if 'Ant' in args.env_name:
        args.env_name = new_env.register_mbpo_environments()[0]
        print('Loaded TruncatedObs-version of the Ant environment: {}'.format(
            args.env_name))
    # else:
    #     env_name = args.env_name
    env = gym.make(args.env_name)

    job_name = 'MBPO_test_policy_dependent_models_{}_{}_{}'.format(
        args.env_name, args.model_type, args.seed)
    writer = SummaryWriter(
        str(os.path.join(args.save_dir, 'tensorboard', job_name)))
    writer.add_text(
        'hyperparameters', "|param|value|\n|-|-|\n%s" %
        ('\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()])))

    # Set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initial agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    else:
        env_model = construct_model(obs_dim=state_size,
                                    act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    # Predict environments
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    env_pool = ReplayMemory(args.replay_size)
    # Initial pool for model
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)

    # Sampler of environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool, writer,
          save_model_dir, save_policy_dir, save_env_buffer_dir,
          save_dynamics_buffer_dir)

    print('Training complete!')
    print(
        '---------------------------------------------------------------------'
    )
    print(
        'Start evaluating different policies at different model checkpoints...'
    )
    print(
        '---------------------------------------------------------------------'
    )
    test_policy_dependent_models(args, env, state_size, action_size,
                                 args.save_model_freq,
                                 args.save_model_freq * 6, save_model_dir,
                                 save_policy_dir)
Example #8
def test_policy_dependent_models(args, env, state_size, action_size,
                                 start_eval, end_eval, save_model_dir,
                                 save_policy_dir):
    save_freq = args.save_model_freq
    checkpoint_epochs = np.arange(start_eval, end_eval, save_freq)
    # checkpoint_epochs = np.arange(20, 40, 2)
    # checkpoint_epochs = [20, 26, 32, 38]
    # checkpoint_epochs = np.append(checkpoint_epochs, args.num_epoch-1)
    model_policy_return_dict = {}
    state_error_dict = {}
    reward_error_dict = {}
    with open(
            os.path.join(
                args.save_dir,
                'scaler_mu_std_{}.pkl'.format(str(int(args.num_epoch - 1)))),
            'rb') as f:
        mean, std = pickle.load(f)
    for model_epoch in checkpoint_epochs:
        dynamics_model_checkpoint = torch.load(
            str(
                os.path.join(
                    save_model_dir,
                    'EnsembleDynamicsModel_{}.pt'.format(model_epoch))))
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
        env_model.ensemble_model.load_state_dict(
            dynamics_model_checkpoint['dynamics_model_state_dict'])
        env_model.scaler.mu = mean
        env_model.scaler.std = std
        print('dynamics_model_{} loaded'.format(model_epoch))
        predict_env = PredictEnv(env_model, args.env_name, args.model_type)
        predict_env_sampler = Predict_EnvSample(env, predict_env)
        for policy_epoch in checkpoint_epochs:
            policy_network_checkpoint = torch.load(
                str(
                    os.path.join(
                        save_policy_dir,
                        'PolicyNetwork_{}.pt'.format(policy_epoch))))
            agent = SAC(env.observation_space.shape[0], env.action_space, args)
            agent.policy.load_state_dict(
                policy_network_checkpoint['policy_model_state_dict'])
            avg_episode_reward = []
            for i in range(args.num_eval_episode):
                predict_env_sampler.current_state = None
                sum_reward = 0
                done = False
                counter = 0
                state_error = []
                reward_error = []
                while not done and counter < args.epoch_length:
                    cur_state, action, next_state, reward, done, info, model_error = predict_env_sampler.sample(
                        agent, eval_t=True, ret_true_reward=False)
                    sum_reward += reward
                    counter += 1
                    state_error.append(model_error[0])
                    reward_error.append(model_error[1])
                # logging.info('Policy epoch{} | DynamicsModel epoch{} | number of steps: {} | inner eval num: {} | sum reward: {} | model_error: {}'.format(policy_epoch, model_epoch, counter, i, sum_reward, np.sum(model_error_list)))
                avg_episode_reward.append(sum_reward)
                # writer.add_scalar('returns/mean_eval_return_model_{}_policy_{}'.format(model_epoch, policy_epoch), sum_reward, i)
            mean_episode_reward = torch.mean(
                torch.tensor(avg_episode_reward) * 1.)
            std_episode_reward = torch.std(
                torch.tensor(avg_episode_reward) * 1.)
            model_policy_return_dict['model_{}_policy_{}'.format(
                model_epoch, policy_epoch)] = [
                    mean_episode_reward.item(),
                    std_episode_reward.item()
                ]
            state_error_dict['model_{}_policy_{}'.format(
                model_epoch, policy_epoch)] = state_error
            reward_error_dict['model_{}_policy_{}'.format(
                model_epoch, policy_epoch)] = reward_error
            print(
                'model epoch: {} | policy epoch: {} | mean return: {:.3f} | state error: {:.2f} | reward error: {:.2f} | total steps: {} | Done'
                .format(model_epoch, policy_epoch, mean_episode_reward,
                        np.mean(state_error), np.mean(reward_error), counter))
    with open(
            str(
                os.path.join(
                    args.save_dir, args.env_name,
                    'model_policy_return_dict_{}_{}_{}.json'.format(
                        start_eval, save_freq, end_eval))), 'w') as f:
        json.dump(model_policy_return_dict, f)
    with open(
            str(
                os.path.join(
                    args.save_dir, args.env_name,
                    'state_error_dict_{}_{}_{}.json'.format(
                        start_eval, save_freq, end_eval))), 'w') as f:
        json.dump(state_error_dict, f)
    with open(
            str(
                os.path.join(
                    args.save_dir, args.env_name,
                    'reward_error_dict_{}_{}_{}.json'.format(
                        start_eval, save_freq, end_eval))), 'w') as f:
        json.dump(
            {
                k: np.array(v).astype(np.float64).tolist()
                for k, v in reward_error_dict.items()
            }, f)
Example #9
def test():
    # Environment
    # env = NormalizedActions(gym.make(args.env_name))
    #env = gym.make(args.env_name)
    args = get_args()
    args.eval = True
    set_env_arg(t_type=args.t_type,
                n_type=args.n_type,
                r_type=args.r_type,
                proj=str_to_bool(args.proj),
                cam_r_noise=str_to_bool(args.cam_r_noise),
                cam_t_noise=str_to_bool(args.cam_t_noise),
                cam_in_noise=str_to_bool(args.cam_in_noise),
                test=str_to_bool(args.test))

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    # env.seed(args.seed)

    # Agent
    agent = SAC(env.state_dim, env.action_space, args)
    agent.load_model('models/sac_actor_crane_', 'models/sac_critic_crane_')

    # TensorboardX
    writer = SummaryWriter(logdir='runs/{}_SAC_{}_{}_{}'.format(
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
        args.policy, "autotune" if args.automatic_entropy_tuning else ""))

    # Memory
    memory = ReplayMemory(args.replay_size)

    # Training Loop
    total_numsteps = 0
    updates = 0

    for ep in range(MAX_EP_STEPS):
        state, gt = env.reset()
        episode_reward = 0

        for t in range(MAX_STEP):
            # while True:
            env.render()

            action = agent.select_action(state)

            next_state, reward, done, _ = env.step(action)  # Step
            if done:
                mask = 1
            else:
                mask = 0
            memory.push(state, action, reward, next_state,
                        mask)  # Append transition to memory
            """# store experience
                    trans = np.hstack((s, a, [r], s_))
                    outfile = exp_path + '/' + str(ep) + '_' + str(t)
                    np.save(outfile, trans)
                    """

            state = next_state

            episode_reward += reward

            if t == MAX_STEP - 1 or done:

                # if done:
                result = '| done' if done else '| ----'
                print(
                    'Ep:',
                    ep,
                    result,
                    '| R: %i' % int(episode_reward),
                    '| Explore: %.2f' % var,
                )

                out_s = 'Ep: ' + str(ep) + ' result: ' + str(done) + \
                    " R: " + str(episode_reward) + " Explore " + str(var) + " \n"
                """
Example #10
                    help='Value target update per no. of updates per step (default: 1)')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                    help='size of replay buffer (default: 1000000)')
parser.add_argument('--cuda', action="store_true",
                    help='run on CUDA (default: False)')
args = parser.parse_args()

# Environment
# env = NormalizedActions(gym.make(args.env_name))
env = gym.make(args.env_name)
torch.manual_seed(args.seed)
np.random.seed(args.seed)
env.seed(args.seed)

# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, args)

# TensorboardX
writer = SummaryWriter(logdir='runs/{}_SAC_{}_{}_{}'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
                                                             args.policy, "autotune" if args.automatic_entropy_tuning else ""))

# Memory
memory = ReplayMemory(args.replay_size)

# Training Loop
total_numsteps = 0
updates = 0

for i_episode in itertools.count(1):
    episode_reward = 0
    episode_steps = 0
Example #11
def main(args=None):
    if args is None:
        args = readParser()

    # if not os.path.exists(args.save_model_path):
    #     os.makedirs(args.save_model_path)
    # if not os.path.exists(args.save_policy_path):
    #     os.makedirs(args.save_policy_path)

    # Initial environment
    env = gym.make(args.env_name)

    # job_name = 'MBPO_test_policy_dependent_models_{}_{}_{}'.format(args.env_name, args.model_type, args.seed)
    # writer = SummaryWriter("test_policy_dependent_results_2/tensorboard/{}".format(job_name))
    # writer.add_text('hyperparameters', "|param|value|\n|-|-|\n%s" % (
    #     '\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()])))

    # Set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initial agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)
    policy_network_checkpoint = torch.load(
        './test_policy_dependent_results_2/policy/PolicyNetwork_20.pt')
    agent.policy.load_state_dict(
        policy_network_checkpoint['policy_model_state_dict'])

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    else:
        env_model = construct_model(obs_dim=state_size,
                                    act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)
    dynamics_model_checkpoint = torch.load(
        './test_policy_dependent_results_2/dynamics_model/EnsembleDynamicsModel_20.pt'
    )
    env_model.ensemble_model.load_state_dict(
        dynamics_model_checkpoint['dynamics_model_state_dict'])

    # Predict environments
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    env_pool = ReplayMemory(args.replay_size)
    env_pool.load(
        './test_policy_dependent_results_2/env_buffer/env_buffer_20.pkl')
    env_pool.position = len(env_pool.buffer)
    # env_pool.buffer = np.array(env_pool.buffer)[~np.where(np.array(env_pool.buffer)==None)[0]]
    # Initial pool for model
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)
    model_pool.load(
        './test_policy_dependent_results_2/model_buffer/model_buffer_20.pkl')
    model_pool.position = len(model_pool.buffer)
    # model_pool.buffer = np.array(model_pool.buffer)[~np.where(np.array(model_pool.buffer)==None)[0]]

    # Sampler of environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool)
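
This script (and several others above) assumes a ReplayMemory with push(), load(), a buffer list, and a position index. The class below is a minimal pickle-based sketch of that interface, written as an assumption from the calls in the examples rather than the repository's actual implementation.

import pickle
import random


class ReplayMemory:
    """Minimal ring-buffer replay memory with pickle-based persistence."""

    def __init__(self, capacity):
        self.capacity = int(capacity)
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, mask):
        # Grow until capacity, then overwrite the oldest entries.
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, mask)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, mask = map(list, zip(*batch))
        return state, action, reward, next_state, mask

    def save(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self.buffer, f)

    def load(self, path):
        with open(path, 'rb') as f:
            self.buffer = pickle.load(f)

    def __len__(self):
        return len(self.buffer)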
Example #12
def main():
    if sys.platform.startswith('win'):
        # Add the _win_handler function to the windows console's handler function list
        win32api.SetConsoleCtrlHandler(_win_handler, True)
    if os.path.exists(
            os.path.join(config_file.config['config_file'], 'config.yaml')):
        config = sth.load_config(config_file.config['config_file'])
    else:
        config = config_file.config
        print('Config file not found, using default config from config_file.config.')

    hyper_config = config['hyper parameters']
    train_config = config['train config']
    record_config = config['record config']

    basic_dir = record_config['basic_dir']
    last_name = record_config['project_name'] + '/' \
        + record_config['remark'] \
        + record_config['run_id']
    cp_dir = record_config['checkpoint_basic_dir'] + last_name
    cp_file = cp_dir + '/rb'
    log_dir = record_config['log_basic_dir'] + last_name
    excel_dir = record_config['excel_basic_dir'] + last_name
    config_dir = record_config['config_basic_dir'] + last_name
    sth.check_or_create(basic_dir, 'basic')
    sth.check_or_create(cp_dir, 'checkpoints')
    sth.check_or_create(log_dir, 'logs(summaries)')
    sth.check_or_create(excel_dir, 'excel')
    sth.check_or_create(config_dir, 'config')

    logger = create_logger(
        name='logger',
        console_level=logging.INFO,
        console_format='%(levelname)s : %(message)s',
        logger2file=record_config['logger2file'],
        file_name=os.path.join(log_dir, 'log.txt'),
        file_level=logging.WARNING,
        file_format=
        '%(lineno)d - %(asctime)s - %(module)s - %(funcName)s - %(levelname)s - %(message)s'
    )
    if train_config['train']:
        sth.save_config(config_dir, config)

    if train_config['unity_mode']:
        env = UnityEnvironment()
    else:
        env = UnityEnvironment(
            file_name=train_config['unity_file'],
            no_graphics=True if train_config['train'] else False,
            base_port=train_config['port'])
    brain_name = env.external_brain_names[0]
    brain = env.brains[brain_name]
    # set the memory use proportion of GPU
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # tf_config.gpu_options.per_process_gpu_memory_fraction = 0.5
    tf.reset_default_graph()
    graph = tf.Graph()
    with graph.as_default() as g:
        with tf.Session(graph=g, config=tf_config) as sess:
            logger.info('Algorithm: {0}'.format(
                train_config['algorithm'].name))
            if train_config['algorithm'] == config_file.algorithms.ppo_sep_ac:
                from ppo.ppo_base import PPO_SEP
                model = PPO_SEP(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                logger.info('PPO_SEP initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.ppo_com:
                from ppo.ppo_base import PPO_COM
                model = PPO_COM(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                logger.info('PPO_COM initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.sac:
                from sac.sac import SAC
                model = SAC(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                logger.info('SAC initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.sac_no_v:
                from sac.sac_no_v import SAC_NO_V
                model = SAC_NO_V(sess=sess,
                                 s_dim=brain.vector_observation_space_size,
                                 a_counts=brain.vector_action_space_size[0],
                                 hyper_config=hyper_config)
                logger.info('SAC_NO_V initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.ddpg:
                from ddpg.ddpg import DDPG
                model = DDPG(sess=sess,
                             s_dim=brain.vector_observation_space_size,
                             a_counts=brain.vector_action_space_size[0],
                             hyper_config=hyper_config)
                logger.info('DDPG initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.td3:
                from td3.td3 import TD3
                model = TD3(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                logger.info('TD3 initialize success.')
            recorder = Recorder(log_dir,
                                excel_dir,
                                record_config,
                                logger,
                                max_to_keep=5,
                                pad_step_number=True,
                                graph=g)
            episode = init_or_restore(cp_dir, sess, recorder, cp_file)
            try:
                if train_config['train']:
                    train_OnPolicy(
                        sess=sess,
                        env=env,
                        brain_name=brain_name,
                        begin_episode=episode,
                        model=model,
                        recorder=recorder,
                        cp_file=cp_file,
                        hyper_config=hyper_config,
                        train_config=train_config) if not train_config[
                            'use_replay_buffer'] else train_OffPolicy(
                                sess=sess,
                                env=env,
                                brain_name=brain_name,
                                begin_episode=episode,
                                model=model,
                                recorder=recorder,
                                cp_file=cp_file,
                                hyper_config=hyper_config,
                                train_config=train_config)
                    tf.train.write_graph(g,
                                         cp_dir,
                                         'raw_graph_def.pb',
                                         as_text=False)
                    export_model(cp_dir, g)
                else:
                    inference(env, brain_name, model, train_config)
            except Exception as e:
                logger.error(e)
            finally:
                env.close()
    recorder.close()
    sys.exit()
Example #13
def main(args=None):
    if args is None:
        args = readParser()

    if not os.path.exists(args.save_model_path):
        os.makedirs(args.save_model_path)
    if not os.path.exists(args.save_policy_path):
        os.makedirs(args.save_policy_path)

    # Initial environment
    env = gym.make(args.env_name)

    # job_name = 'MBPO_test_policy_dependent_models_{}_{}_{}'.format(args.env_name, args.model_type, args.seed)
    # writer = SummaryWriter("test_policy_dependent_results/tensorboard/{}".format(job_name))
    # writer.add_text('hyperparameters', "|param|value|\n|-|-|\n%s" % (
    #     '\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()])))

    # # Set random seed
    # torch.manual_seed(args.seed)
    # np.random.seed(args.seed)
    # env.seed(args.seed)

    # Initial agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    # else:
    #     env_model = construct_model(obs_dim=state_size, act_dim=action_size, hidden_dim=args.pred_hidden_size, num_networks=args.num_networks,
    #                                 num_elites=args.num_elites)

    # Predict environments
    # predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    # env_pool = ReplayMemory(args.replay_size)
    # # Initial pool for model
    # rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    # model_steps_per_epoch = int(1 * rollouts_per_epoch)
    # new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    # model_pool = ReplayMemory(new_pool_size)

    # Sampler of environment
    env_sampler = EnvSampler(env)

    # train(args, env_sampler, predict_env, agent, env_pool, model_pool, writer)

    print('Training complete!')
    print(
        '---------------------------------------------------------------------'
    )
    print(
        'Start evaluating different policies at different model checkpoints...'
    )
    print(
        '---------------------------------------------------------------------'
    )
    test_policy_dependent_models(args, env, state_size, action_size,
                                 env_sampler)
Example #14
    env.close()


env = gym.make('LunarLanderContinuous-v2')

torch.manual_seed(1)
np.random.seed(1)

s_size = env.observation_space.shape[0]
a_size = env.action_space.shape[0]

print("State Size:", s_size)
print("Action Size:", a_size)

sac = SAC(in_dim=s_size, out_dim=a_size, p_alpha=1e-3, q_alpha=1e-3)
reward = train(env,
               sac,
               epochs=300,
               episodes=1,
               steps=100,
               render=False,
               graphing=True,
               run=False)

run(sac, env, episodes=3, steps=100)
"""
plt.plot(reward/np.max(reward), label="Reward")
plt.plot(np.array(sac.q_loss)/np.max(sac.q_loss), label="Q loss")
plt.plot(np.array(sac.p_loss)/np.max(sac.p_loss), label="P loss")
plt.legend()
Example #15
def main():
    if sys.platform.startswith('win'):
        win32api.SetConsoleCtrlHandler(_win_handler, True)

    if train_config['unity_mode']:
        env = UnityEnvironment()
    else:
        env = UnityEnvironment(
            file_name=train_config['unity_file'],
            no_graphics=True if train_config['train'] else False,
            base_port=train_config['port'])
    brain_name = env.external_brain_names[0]
    brain = env.brains[brain_name]
    # set the memory use proportion of GPU
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # tf_config.gpu_options.per_process_gpu_memory_fraction = 0.5
    tf.reset_default_graph()
    graph = tf.Graph()
    with graph.as_default() as g:
        with tf.Session(graph=g, config=tf_config) as sess:
            print('Algorithm: {0}'.format(train_config['algorithm'].name))
            if train_config['algorithm'] == algorithms.ppo_sep_ac:
                from ppo.ppo_base import PPO_SEP
                model = PPO_SEP(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                print('PPO_SEP initialize success.')
            elif train_config['algorithm'] == algorithms.ppo_com:
                from ppo.ppo_base import PPO_COM
                model = PPO_COM(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                print('PPO_COM initialize success.')
            elif train_config['algorithm'] == algorithms.sac:
                from sac.sac import SAC
                model = SAC(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                print('SAC initialize success.')
            elif train_config['algorithm'] == algorithms.sac_no_v:
                from sac.sac_no_v import SAC_NO_V
                model = SAC_NO_V(sess=sess,
                                 s_dim=brain.vector_observation_space_size,
                                 a_counts=brain.vector_action_space_size[0],
                                 hyper_config=hyper_config)
                print('SAC_NO_V initialize success.')
            elif train_config['algorithm'] == algorithms.ddpg:
                from ddpg.ddpg import DDPG
                model = DDPG(sess=sess,
                             s_dim=brain.vector_observation_space_size,
                             a_counts=brain.vector_action_space_size[0],
                             hyper_config=hyper_config)
                print('DDPG initialize success.')
            elif train_config['algorithm'] == algorithms.td3:
                from td3.td3 import TD3
                model = TD3(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                print('TD3 initialize success.')
            sess.run(tf.global_variables_initializer())
            try:
                if train_config['train']:
                    train_OnPolicy(
                        sess=sess,
                        env=env,
                        brain_name=brain_name,
                        begin_episode=0,
                        model=model,
                        hyper_config=hyper_config,
                        train_config=train_config) if not train_config[
                            'use_replay_buffer'] else train_OffPolicy(
                                sess=sess,
                                env=env,
                                brain_name=brain_name,
                                begin_episode=0,
                                model=model,
                                hyper_config=hyper_config,
                                train_config=train_config)
                else:
                    inference(env, brain_name, model, train_config)
            except Exception as e:
                print(e)
            finally:
                env.close()
    sys.exit()
Example #16
def train_SAC(env_name,
              exp_name,
              seed,
              logdir,
              two_qf=False,
              reparam=False,
              nepochs=100,
              paras={}):
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
        'Toddler': 0.05,
        'Adult': 0.05,
        'LunarLander': 0.1
    }.get(env_name, 0.2)

    algorithm_params = {
        'alpha': alpha,
        'batch_size': 256,
        'discount': 0.99,
        'learning_rate': 1e-3,
        'reparameterize': reparam,
        'tau': 0.01,
        'epoch_length': 1000,
        'n_epochs': nepochs,  # 500
        'two_qf': two_qf,
    }
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': 1000,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }

    value_function_params = {
        'hidden_layer_sizes': (128, 128),
    }

    q_function_params = {
        'hidden_layer_sizes': (128, 128),
    }

    policy_params = {
        'hidden_layer_sizes': (128, 128),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params
    }
    logz.save_params(params)

    if env_name == 'Toddler' or env_name == 'Adult':
        env = CustomHumanoidEnv(template=env_name)
    elif env_name == 'LunarLander':
        env = LunarLanderContinuous(**paras)
    else:
        env = gym.envs.make(env_name)

    # Observation and action sizes
    ac_dim = env.action_space.n \
        if isinstance(env.action_space, gym.spaces.Discrete) \
        else env.action_space.shape[0]

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    q_function = nn.QFunction(name='q_function', **q_function_params)
    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
    else:
        q_function2 = None
    value_function = nn.ValueFunction(name='value_function',
                                      **value_function_params)
    target_value_function = nn.ValueFunction(name='target_value_function',
                                             **value_function_params)
    policy = nn.GaussianPolicy(
        action_dim=ac_dim,
        reparameterize=algorithm_params['reparameterize'],
        **policy_params)

    samplers = []
    replay_pools = []

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=(ac_dim, ),
        **replay_pool_params)
    sampler.initialize(env, policy, replay_pool)
    samplers.append(sampler)
    replay_pools.append(replay_pool)

    algorithm = SAC(**algorithm_params)

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True  # may need if using GPU
    with tf.Session(config=tf_config):
        algorithm.build(env=env,
                        policy=policy,
                        q_function=q_function,
                        q_function2=q_function2,
                        value_function=value_function,
                        target_value_function=target_value_function)
        # algorithm_params.get('n_epochs', 1000)
        for epoch in algorithm.train(sampler,
                                     n_epochs=algorithm_params.get(
                                         'n_epochs', 100)):
            logz.log_tabular('Iteration', epoch)
            for k, v in algorithm.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in replay_pool.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in sampler.get_statistics().items():
                logz.log_tabular(k, v)
            logz.dump_tabular()
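
A possible way to launch train_SAC; the environment name, experiment name, and logdir layout are illustrative and not taken from the original script.

import os
import time

if __name__ == "__main__":
    env_name = 'HalfCheetah-v2'
    logdir = os.path.join('data', 'sac_{}_{}'.format(env_name,
                                                     time.strftime('%Y%m%d_%H%M%S')))
    train_SAC(env_name=env_name,
              exp_name='sac_two_qf_reparam',
              seed=1,
              logdir=logdir,
              two_qf=True,
              reparam=True,
              nepochs=100)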
Example #17
def train():
    # Environment
    # env = NormalizedActions(gym.make(args.env_name))
    #env = gym.make(args.env_name)
    global var
    args = get_args()
    set_env_arg(t_type=args.t_type,
                n_type=args.n_type,
                r_type=args.r_type,
                proj=str_to_bool(args.proj),
                cam_r_noise=str_to_bool(args.cam_r_noise),
                cam_t_noise=str_to_bool(args.cam_t_noise),
                cam_in_noise=str_to_bool(args.cam_in_noise),
                test=str_to_bool(args.test))

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    # env.seed(args.seed)

    # Agent
    agent = SAC(env.state_dim, env.action_space, args)
    agent.load_model('models/sac_actor_crane70_', 'models/sac_critic_crane70_')

    # TensorboardX
    writer = SummaryWriter(logdir='runs/{}_SAC_{}_{}_{}'.format(
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
        args.policy, "autotune" if args.automatic_entropy_tuning else ""))

    # Memory
    memory = ReplayMemory(args.replay_size)

    # Training Loop
    total_numsteps = 0
    updates = 0

    for ep in range(MAX_EP_STEPS):
        state, gt = env.reset()
        episode_reward = 0

        for t in range(MAX_STEP):
            # while True:
            env.render()

            # Added exploration noise
            if ep < sample_numsteps:
                print('sample')
                action = env.action_space.sample()  # Sample random action
            else:
                # Sample action from policy
                action = agent.select_action(state)

            # add randomness to action selection for exploration
            action = np.clip(np.random.normal(action, var), *ACTION_BOUND)
            next_state, reward, done, _ = env.step(action)  # Step
            if done:
                mask = 1
            else:
                mask = 0
            memory.push(state, action, reward, next_state,
                        mask)  # Append transition to memory
            """# store experience
                    trans = np.hstack((s, a, [r], s_))
                    outfile = exp_path + '/' + str(ep) + '_' + str(t)
                    np.save(outfile, trans)
                    """

            if len(memory) > sample_numsteps * MAX_STEP:
                # Number of updates per step in environment
                var = max([var * .9999, VAR_MIN])
                for i in range(1):
                    # Update parameters of all the networks
                    critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(
                        memory, 512, updates)

                    writer.add_scalar('loss/critic_1', critic_1_loss, updates)
                    writer.add_scalar('loss/critic_2', critic_2_loss, updates)
                    writer.add_scalar('loss/policy', policy_loss, updates)
                    writer.add_scalar('loss/entropy_loss', ent_loss, updates)
                    writer.add_scalar('entropy_temperature/alpha', alpha,
                                      updates)
                    updates += 1

            state = next_state

            episode_reward += reward

            if t == MAX_STEP - 1 or done:
                if len(memory) > sample_numsteps * MAX_STEP:
                    for i in range(10):
                        # Update parameters of all the networks
                        critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(
                            memory, 512, updates)

                        writer.add_scalar('loss/critic_1', critic_1_loss,
                                          updates)
                        writer.add_scalar('loss/critic_2', critic_2_loss,
                                          updates)
                        writer.add_scalar('loss/policy', policy_loss, updates)
                        writer.add_scalar('loss/entropy_loss', ent_loss,
                                          updates)
                        writer.add_scalar('entropy_temperature/alpha', alpha,
                                          updates)
                        updates += 1

                # if done:
                result = '| done' if done else '| ----'
                print(
                    'Ep:',
                    ep,
                    result,
                    '| R: %i' % int(episode_reward),
                    '| Explore: %.2f' % var,
                )

                out_s = 'Ep: ' + str(ep) + ' result: ' + str(done) + \
                    " R: " + str(episode_reward) + " Explore " + str(var) + " \n"
                break
                """
                    f = open(log_path, "a+")
                    f.write(out_s)
                    f.close()
                    """
            if ep % 10 == 0:
                agent.save_model(env_name='crane' + str(ep))

    agent.save_model(env_name='crane')
Example #18
def main(seed: int = 111,
         batch_size: int = 512,
         episodes: int = 100,
         path_to_actor: str = "models/sac_actor_carracer_klein_6_24_18.pt",
         path_to_critic: str = "models/sac_critic_carracer_klein_6_24_18.pt"):
    """
    Function for displaying a trained Soft Actor-Critic agent.  
    
    ## Parameters:  
    
    - **seed** *(int=111)*: RNG seed determining the generated tracks.  
    - **batch_size** *(int=512)*: Batch size needed for SAC algorithm initialization.  
    - **episodes** *(int=100)*: Number of episodes in the evaluation run.  
    - **path_to_actor** *(str)*: Path to saved SAC actor model.  
    - **path_to_critic** *(str)*: Path to saved SAC critic model.    
    """

    # Environment
    env = gym.make("CarRacing-v0")
    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Agent
    agent = SAC(env.action_space,
                policy="Gaussian",
                gamma=0.99,
                tau=0.005,
                lr=0.0003,
                alpha=0.2,
                automatic_temperature_tuning=False,
                batch_size=batch_size,
                hidden_size=256,
                target_update_interval=1,
                input_dim=32)

    # load models; raises an error if the paths are wrong
    agent.load_model(path_to_actor, path_to_critic)

    avg_reward = 0.
    rewards = []

    for i_episode in range(episodes):
        episode_reward = 0
        # Get initial observation
        state = env.reset()
        state = process_observation(state)
        state = encoder.sample(state)
        done = False
        for t in range(1000):
            # render the environment at each step
            env.render()
            # move the car using the policy actions
            action = agent.select_action(state, eval=True)
            state, reward, done, _ = env.step(action)
            state = process_observation(state)
            state = encoder.sample(state)
            episode_reward += reward

            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                break

        rewards.append(episode_reward)
        avg_reward += episode_reward
    np.save("rewards.npy", rewards)
    # Close the rendering window
    env.close()
    avg_reward /= episodes
    print("-" * 40)
    print(f"Test Episodes: {episodes}, Avg. Reward: {round(avg_reward, 2)}")
    print("-" * 40)