Example #1
def resize_model_pool(args, rollout_length, model_pool):
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(rollout_length * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch

    sample_all = model_pool.return_all()
    new_model_pool = ReplayMemory(new_pool_size)
    new_model_pool.push_batch(sample_all)

    return new_model_pool
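
For context, this pool resize is typically triggered when the scheduled model rollout length changes between epochs. Below is a minimal sketch of such a schedule and its use; the helper name set_rollout_length and the rollout_min_*/rollout_max_* arguments are illustrative assumptions, not part of the example above.

def set_rollout_length(args, epoch_step):
    # Linearly anneal the rollout length from rollout_min_length to
    # rollout_max_length between rollout_min_epoch and rollout_max_epoch
    # (all assumed to be attributes of args).
    rollout_length = min(
        max(
            args.rollout_min_length + (epoch_step - args.rollout_min_epoch) /
            (args.rollout_max_epoch - args.rollout_min_epoch) *
            (args.rollout_max_length - args.rollout_min_length),
            args.rollout_min_length),
        args.rollout_max_length)
    return int(rollout_length)

# Hypothetical use inside the training loop: resize the model pool whenever
# the scheduled rollout length changes.
# new_rollout_length = set_rollout_length(args, epoch_step)
# if new_rollout_length != rollout_length:
#     rollout_length = new_rollout_length
#     model_pool = resize_model_pool(args, rollout_length, model_pool)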
Example #2
def main(args=None):
    if args is None:
        args = readParser()

    # Initial environment
    env = gym.make(args.env_name)

    job_name = 'MBPO_{}_{}_{}'.format(args.env_name, args.model_type,
                                      args.seed)
    writer = SummaryWriter("tensorboard/{}".format(job_name))
    writer.add_text(
        'hyperparameters', "|param|value|\n|-|-|\n%s" %
        ('\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()])))

    # Set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initial agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    else:
        env_model = construct_model(obs_dim=state_size,
                                    act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    # Predict environments
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    env_pool = ReplayMemory(args.replay_size)
    # Initial pool for model
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)  # initial rollout length is 1
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)

    # Sampler of environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool, writer)
Example #3
def main(args=None):
    if args is None:
        args = readParser()

    # Initial environment
    env = gym.make(args.env_name)

    # Set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initial agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    else:
        env_model = construct_model(obs_dim=state_size,
                                    act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    # Predict environments
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    env_pool = ReplayMemory(args.replay_size)
    # Initial pool for model
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)

    # Sampler of environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool)
Example #4
def main():
    logging.basicConfig(filename=time.strftime("%Y%m%d-%H%M%S") + '_train.log',
                        level=logging.INFO)

    args = readParser()

    # Initial environment
    env = make_env(args.env_name)

    # Set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initial agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = Ensemble_Model(args.num_networks, args.num_elites,
                                   state_size, action_size, args.reward_size,
                                   args.pred_hidden_size)
    else:
        env_model = construct_model(obs_dim=state_size,
                                    act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    # Predict environments
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    env_pool = ReplayMemory(args.replay_size)
    # Initial pool for model
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)

    # Sampler of environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool)
Example #5
def train(
        seed: int = 69,
        batch_size: int = 256,
        num_steps: int = 5000000,
        updates_per_step: int = 1,
        start_steps: int = 100000,
        replay_size: int = 1000000,
        eval: bool = True,
        eval_interval: int = 50,
        accelerated_exploration: bool = True,
        save_models: bool = True,
        load_models: bool = True,
        save_memory: bool = True,
        load_memory: bool = False,
        path_to_actor: str = "./models/sac_actor_carracer_klein_6_24_18.pt",
        path_to_critic: str = "./models/sac_critic_carracer_klein_6_24_18.pt",
        path_to_buffer: str = "./memory/buffer_klein_6_24_18.pkl"):
    """
    ## The train function consists of:  
    
    - Setting up the environment, agent and replay buffer  
    - Logging hyperparameters and training results  
    - Loading previously saved actor and critic models  
    - Training loop  
    - Evaluation (every *eval_interval* episodes)  
    - Saving actor and critic models  
        
    ## Parameters:  
    
    - **seed** *(int)*: Seed value to generate random numbers.  
    - **batch_size** *(int)*: Number of samples that will be propagated through the Q, V, and policy network.  
    - **num_steps** *(int)*: Number of steps that the agent takes in the environment. Determines the training duration.   
    - **updates_per_step** *(int)*: Number of network parameter updates per step in the environment.  
    - **start_steps** *(int)*:  Number of steps for which a random action is sampled. After reaching *start_steps* an action
    according to the learned policy is chosen.
    - **replay_size** *(int)*: Size of the replay buffer.  
    - **eval** *(bool)*:  If *True* the trained policy is evaluated every *eval_interval* episodes.
    - **eval_interval** *(int)*: Interval of episodes after which to evaluate the trained policy.    
    - **accelerated_exploration** *(bool)*: If *True* an action with acceleration bias is sampled.  
    - **save_memory** *(bool)*: If *True* the experience replay buffer is saved to the hard drive.  
    - **save_models** *(bool)*: If *True* actor and critic models are saved to the hard drive.  
    - **load_models** *(bool)*: If *True* actor and critic models are loaded from *path_to_actor* and *path_to_critic*.  
    - **load_memory** *(bool)*: If *True* the experience replay buffer is loaded from *path_to_buffer* and random exploration is skipped.  
    - **path_to_actor** *(str)*: Path to the actor model.  
    - **path_to_critic** *(str)*: Path to the critic model.  
    - **path_to_buffer** *(str)*: Path to the saved replay buffer.  
    
    """
    # Environment
    env = gym.make("CarRacing-v0")
    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # NOTE: ALWAYS CHECK PARAMETERS BEFORE TRAINING
    agent = SAC(env.action_space,
                policy="Gaussian",
                gamma=0.99,
                tau=0.005,
                lr=0.0003,
                alpha=0.2,
                automatic_temperature_tuning=True,
                batch_size=batch_size,
                hidden_size=512,
                target_update_interval=2,
                input_dim=32)

    # Memory
    memory = ReplayMemory(replay_size)
    if load_memory:
        # load memory and deactivate random exploration
        memory.load(path_to_buffer)

    if load_memory or load_models:
        start_steps = 0

    # Training Loop
    total_numsteps = 0
    updates = 0

    # Log Settings and training results
    date = datetime.now()
    log_dir = Path(f"runs/{date.year}_SAC_{date.month}_{date.day}_{date.hour}")

    writer = SummaryWriter(log_dir=log_dir)

    settings_msg = (
        f"Training SAC for {num_steps} steps"
        "\n\nTRAINING SETTINGS:\n"
        f"Seed={seed}, Batch size: {batch_size}, Updates per step: {updates_per_step}\n"
        f"Accelerated exploration: {accelerated_exploration}, Start steps: {start_steps}, Replay size: {replay_size}"
        "\n\nALGORITHM SETTINGS:\n"
        f"Policy: {agent.policy_type}, Automatic temperature tuning: {agent.automatic_temperature_tuning}\n"
        f"Gamma: {agent.gamma}, Tau: {agent.tau}, Alpha: {agent.alpha}, LR: {agent.lr}\n"
        f"Target update interval: {agent.target_update_interval}, Latent dim: {agent.input_dim}, Hidden size: {agent.hidden_size}"
    )
    with open(log_dir / "settings.txt", "w") as file:
        file.write(settings_msg)

    if load_models:
        try:
            agent.load_model(path_to_actor, path_to_critic)
        except FileNotFoundError:
            warnings.warn(
                "Couldn't locate models in the specified paths. Training from scratch.",
                RuntimeWarning)

    for i_episode in itertools.count(1):
        episode_reward = 0
        episode_steps = 0
        done = False
        state = env.reset()
        state = process_observation(state)
        state = encoder.sample(state)
        # choose random starting position for the car
        position = np.random.randint(len(env.track))
        env.car = Car(env.world, *env.track[position][1:4])

        if accelerated_exploration:
            # choose random starting position for the car
            # position = np.random.randint(len(env.track))
            # env.car = Car(env.world, *env.track[position][1:4])
            # Sample random action
            action = env.action_space.sample()

        while not done:
            if total_numsteps < start_steps and not load_models:
                # sample action with acceleration bias if accelerated_action = True
                if accelerated_exploration:
                    action = generate_action(action)
                else:
                    action = env.action_space.sample()
            else:
                action = agent.select_action(state)

            if len(memory) > batch_size:
                # Number of updates per step in environment
                for _ in range(updates_per_step):
                    # Update parameters of all the networks
                    critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(
                        memory, batch_size, updates)
                    writer.add_scalar('loss/critic_1', critic_1_loss, updates)
                    writer.add_scalar('loss/critic_2', critic_2_loss, updates)
                    writer.add_scalar('loss/policy', policy_loss, updates)
                    writer.add_scalar('loss/entropy_loss', ent_loss, updates)
                    writer.add_scalar('entropy_temperature/alpha', alpha,
                                      updates)
                    updates += 1

            next_state, reward, done, _ = env.step(action)  # Step
            next_state = process_observation(next_state)
            next_state = encoder.sample(next_state)
            episode_steps += 1
            total_numsteps += 1
            episode_reward += reward

            # Ignore the "done" signal if it comes from hitting the time horizon.
            # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
            mask = 1 if episode_steps == env._max_episode_steps else float(
                not done)

            memory.push(state, action, reward, next_state,
                        mask)  # Append transition to memory

            state = next_state

        if total_numsteps > num_steps:
            break

        writer.add_scalar('reward/train', episode_reward, i_episode)

        print(
            f"Episode: {i_episode}, total numsteps: {total_numsteps}, episode steps: {episode_steps}, reward: {round(episode_reward, 2)}"
        )

        if i_episode % eval_interval == 0 and eval:
            avg_reward = 0.
            episodes = 10

            if save_models:
                agent.save_model(
                    "carracer",
                    f"{getuser()}_{date.month}_{date.day}_{date.hour}")

            for _ in range(episodes):
                state = env.reset()
                state = process_observation(state)
                state = encoder.sample(state)

                episode_reward = 0
                done = False
                while not done:
                    action = agent.select_action(state, eval=True)

                    next_state, reward, done, _ = env.step(action)
                    next_state = process_observation(next_state)
                    next_state = encoder.sample(next_state)
                    episode_reward += reward

                    state = next_state
                avg_reward += episode_reward
            avg_reward /= episodes

            if save_models:
                agent.save_model(
                    "carracer",
                    f"{getuser()}_{date.month}_{date.day}_{date.hour}")
            if save_memory:
                memory.save(
                    f"buffer_{getuser()}_{date.month}_{date.day}_{date.hour}")

            writer.add_scalar("avg_reward/test", avg_reward, i_episode)

            print("-" * 40)
            print(
                f"Test Episodes: {episodes}, Avg. Reward: {round(avg_reward, 2)}"
            )
            print("-" * 40)

    env.close()
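
For completeness, a minimal invocation sketch of the train function above; the __main__ guard and the overridden argument values are illustrative assumptions, not part of the original script.

if __name__ == "__main__":
    # Train from scratch: do not load saved models or a saved replay buffer,
    # and evaluate every 50 episodes (argument values chosen for illustration).
    train(seed=42,
          batch_size=256,
          num_steps=1_000_000,
          eval_interval=50,
          load_models=False,
          load_memory=False)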
Example #6
def main(args=None):
    if args is None:
        args = readParser()

    save_model_dir = os.path.join(args.save_dir, args.env_name,
                                  'dynamics_model')
    save_policy_dir = os.path.join(args.save_dir, args.env_name,
                                   'policy_network')
    save_env_buffer_dir = os.path.join(args.save_dir, args.env_name,
                                       'env_buffer')
    save_dynamics_buffer_dir = os.path.join(args.save_dir, args.env_name,
                                            'dynamics_buffer')
    if not os.path.exists(save_model_dir):
        os.makedirs(save_model_dir)
    if not os.path.exists(save_policy_dir):
        os.makedirs(save_policy_dir)
    if not os.path.exists(save_env_buffer_dir):
        os.makedirs(save_env_buffer_dir)
    if not os.path.exists(save_dynamics_buffer_dir):
        os.makedirs(save_dynamics_buffer_dir)

    # Initial environment
    if 'Ant' in args.env_name:
        args.env_name = new_env.register_mbpo_environments()[0]
        print('Loaded TruncatedObs-version of the Ant environment: {}'.format(
            args.env_name))
    # else:
    #     env_name = args.env_name
    env = gym.make(args.env_name)

    job_name = 'MBPO_test_policy_dependent_models_{}_{}_{}'.format(
        args.env_name, args.model_type, args.seed)
    writer = SummaryWriter(
        str(os.path.join(args.save_dir, 'tensorboard', job_name)))
    writer.add_text(
        'hyperparameters', "|param|value|\n|-|-|\n%s" %
        ('\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()])))

    # Set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initial agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    else:
        env_model = construct_model(obs_dim=state_size,
                                    act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    # Predict environments
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    env_pool = ReplayMemory(args.replay_size)
    # Initial pool for model
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)

    # Sampler of environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool, writer,
          save_model_dir, save_policy_dir, save_env_buffer_dir,
          save_dynamics_buffer_dir)

    print('Training complete!')
    print(
        '---------------------------------------------------------------------'
    )
    print(
        'Start evaluating different policies at different model checkpoints...'
    )
    print(
        '---------------------------------------------------------------------'
    )
    test_policy_dependent_models(args, env, state_size, action_size,
                                 args.save_model_freq,
                                 args.save_model_freq * 6, save_model_dir,
                                 save_policy_dir)
Example #7
o_actor = root + runs[player2] + models2[0]
o_critic = root + runs[player2] + models2[1]
o_target = root + runs[player2] + models2[2] if len(models2) == 3 else None

agent.load_model(actor, critic, target)
opponent.load_model(o_actor, o_critic, o_target)

basic = h_env.BasicOpponent(weak=False)

time_ = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
# Tensorboard
# writer = SummaryWriter(f"hockey-runs-defence/{time_}_batch_size-{args.batch_size}_gamma-{args.gamma}_tau-{args.tau}_lr-{args.lr}_alpha-{args.alpha}_tuning-{args.automatic_entropy_tuning}_hidden_size-{args.hidden_size}_updatesStep-{args.updates_per_step}_startSteps-{args.start_steps}_targetIntervall-{args.target_update_interval}_replaysize-{args.replay_size}")

# Memory
# memory = PrioritizedReplay(args.replay_size)
memory = ReplayMemory(args.replay_size, args.seed)

# Training Loop
total_numsteps = 0
updates = 0

o = env.reset()
# _ = env.render()

score_we = 0
score_they = 0

avg_reward = 0.
episodes = 1000
for i_episode in range(episodes):
    state = env.reset()
Example #8
def test():
    # Environment
    # env = NormalizedActions(gym.make(args.env_name))
    #env = gym.make(args.env_name)
    args = get_args()
    args.eval = True
    set_env_arg(t_type=args.t_type,
                n_type=args.n_type,
                r_type=args.r_type,
                proj=str_to_bool(args.proj),
                cam_r_noise=str_to_bool(args.cam_r_noise),
                cam_t_noise=str_to_bool(args.cam_t_noise),
                cam_in_noise=str_to_bool(args.cam_in_noise),
                test=str_to_bool(args.test))

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    # env.seed(args.seed)

    # Agent
    agent = SAC(env.state_dim, env.action_space, args)
    agent.load_model('models/sac_actor_crane_', 'models/sac_critic_crane_')

    # TensorboardX
    writer = SummaryWriter(logdir='runs/{}_SAC_{}_{}_{}'.format(
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
        args.policy, "autotune" if args.automatic_entropy_tuning else ""))

    # Memory
    memory = ReplayMemory(args.replay_size)

    # Training Loop
    total_numsteps = 0
    updates = 0

    for ep in range(MAX_EP_STEPS):
        state, gt = env.reset()
        episode_reward = 0

        for t in range(MAX_STEP):
            # while True:
            env.render()

            action = agent.select_action(state)

            next_state, reward, done, _ = env.step(action)  # Step
            if done:
                mask = 1
            else:
                mask = 0
            memory.push(state, action, reward, next_state,
                        mask)  # Append transition to memory
            """# store experience
                    trans = np.hstack((s, a, [r], s_))
                    outfile = exp_path + '/' + str(ep) + '_' + str(t)
                    np.save(outfile, trans)
                    """

            state = next_state

            episode_reward += reward

            if t == MAX_STEP - 1 or done:

                # if done:
                result = '| done' if done else '| ----'
                print(
                    'Ep:',
                    ep,
                    result,
                    '| R: %i' % int(episode_reward),
                    '| Explore: %.2f' % var,
                )

                out_s = 'Ep: ' + str(ep) + ' result: ' + str(done) + \
                    " R: " + str(episode_reward) + " Explore " + str(var) + " \n"
                """
Example #9
def train():
    # Environment
    # env = NormalizedActions(gym.make(args.env_name))
    #env = gym.make(args.env_name)
    global var
    args = get_args()
    set_env_arg(t_type=args.t_type,
                n_type=args.n_type,
                r_type=args.r_type,
                proj=str_to_bool(args.proj),
                cam_r_noise=str_to_bool(args.cam_r_noise),
                cam_t_noise=str_to_bool(args.cam_t_noise),
                cam_in_noise=str_to_bool(args.cam_in_noise),
                test=str_to_bool(args.test))

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    # env.seed(args.seed)

    # Agent
    agent = SAC(env.state_dim, env.action_space, args)
    agent.load_model('models/sac_actor_crane70_', 'models/sac_critic_crane70_')

    # TensorboardX
    writer = SummaryWriter(logdir='runs/{}_SAC_{}_{}_{}'.format(
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
        args.policy, "autotune" if args.automatic_entropy_tuning else ""))

    # Memory
    memory = ReplayMemory(args.replay_size)

    # Training Loop
    total_numsteps = 0
    updates = 0

    for ep in range(MAX_EP_STEPS):
        state, gt = env.reset()
        episode_reward = 0

        for t in range(MAX_STEP):
            # while True:
            env.render()

            # Added exploration noise
            if ep < sample_numsteps:
                print('sample')
                action = env.action_space.sample()  # Sample random action
            else:
                # Sample action from policy
                action = agent.select_action(state)

            # add randomness to action selection for exploration
            action = np.clip(np.random.normal(action, var), *ACTION_BOUND)
            next_state, reward, done, _ = env.step(action)  # Step
            if done:
                mask = 1
            else:
                mask = 0
            memory.push(state, action, reward, next_state,
                        mask)  # Append transition to memory
            """# store experience
                    trans = np.hstack((s, a, [r], s_))
                    outfile = exp_path + '/' + str(ep) + '_' + str(t)
                    np.save(outfile, trans)
                    """

            if len(memory) > sample_numsteps * MAX_STEP:
                # Number of updates per step in environment
                var = max([var * .9999, VAR_MIN])
                for i in range(1):
                    # Update parameters of all the networks
                    critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(
                        memory, 512, updates)

                    writer.add_scalar('loss/critic_1', critic_1_loss, updates)
                    writer.add_scalar('loss/critic_2', critic_2_loss, updates)
                    writer.add_scalar('loss/policy', policy_loss, updates)
                    writer.add_scalar('loss/entropy_loss', ent_loss, updates)
                    writer.add_scalar('entropy_temprature/alpha', alpha,
                                      updates)
                    updates += 1

            state = next_state

            episode_reward += reward

            if t == MAX_STEP - 1 or done:
                if len(memory) > sample_numsteps * MAX_STEP:
                    for i in range(10):
                        # Update parameters of all the networks
                        critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(
                            memory, 512, updates)

                        writer.add_scalar('loss/critic_1', critic_1_loss,
                                          updates)
                        writer.add_scalar('loss/critic_2', critic_2_loss,
                                          updates)
                        writer.add_scalar('loss/policy', policy_loss, updates)
                        writer.add_scalar('loss/entropy_loss', ent_loss,
                                          updates)
                        writer.add_scalar('entropy_temprature/alpha', alpha,
                                          updates)
                        updates += 1

                # if done:
                result = '| done' if done else '| ----'
                print(
                    'Ep:',
                    ep,
                    result,
                    '| R: %i' % int(episode_reward),
                    '| Explore: %.2f' % var,
                )

                out_s = 'Ep: ' + str(ep) + ' result: ' + str(done) + \
                    " R: " + str(episode_reward) + " Explore " + str(var) + " \n"
                break
                """
                    f = open(log_path, "a+")
                    f.write(out_s)
                    f.close()
                    """
            if ep % 10 == 0:
                agent.save_model(env_name='crane' + str(ep))

    agent.save_model(env_name='crane')
Example #10
def main(args=None):
    if args is None:
        args = readParser()

    # if not os.path.exists(args.save_model_path):
    #     os.makedirs(args.save_model_path)
    # if not os.path.exists(args.save_policy_path):
    #     os.makedirs(args.save_policy_path)

    # Initial environment
    env = gym.make(args.env_name)

    # job_name = 'MBPO_test_policy_dependent_models_{}_{}_{}'.format(args.env_name, args.model_type, args.seed)
    # writer = SummaryWriter("test_policy_dependent_results_2/tensorboard/{}".format(job_name))
    # writer.add_text('hyperparameters', "|param|value|\n|-|-|\n%s" % (
    #     '\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()])))

    # Set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initial agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)
    policy_network_checkpoint = torch.load(
        './test_policy_dependent_results_2/policy/PolicyNetwork_20.pt')
    agent.policy.load_state_dict(
        policy_network_checkpoint['policy_model_state_dict'])

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    else:
        env_model = construct_model(obs_dim=state_size,
                                    act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)
    dynamics_model_checkpoint = torch.load(
        './test_policy_dependent_results_2/dynamics_model/EnsembleDynamicsModel_20.pt'
    )
    env_model.ensemble_model.load_state_dict(
        dynamics_model_checkpoint['dynamics_model_state_dict'])

    # Predict environments
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    env_pool = ReplayMemory(args.replay_size)
    env_pool.load(
        './test_policy_dependent_results_2/env_buffer/env_buffer_20.pkl')
    env_pool.position = len(env_pool.buffer)
    # env_pool.buffer = np.array(env_pool.buffer)[~np.where(np.array(env_pool.buffer)==None)[0]]
    # Initial pool for model
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)
    model_pool.load(
        './test_policy_dependent_results_2/model_buffer/model_buffer_20.pkl')
    model_pool.position = len(model_pool.buffer)
    # model_pool.buffer = np.array(model_pool.buffer)[~np.where(np.array(model_pool.buffer)==None)[0]]

    # Sampler of environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool)
Example #11
# Environment
# env = NormalizedActions(gym.make(args.env_name))
env = gym.make(args.env_name)
torch.manual_seed(args.seed)
np.random.seed(args.seed)
env.seed(args.seed)

# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, args)

# TensorboardX
writer = SummaryWriter(logdir='runs/{}_SAC_{}_{}_{}'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
                                                             args.policy, "autotune" if args.automatic_entropy_tuning else ""))

# Memory
memory = ReplayMemory(args.replay_size)

# Training Loop
total_numsteps = 0
updates = 0

for i_episode in itertools.count(1):
    episode_reward = 0
    episode_steps = 0
    done = False
    state = env.reset()

    while not done:
        if args.start_steps > total_numsteps:
            action = env.action_space.sample()  # Sample random action
        else: