Example no. 1
def train_test_agents(mode, env, real_env, config):
    rewards = []
    episode_lengths = []

    # settings for comparability
    config['agents']['td3']['test_episodes'] = 1
    config['agents']['td3']['train_episodes'] = 500

    config['agents']['td3']['activation_fn'] = "relu"
    config['agents']['td3']['lr'] = 0.001
    config['agents']['td3']['tau'] = 0.005
    config['agents']['td3']['same_action_num'] = 2
    config['agents']['td3']['policy_delay'] = 2
    config['agents']['td3']['rb_size'] = 10000000
    # config['agents']['td3']['policy_std_clip'] = 0.9

    config['agents']['td3']['print_rate'] = 100

    for i in range(MODEL_AGENTS):
        if mode == '-1':
            agent = select_agent(config=config, agent_name='td3_icm')
        else:
            agent = select_agent(config=config, agent_name='td3')
        reward, episode_length, _ = agent.train(env=env, test_env=real_env)
        print('reward: ' + str(reward))
        rewards.append(reward)
        episode_lengths.append(episode_length)
    return rewards, episode_lengths
Example no. 2
def train_test_agents(mode, env, real_env, config):
    rewards = []
    episode_lengths = []

    # settings for comparability
    config['agents']['duelingddqn'] = {}
    config['agents']['duelingddqn']['test_episodes'] = 1
    config['agents']['duelingddqn']['train_episodes'] = 1000
    config['agents']['duelingddqn']['print_rate'] = 100

    config['agents']['duelingddqn']['lr'] = 0.00025
    config['agents']['duelingddqn']['eps_init'] = 1.0
    config['agents']['duelingddqn']['eps_min'] = 0.1
    config['agents']['duelingddqn']['eps_decay'] = 0.9  # original DDQN paper uses linear decay over 1M frames
    config['agents']['duelingddqn']['gamma'] = 0.99
    config['agents']['duelingddqn']['batch_size'] = 32
    config['agents']['duelingddqn']['same_action_num'] = 1
    config['agents']['duelingddqn']['activation_fn'] = "relu"
    config['agents']['duelingddqn']['tau'] = 0.01  # original DDQN paper has hard update every N steps
    config['agents']['duelingddqn']['hidden_size'] = 64
    config['agents']['duelingddqn']['hidden_layer'] = 1
    config['agents']['duelingddqn']['rb_size'] = 1000000
    config['agents']['duelingddqn']['init_episodes'] = 1
    config['agents']['duelingddqn']['feature_dim'] = 128

    config['agents']['duelingddqn']['early_out_num'] = 10
    config['agents']['duelingddqn']['early_out_virtual_diff'] = 0.02

    # optimized ICM HPs:
    config['agents']['icm'] = {}
    config['agents']['icm']['beta'] = 0.05
    config['agents']['icm']['eta'] = 0.03
    config['agents']['icm']['feature_dim'] = 32
    config['agents']['icm']['hidden_size'] = 128
    config['agents']['icm']['lr'] = 1e-5

    # default ICM HPs:
    # config['agents']['icm'] = {}
    # config['agents']['icm']['beta'] = 0.2
    # config['agents']['icm']['eta'] = 0.5
    # config['agents']['icm']['feature_dim'] = 64
    # config['agents']['icm']['hidden_size'] = 128
    # config['agents']['icm']['lr'] = 1e-4

    for i in range(MODEL_AGENTS):
        if mode == '-1':
            agent = select_agent(config=config, agent_name='duelingddqn_icm')
        else:
            agent = select_agent(config=config, agent_name='duelingddqn')
        reward, episode_length, _ = agent.train(env=env, test_env=real_env)
        rewards.append(reward)
        episode_lengths.append(episode_length)
    return rewards, episode_lengths
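
The `beta` and `eta` entries in the ICM block above correspond to the loss weighting and intrinsic-reward scale of the Intrinsic Curiosity Module (Pathak et al., 2017). A minimal sketch of how these two hyperparameters are typically used; whether the `*_icm` agents in this repository follow exactly this formulation is an assumption:

import torch
import torch.nn.functional as F

def icm_loss_and_bonus(phi_next, phi_next_pred, action_logits, action, beta, eta):
    # Forward-model loss: error of the predicted next-state embedding.
    forward_loss = 0.5 * F.mse_loss(phi_next_pred, phi_next)
    # Inverse-model loss: predict the executed (discrete) action from the embeddings.
    inverse_loss = F.cross_entropy(action_logits, action)
    # beta trades off the forward vs. inverse model in the ICM loss.
    icm_loss = beta * forward_loss + (1.0 - beta) * inverse_loss
    # eta scales the intrinsic (curiosity) reward; no gradient flows through it.
    with torch.no_grad():
        intrinsic_reward = eta * 0.5 * (phi_next_pred - phi_next).pow(2).sum(dim=-1)
    return icm_loss, intrinsic_reward
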
Example no. 3
def train_test_agents(mode, env, real_env, config):
    rewards = []
    episode_lengths = []

    # settings for comparability
    config['agents']['ppo'] = {}
    config['agents']['ppo']['test_episodes'] = 1
    config['agents']['ppo']['train_episodes'] = 5000
    config['agents']['ppo']['print_rate'] = 100

    config['agents']['ppo']['init_episodes'] = 0
    config['agents']['ppo']['update_episodes'] = 1
    config['agents']['ppo']['ppo_epochs'] = 10
    config['agents']['ppo']['gamma'] = 0.99
    # config['agents']['ppo']['lr'] = 3e-4
    config['agents']['ppo']['lr'] = 1e-5
    config['agents']['ppo']['vf_coef'] = 1
    config['agents']['ppo']['ent_coef'] = 0.001
    config['agents']['ppo']['eps_clip'] = 0.2
    config['agents']['ppo']['rb_size'] = 1000000
    config['agents']['ppo']['same_action_num'] = 1
    config['agents']['ppo']['activation_fn'] = 'tanh'
    config['agents']['ppo']['hidden_size'] = 128
    config['agents']['ppo']['hidden_layer'] = 2
    config['agents']['ppo']['action_std'] = 0.1

    config['agents']['ppo']['early_out_num'] = 50
    config['agents']['ppo']['early_out_virtual_diff'] = 0.02

    # BOHB optimized HPs
    config['agents']['icm'] = {}
    config['agents']['icm']['beta'] = 0.05
    config['agents']['icm']['eta'] = 0.03
    config['agents']['icm']['feature_dim'] = 32
    config['agents']['icm']['hidden_size'] = 128
    config['agents']['icm']['lr'] = 1e-4

    # default ICM HPs:
    # config['agents']['icm'] = {}
    # config['agents']['icm']['beta'] = 0.2
    # config['agents']['icm']['eta'] = 0.5
    # config['agents']['icm']['feature_dim'] = 64
    # config['agents']['icm']['hidden_size'] = 128
    # config['agents']['icm']['lr'] = 1e-4

    for i in range(MODEL_AGENTS):
        if mode == '-1':
            agent = select_agent(config=config, agent_name='ppo_icm')
        else:
            agent = select_agent(config=config, agent_name='ppo')
        reward, episode_length, _ = agent.train(env=env, test_env=real_env)
        print('reward: ' + str(reward))
        rewards.append(reward)
        episode_lengths.append(episode_length)
    return rewards, episode_lengths
Example no. 4
def train_test_agents(mode, env, real_env, config):
    rewards = []
    episode_lengths = []

    # settings for comparability
    config['agents']['td3']['test_episodes'] = 1
    config['agents']['td3']['train_episodes'] = 3000
    config['agents']['td3']['print_rate'] = 100

    config['agents']['td3']['lr'] = 3e-4
    config['agents']['td3']['tau'] = 0.005
    config['agents']['td3']['activation_fn'] = 'relu'
    config['agents']['td3']['same_action_num'] = 2
    config['agents']['td3']['policy_delay'] = 2
    config['agents']['td3']['policy_std_clip'] = 0.5
    config['agents']['td3']['policy_std'] = 0.2
    config['agents']['td3']['action_std'] = 0.1
    config['agents']['td3']['batch_size'] = 256
    config['agents']['td3']['gamma'] = 0.99
    config['agents']['td3']['rb_size'] = 1000000
    config['agents']['td3']['init_episodes'] = 50  # specified in time steps in the original TD3 implementation

    config['agents']['td3']['early_out_num'] = 10
    config['agents']['td3']['early_out_virtual_diff'] = 1e-2

    # optimized ICM HPs:
    config['agents']['icm'] = {}
    config['agents']['icm']['beta'] = 0.1
    config['agents']['icm']['eta'] = 0.01
    config['agents']['icm']['feature_dim'] = 32
    config['agents']['icm']['hidden_size'] = 128
    config['agents']['icm']['lr'] = 5e-4

    # default ICM HPs:
    # config['agents']['icm'] = {}
    # config['agents']['icm']['beta'] = 0.2
    # config['agents']['icm']['eta'] = 0.5
    # config['agents']['icm']['feature_dim'] = 64
    # config['agents']['icm']['hidden_size'] = 128
    # config['agents']['icm']['lr'] = 1e-4

    for i in range(MODEL_AGENTS):
        config_mod = vary_hp(config)

        if mode == '-1':
            agent = select_agent(config=config_mod, agent_name='td3_icm')
        else:
            agent = select_agent(config=config_mod, agent_name='td3')
        reward, episode_length, _ = agent.train(env=env, test_env=real_env)
        print('reward: ' + str(reward))
        rewards.append(reward)
        episode_lengths.append(episode_length)
    return rewards, episode_lengths
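
Unlike the other examples, this one passes each agent a config perturbed by `vary_hp`, which is not shown here. A plausible sketch of such a helper; the varied keys and sampling ranges are assumptions:

import copy
import random

def vary_hp(config):
    # Hypothetical sketch: copy the config and re-sample a few TD3
    # hyperparameters so that every trained agent differs slightly.
    config_mod = copy.deepcopy(config)
    td3 = config_mod['agents']['td3']
    td3['lr'] = 10 ** random.uniform(-4, -3)            # log-uniform learning rate
    td3['batch_size'] = random.choice([128, 256, 512])
    td3['hidden_size'] = random.choice([64, 128, 256])
    return config_mod
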
Example no. 5
def train_test_agents(env, real_env, config):
    states = []

    # settings for comparability
    config['agents']['sarsa'] = {}
    config['agents']['sarsa']['test_episodes'] = 1
    config['agents']['sarsa']['train_episodes'] = 200
    config['agents']['sarsa']['print_rate'] = 100
    config['agents']['sarsa']['init_episodes'] = config['agents']['ql']['init_episodes']
    config['agents']['sarsa']['batch_size'] = config['agents']['ql']['batch_size']
    config['agents']['sarsa']['alpha'] = config['agents']['ql']['alpha']
    config['agents']['sarsa']['gamma'] = config['agents']['ql']['gamma']
    config['agents']['sarsa']['eps_init'] = config['agents']['ql']['eps_init']
    config['agents']['sarsa']['eps_min'] = config['agents']['ql']['eps_min']
    config['agents']['sarsa']['eps_decay'] = config['agents']['ql']['eps_decay']
    config['agents']['sarsa']['rb_size'] = config['agents']['ql']['rb_size']
    config['agents']['sarsa']['same_action_num'] = config['agents']['ql']['same_action_num']
    config['agents']['sarsa']['early_out_num'] = config['agents']['ql']['early_out_num']
    config['agents']['sarsa']['early_out_virtual_diff'] = config['agents']['ql']['early_out_virtual_diff']

    for i in range(MODEL_AGENTS):
        agent = select_agent(config=config, agent_name='sarsa')
        reward, _, _ = agent.train(env=env, test_env=real_env)
        _, _, replay_buffer = agent.test(env=real_env)
        state, _, next_state, _, _ = replay_buffer.get_all()
        state = state.tolist()
        next_state = next_state.tolist()
        # skip if we could not solve env
        if len(reward) == config['agents']['sarsa']['train_episodes'] and BREAK == 'solved':
            continue
        state = [int(elem[0]) for elem in state]
        state.append(int(next_state[-1][0]))
        states.append(state)

    return states
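
The two list operations above rebuild the visited-state sequence from the transitions returned by `replay_buffer.get_all()`: the first component of every stored state, plus the first component of the final next-state. A tiny worked example with made-up transitions:

# Made-up (state, next_state) pairs for a 1-D discrete environment.
state = [[0.0], [1.0], [2.0]]
next_state = [[1.0], [2.0], [3.0]]

trajectory = [int(elem[0]) for elem in state]  # [0, 1, 2]
trajectory.append(int(next_state[-1][0]))      # [0, 1, 2, 3]
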
Example no. 6
    def calc_score(self, env, time_remaining):
        time_start = time.time()

        agent = select_agent(config=self.config, agent_name=self.agent_name)
        real_env = self.env_factory.generate_real_env()

        reward_list_train, episode_length_train, _ = agent.train(
            env=env,
            test_env=real_env,
            time_remaining=time_remaining - (time.time() - time_start))
        reward_list_test, _, _ = agent.test(env=real_env,
                                            time_remaining=time_remaining -
                                            (time.time() - time_start))
        avg_reward_test = statistics.mean(reward_list_test)

        if env.is_virtual_env():
            return avg_reward_test
        else:
            # # when timeout occurs, reward_list_train is padded (with min. reward values) and episode_length_train is not
            # if len(episode_length_train) < len(reward_list_train):
            #     print("due to timeout, reward_list_train has been padded")
            #     print(f"shape rewards: {np.shape(reward_list_train)}, shape episode lengths: {np.shape(episode_length_train)}")
            #     reward_list_train = reward_list_train[:len(episode_length_train)]

            print("AVG REWARD: ", avg_reward_test)
            return avg_reward_test
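
`calc_score` shares one wall-clock budget between training and testing: every call receives the original `time_remaining` minus the time already spent since `time_start`. The same pattern in isolation, as a generic sketch:

import time

def run_with_shared_budget(stages, time_remaining):
    # Generic illustration of the budgeting pattern above: each stage
    # is handed whatever is left of a single shared wall-clock budget.
    time_start = time.time()
    results = []
    for stage in stages:
        results.append(stage(time_remaining=time_remaining - (time.time() - time_start)))
    return results
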
Example no. 7
def train_test_agents(mode, env, real_env, config):
    rewards = []

    # settings for comparability
    config['agents']['td3']['test_episodes'] = 1
    config['agents']['td3']['train_episodes'] = 50
    config['agents']['td3']['print_rate'] = 1

    for i in range(MODEL_AGENTS):
        if mode == '-1':
            agent = select_agent(config=config, agent_name='td3_icm')
        else:
            agent = select_agent(config=config, agent_name='td3')
        reward, _, _ = agent.train(env=env, test_env=real_env)
        print('reward: ' + str(reward))
        rewards.append(reward)

    return rewards
Example no. 8
def train_test_agents(mode, env, real_env, config):
    rewards = []
    episode_lengths = []

    # settings for comparability
    config['agents']['sarsa'] = {}
    config['agents']['sarsa']['test_episodes'] = 1
    config['agents']['sarsa']['train_episodes'] = 500
    config['agents']['sarsa']['print_rate'] = 100

    config['agents']['sarsa']['alpha'] = 1.0
    config['agents']['sarsa']['eps_decay'] = 0.0
    config['agents']['sarsa']['eps_init'] = 0.01  # 0.01
    config['agents']['sarsa']['eps_min'] = 0.01  # 0.01
    config['agents']['sarsa']['gamma'] = 0.8
    config['agents']['sarsa']['same_action_num'] = 1
    config['agents']['sarsa']['rb_size'] = 1  # custom value for the reward env and gridworld
    config['agents']['sarsa']['init_episodes'] = 0
    config['agents']['sarsa']['batch_size'] = 1

    config['agents']['sarsa']['early_out_num'] = 10
    config['agents']['sarsa']['early_out_virtual_diff'] = 0.02

    # for count-based q-learning
    config['agents']['sarsa']['beta'] = 0.1

    # for count-based q-learning (tuned)
    # config['agents']['sarsa']['beta'] = 0.005  # 0.01 also works fine

    for i in range(MODEL_AGENTS):
        if mode == '-1':
            agent = select_agent(config=config, agent_name='sarsa_cb')
        else:
            agent = select_agent(config=config, agent_name='sarsa')
        reward, episode_length, _ = agent.train(env=env, test_env=real_env)
        rewards.append(reward)
        episode_lengths.append(episode_length)
    return rewards, episode_lengths
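
`beta` above scales the exploration bonus of the count-based agent (`sarsa_cb`). A common formulation adds a bonus that shrinks with the visitation count of a state-action pair; whether this repository uses exactly this form is an assumption:

import math
from collections import defaultdict

visit_counts = defaultdict(int)

def count_based_reward(reward, state, action, beta):
    # Hypothetical sketch of a count-based bonus: beta / sqrt(N(s, a)).
    visit_counts[(state, action)] += 1
    return reward + beta / math.sqrt(visit_counts[(state, action)])
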
Example no. 9
def train_test_agents(train_env, test_env, config):
    reward_list = []
    train_steps_needed = []
    episodes_needed = []

    config["agents"]["ddqn"]["print_rate"] = 10
    config["agents"]["ddqn"]["test_episodes"] = 10
    config["render_env"] = True

    agent = select_agent(config=config, agent_name='DDQN')
    reward_train, episode_length, _ = agent.train(env=train_env)
    reward, _, _ = agent.test(env=test_env)
    print('reward: ' + str(reward))
    reward_list.append(reward)
    train_steps_needed.append([sum(episode_length)])
    episodes_needed.append([(len(reward_train))])

    return reward_list, train_steps_needed, episodes_needed
Example no. 10
    def __init__(self, config):
        super().__init__()

        self.config = config

        reptile_config = config["agents"]["reptile"]
        self.max_iterations = reptile_config["max_iterations"]
        self.step_size = reptile_config["step_size"]
        self.parallel_update = reptile_config["parallel_update"]
        self.env_num = reptile_config["env_num"]

        agent_name = reptile_config["agent_name"]
        self.env_factory = EnvFactory(config)
        self.agent = select_agent(config, agent_name)

        self.envs = []
        for i in range(self.env_num):
            self.envs.append(self.env_factory.generate_random_real_env())
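
The constructor above only wires up the agent and `env_num` real environments. For context, the standard Reptile outer update (Nichol et al., 2018) moves the meta-parameters a fraction `step_size` toward the parameters obtained after adapting to one task. A generic sketch, not this repository's training loop:

import copy

def reptile_outer_step(meta_params, adapt_on_task, step_size):
    # Adapt a copy of the meta-parameters on one task (inner loop), then
    # interpolate the meta-parameters toward the adapted ones (outer loop).
    task_params = adapt_on_task(copy.deepcopy(meta_params))
    return {name: p + step_size * (task_params[name] - p)
            for name, p in meta_params.items()}
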
Example no. 11
def train_test_agents(mode, env, real_env, config):
    rewards = []
    episode_lengths = []

    config['device'] = 'cuda'
    config['agents']["td3_discrete_vary"]["print_rate"] = 100
    config['agents']["td3_discrete_vary"]["train_episodes"] = 500
    config['agents']["td3_discrete_vary"]["test_episodes"] = 1
    config['envs']['CartPole-v0']['solved_reward'] = 100000  # large enough to prevent the early-out from triggering

    for i in range(MODEL_AGENTS):
        agent = select_agent(config=config, agent_name='td3_discrete_vary')
        reward, episode_length, _ = agent.train(env=env, test_env=real_env)
        print('reward: ' + str(reward))
        rewards.append(sum(reward))
        episode_lengths.append(episode_length)
    return rewards, episode_lengths
Example no. 12
def train_test_agents(train_env, test_env, config, agents_num):
    reward_list = []
    train_steps_needed = []
    episodes_needed = []

    # settings for comparability
    config['agents']['duelingddqn_vary']['vary_hp'] = True
    config['agents']['duelingddqn']['print_rate'] = 10
    config['agents']['duelingddqn']['early_out_num'] = 10
    config['agents']['duelingddqn']['train_episodes'] = 1000
    config['agents']['duelingddqn']['init_episodes'] = 10
    config['agents']['duelingddqn']['test_episodes'] = 10
    config['agents']['duelingddqn']['early_out_virtual_diff'] = 0.01

    for i in range(agents_num):
        agent = select_agent(config=config, agent_name='DuelingDDQN_vary')
        reward_train, episode_length, _ = agent.train(env=train_env)
        reward, _, _ = agent.test(env=test_env)
        print('reward: ' + str(reward))
        reward_list.append(reward)
        train_steps_needed.append([sum(episode_length)])
        episodes_needed.append([(len(reward_train))])

    return reward_list, train_steps_needed, episodes_needed
Example no. 13
                              device=device)

    step_times_per_episode_real_env = {}
    step_times_per_episode_syn_env = {}

    for i, file_name in enumerate(file_list):
        syn_env, real_env, config = load_envs_and_config(file_name=file_name,
                                                         model_dir=model_dir,
                                                         device=device)

        config["agents"][agent_name]["init_episodes"] = init_episodes
        config["agents"][agent_name]["train_episodes"] = train_episodes
        config['agents'][agent_name]['print_rate'] = 10

        print('train agents on ' + str(file_name))
        agent = select_agent(config=config, agent_name=agent_name)

        print('train on real env')
        _, _, _, step_times_real_env = agent.train(
            env=real_env, time_remaining=time_remaining_real)
        step_times_per_episode_real_env[
            real_env.env.env_name + "_" + str(i)] = {
                "step_times_per_episode_real_env": step_times_real_env,
                "step_times_mean":
                np.mean(np.concatenate(step_times_real_env)),
                "step_times_std": np.std(np.concatenate(step_times_real_env))
            }

        print('train on syn env')
        _, _, _, step_times_syn_env = agent.train(
            env=syn_env, time_remaining=time_remaining_syn)