Пример #1
0
def main(args):
    """Load a saved TD3 model and render test episodes on BipedalWalker-v3.

    Args:
        args: Namespace providing ``hidden_units``, ``msg_dim``,
            ``model_dir`` (joined onto the current working directory) and
            ``baseline`` (selects ``smp.baseline.TD3Net`` instead of
            ``smp.smp.TD3Net``).
    """
    env_name = 'BipedalWalker-v3'
    model_path = os.path.join(os.getcwd(), args.model_dir)

    # os.cpu_count() may return None, and "count - 1" is 0 on a single-core
    # host; always keep at least one sampling worker.
    num_workers = max(1, (os.cpu_count() or 1) - 1)

    ray.init(log_to_driver=False)
    try:
        # Throwaway environment, only used to read the action space.
        env_test_instance = gym.make(env_name)

        if args.baseline:
            from smp.baseline import TD3Net
            action_dimension = env_test_instance.action_space.shape[0]
        else:
            from smp.smp import TD3Net
            action_dimension = 1

        model_kwargs = {
            # action dimension for modular actions
            'action_dimension': action_dimension,
            'min_action': env_test_instance.action_space.low[0],
            'max_action': env_test_instance.action_space.high[0],
            'msg_dimension': args.msg_dim,
            'fix_sigma': True,
            'hidden_units': args.hidden_units
        }
        del env_test_instance

        manager = SampleManager(
            TD3Net,
            env_name,
            num_parallel=num_workers,
            total_steps=150,
            action_sampling_type="continuous_normal_diagonal",
            is_tf=True,
            model_kwargs=model_kwargs)

        manager.load_model(model_path)
        manager.test(200, test_episodes=5, render=True)
    finally:
        # Release the ray runtime even when loading or testing fails.
        ray.shutdown()
Пример #2
0
        'environment': GridWorld,
        'env_kwargs': env_kwargs,
        'num_parallel': 8,
        'total_steps': 100,
        'action_sampling_type': 'epsilon_greedy',
        'model_kwargs': model_kwargs,
        'input_shape':
        False,  # no input shape needed for getting first weights
        'weights': weights,
        'num_episodes': 10,
        'epsilon': 0.8,
        'is_tf': False
    }

    ray.init(log_to_driver=False)
    manager = SampleManager(**kwargs)

    saving_path = os.getcwd() + '/progress_tabq'
    saving_after = 5

    # parameters for optimization
    buffer_size = 500
    test_steps = 100
    epochs = 10
    sample_size = 500  # training steps per epoch
    # discount
    gamma = 0.95
    learning_rate = 0.2

    # keys needed for tabular q
    optim_keys = ['state', 'action', 'reward', 'state_new', 'not_done']
Пример #3
0
        return output

    def get_config(self):
        return super(PPONet, self).get_config()


if __name__ == "__main__":

    # initialize
    ray.init(log_to_driver=False)
    manager = SampleManager(
        PPONet,
        'LunarLanderContinuous-v2',
        num_parallel=3,
        total_steps=150,
        action_sampling_type="continuous_normal_diagonal",
        #todo check if monte carlo is correct
        #todo what about gamma??
        returns=['monte_carlo', 'value_estimate', 'log_prob'])

    epochs = 30
    saving_path = os.getcwd() + "/hw3_results"
    saving_after = 5
    sample_size = 150
    optim_batch_size = 8
    gamma = .99
    test_steps = 1000
    # Factor of how much the new policy is allowed to differ from the old one
    epsilon = 0.2
    entropy_weight = 0.01
Пример #4
0
    env = GridWorld(**env_kwargs)

    model_kwargs = {"h": env.height, "w": env.width, "action_space": 4}

    kwargs = {
        "model": TabularQ,
        "environment": GridWorld,
        "num_parallel": 2,
        "total_steps": 20,
        "model_kwargs": model_kwargs,
        "env_kwargs": env_kwargs
    }

    # initializing ray
    ray.init(log_to_driver=False)
    manager = SampleManager(**kwargs)

    saving_path = os.getcwd() + "/progress_test"

    epochs = 10
    buffer_size = 5000
    test_steps = 50
    sample_size = 1000
    saving_after = 5

    alpha = 0.1
    gamma = 0.95

    optim_keys = ["state", "action", "reward", "state_new", "not_done"]

    # initialize buffer
Пример #5
0
def _soft_update(target_weights, online_weights, rate):
    """Return ``rate * online + (1 - rate) * target`` for each weight tensor.

    Blending tensor by tensor avoids packing differently shaped weights into
    one (ragged) NumPy array.
    """
    return [
        rate * online + (1. - rate) * target
        for target, online in zip(target_weights, online_weights)
    ]


def _critic_step(critic, optimizer, state_action, critic_target):
    """Run one MSE gradient step on ``critic`` and return the loss."""
    with tf.GradientTape() as tape:
        q_output = critic(state_action)
        loss = tf.keras.losses.MSE(tf.squeeze(critic_target),
                                   tf.squeeze(q_output))
    gradients = tape.gradient(loss, critic.trainable_variables)
    optimizer.apply_gradients(zip(gradients, critic.trainable_variables))
    return loss


def train_td3(args, model, action_dimension=None):
    """Train a TD3 agent on BipedalWalker-v3, then render the saved model.

    Args:
        args: Namespace providing ``buffer_size``, ``epochs``, ``saving_dir``
            (joined onto the current working directory), ``sample_size``,
            ``batch_size``, ``gamma``, ``policy_noise``, ``msg_dim``,
            ``learning_rate`` and ``hidden_units``.
        model: Model class handed to ``SampleManager``.
        action_dimension: Action dimension passed to the model. When None,
            the first entry of the environment's action-space shape is used.
    """
    print(args)

    tf.keras.backend.set_floatx('float32')

    ray.init(log_to_driver=False)
    try:
        # hyper parameters
        buffer_size = args.buffer_size  # 10e6 in their repo, not possible with our ram
        epochs = args.epochs
        saving_path = os.path.join(os.getcwd(), args.saving_dir)
        saving_after = 5
        sample_size = args.sample_size
        optim_batch_size = args.batch_size
        gamma = args.gamma
        test_steps = 100  # 1000 in their repo
        policy_delay = 2
        # Fraction of the online weights blended into the target per update.
        rho = .046
        policy_noise = args.policy_noise
        policy_noise_clip = .5
        msg_dim = args.msg_dim  # 32 in their repo
        learning_rate = args.learning_rate

        save_args(args, saving_path)

        # Throwaway environment, only used to read the action space.
        env_test_instance = gym.make('BipedalWalker-v3')
        if action_dimension is None:
            action_dimension = env_test_instance.action_space.shape[0]
        model_kwargs = {
            # action dimension for modular actions
            'action_dimension': action_dimension,
            'min_action': env_test_instance.action_space.low[0],
            'max_action': env_test_instance.action_space.high[0],
            'msg_dimension': msg_dim,
            'fix_sigma': True,
            'hidden_units': args.hidden_units
        }
        del env_test_instance

        # os.cpu_count() may return None, and "count - 1" is 0 on a
        # single-core host; always keep at least one sampling worker.
        num_workers = max(1, (os.cpu_count() or 1) - 1)

        manager = SampleManager(
            model,
            'BipedalWalker-v3',
            num_parallel=num_workers,
            total_steps=150,
            action_sampling_type="continuous_normal_diagonal",
            is_tf=True,
            model_kwargs=model_kwargs)

        optim_keys = [
            'state',
            'action',
            'reward',
            'state_new',
            'not_done',
        ]

        manager.initialize_buffer(buffer_size, optim_keys)

        manager.initialize_aggregator(path=saving_path,
                                      saving_after=saving_after,
                                      aggregator_keys=["loss", "reward"])

        agent = manager.get_agent()

        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

        # fill buffer
        print("Filling buffer before training..")
        while len(manager.buffer.buffer[
                manager.buffer.keys[0]]) < manager.buffer.size:
            # Gives you state action reward trajectories
            data = manager.get_data()
            manager.store_in_buffer(data)

        # track time while training
        timer = time.time()
        last_t = timer

        target_agent = manager.get_agent()
        for e in range(epochs):
            # off policy
            sample_dict = manager.sample(sample_size, from_buffer=True)
            print(f"collected data for: {sample_dict.keys()}")

            # cast values to float32 and create data dict
            for key in optim_keys:
                sample_dict[key] = tf.cast(sample_dict[key], tf.float32)
            data_dict = dict_to_dict_of_datasets(sample_dict,
                                                 batch_size=optim_batch_size)

            total_loss = 0
            for state, action, reward, state_new, not_done in \
                    zip(data_dict['state'],
                        data_dict['action'],
                        data_dict['reward'],
                        data_dict['state_new'],
                        data_dict['not_done']):

                action_new = target_agent.act(state_new)
                # add noise to action_new
                action_new = action_new + tf.clip_by_value(
                    tf.random.normal(action_new.shape, 0., policy_noise),
                    -policy_noise_clip, policy_noise_clip)
                # clip action_new to action space
                action_new = tf.clip_by_value(
                    action_new, manager.env_instance.action_space.low,
                    manager.env_instance.action_space.high)

                # calculate target with double-Q-learning
                state_action_new = tf.concat([state_new, action_new], axis=-1)
                q_values0 = target_agent.model.critic0(state_action_new)
                q_values1 = target_agent.model.critic1(state_action_new)
                q_values = tf.concat([q_values0, q_values1], axis=-1)
                q_targets = tf.squeeze(tf.reduce_min(q_values, axis=-1))
                critic_target = reward + gamma * not_done * q_targets

                state_action = tf.concat([state, action], axis=-1)

                # update both critics against the same target
                total_loss += _critic_step(agent.model.critic0, optimizer,
                                           state_action, critic_target)
                total_loss += _critic_step(agent.model.critic1, optimizer,
                                           state_action, critic_target)

                # update actor with delayed policy update
                # NOTE(review): the delay is keyed on the epoch index, not on
                # the number of critic steps - confirm this is intended.
                if e % policy_delay == 0:
                    with tf.GradientTape() as tape:
                        actor_output = agent.model.actor(state)
                        policy_action = reparam_action(
                            actor_output, agent.model.action_dimension,
                            agent.model.min_action, agent.model.max_action)
                        state_policy_action = tf.concat(
                            [state, policy_action], axis=-1)
                        q_val = agent.model.critic0(state_policy_action)
                        actor_loss = -tf.reduce_mean(q_val)

                    total_loss += actor_loss
                    actor_gradients = tape.gradient(
                        actor_loss, agent.model.actor.trainable_variables)
                    optimizer.apply_gradients(
                        zip(actor_gradients,
                            agent.model.actor.trainable_variables))

                # Update agent
                manager.set_agent(agent.get_weights())
                agent = manager.get_agent()

                if e % policy_delay == 0:
                    # Polyak averaging: move the target a fraction `rho`
                    # towards the online agent so that it trails slowly.
                    target_agent.set_weights(
                        _soft_update(target_agent.get_weights(),
                                     agent.get_weights(), rho))

            test_reward = manager.test(test_steps,
                                       evaluation_measure="reward")
            manager.update_aggregator(loss=total_loss, reward=test_reward)
            print(
                f"epoch ::: {e}  loss ::: {total_loss}   avg reward ::: {np.mean(test_reward)}"
            )

            if e % saving_after == 0:
                manager.save_model(saving_path, e)

            # needed time and remaining time estimation
            current_t = time.time()
            time_needed = (current_t - last_t) / 60.
            time_remaining = ((current_t - timer) / 60. / (e + 1) *
                              (epochs - (e + 1)))
            print(
                'Finished epoch %d of %d. Needed %.1f min for this epoch. Estimated time remaining: %.1f min'
                % (e + 1, epochs, time_needed, time_remaining))
            last_t = current_t

        manager.load_model(saving_path)
        print("done")
        print("testing optimized agent")
        manager.test(test_steps, test_episodes=10, render=True)
    finally:
        # Release the ray runtime even when training or testing fails.
        ray.shutdown()
    loss_function = tf.keras.losses.MSE
    num_episodes = 1

    # Keyword arguments forwarded to SampleManager below.
    kwargs = {
        "model": ActorCritic,
        "environment": "LunarLanderContinuous-v2",
        "num_parallel": 1,
        "total_steps": 1000,
        "action_sampling_type": "continuous_normal_diagonal",
        "num_steps": 1000,
        "returns": ['value_estimate', 'log_prob', 'monte_carlo']
    }

    ray.init(log_to_driver=False)

    manager = SampleManager(**kwargs)
    # where to save your results to: create this directory in advance!
    # os.path.join keeps the path portable; a hard-coded backslash is not a
    # separator on POSIX and "\p" is an invalid escape sequence.
    saving_path = os.path.join(os.getcwd(), "progress_a2c")

    buffer_size = 1000  # not used
    test_steps = 500
    epochs = 20
    sample_size = 1000
    optim_batch_size = 1000
    saving_after = 10

    # keys for replay buffer needed for optimization
    optim_keys = [
        "state", "action", "reward", "state_new", "not_done", "monte_carlo"
    ]
Пример #7
0
    optimizer = tf.keras.optimizers.Adam(learning_rate=alpha)
    epsilon = 1

    # Keyword arguments forwarded to SampleManager.
    kwargs = {
        "model": ModelContunous,
        "environment": "LunarLanderContinuous-v2",
        "num_parallel": 4,  # runner boxes
        "total_steps": 2000,  # maximum number of steps for each runner
        "action_sampling_type": "continuous_normal_diagonal",
        "num_episodes": 50,  # num_episodes per runner box
        "epsilon": epsilon,
    }

    ray.init(log_to_driver=False)

    manager = SampleManager(**kwargs)
    # where to save your results to: create this directory in advance!
    saving_path = os.getcwd() + "/progress_lunar"

    # keys for replay buffer -> what you will need for optimization
    optim_keys = ["state", "action", "reward", "state_new", "not_done"]

    # initialize buffer
    manager.initilize_buffer(buffer_size, optim_keys)

    # initialize progress aggregator
    manager.initialize_aggregator(path=saving_path,
                                  saving_after=5,
                                  aggregator_keys=["loss", "time_steps"])

    # initial testing:
Пример #8
0
    print('Prepare CartPole')
    env = gym.make("CartPole-v1")

    model_kwargs = {"layers": [16, 16, 16], "num_actions": env.action_space.n}

    kwargs = {
        "model": QNet,
        "environment": "CartPole-v1",
        "num_parallel": 1,
        "total_steps": 1000,
        "model_kwargs": model_kwargs,
    }

    # Initialize
    ray.init(log_to_driver=False)
    manager = SampleManager(**kwargs)

    # Where to load your results from
    loading_path = os.getcwd() + "/progress_CartPole"

    # Load model
    manager.load_model(loading_path)
    print("done")
    print("testing optimized agent")
    manager.test(
        1000,
        test_episodes=10,
        render=True,
        do_print=True,
        evaluation_measure="time_and_reward",
    )
Пример #9
0
    model_kwargs = {"h": env.height, "w": env.width, "action_space": 4}

    kwargs = {
        "model": TabularQ,
        "environment": GridWorld,
        "num_parallel": 2,
        "total_steps": 100,
        "model_kwargs": model_kwargs,
        "env_kwargs": env_kwargs
        # and more
    }

    # initialize
    ray.init(log_to_driver=False)
    manager = SampleManager(**kwargs)

    print("test before training: ")
    manager.test(
        max_steps=100,
        test_episodes=10,
        render=True,
        do_print=True,
        evaluation_measure="time_and_reward",
    )

    # some parameters
    epochs = 100
    gamma = 0.85
    learning_rate = 0.2
Пример #10
0
        hidden = self.d2(hidden)
        hidden = self.d3(hidden)
        q = self.dout(hidden)
        output["q_values"] = q
        return output


if __name__ == "__main__":

    tf.keras.backend.set_floatx('float64')

    # initialize
    ray.init(log_to_driver=False)
    manager = SampleManager(DQN,
                            'CartPole-v0',
                            num_parallel=3,
                            total_steps=100,
                            action_sampling_type="thompson")

    buffer_size = 2000
    epochs = 100
    saving_path = os.getcwd() + "/progress_dqn"
    saving_after = 5
    sample_size = 100
    optim_batch_size = 8
    gamma = .98
    update_interval = 4
    test_steps = 1000
    temperature = 1.5
    temperature_update = 0.98  #new_temp = old_temp*temp_update
    temperature_min = 0.5
Пример #11
0
    }

    kwargs = {
        "model": TabularQ,
        "environment": GridWorld,
        "num_parallel": 3,
        "total_steps": 2000,
        "model_kwargs": model_kwargs,
        "env_kwargs": env_kwargs,
        "num_episodes": 5
        # and more
    }

    # Initialize
    ray.init(log_to_driver=False)
    manager = SampleManager(**kwargs)

    print("test before training: ")
    manager.test(
        max_steps=100,
        test_episodes=1,
        render=True,
        do_print=True,
        evaluation_measure="time_and_reward",
    )

    # Do the rest!!!!

    # Where to save your results to: create this directory in advance!
    saving_path = os.getcwd() + "/progress_TabQ"
Пример #12
0
    kwargs = {
        "model": TabularQ,
        "environment": GridWorld,
        "num_parallel": 2,
        "total_steps": 100,
        "model_kwargs": model_kwargs,
        "env_kwargs": env_kwargs,
        "action_sampling_type": "epsilon_greedy",
        "epsilon": 1

        # and more
    }

    # initialize
    ray.init(log_to_driver=False)
    manager = SampleManager(**kwargs)
    saving_path = os.getcwd() + "/progress_test"

    print("Testing before training: ")
    manager.test(
        max_steps=100,
        test_episodes=10,
        render=True,
        do_print=True,
        evaluation_measure="time_and_reward",
    )
    episodes = 40
    saving_after = 5
    max_steps = 20

    from collections import deque
Пример #13
0
    model_kwargs = {
        'observation_space': 4,
        'action_space': 2
    }

    kwargs = {
        'model': VanillaDeepQNetwork,
        'environment': ENV_NAME,
        'num_parallel': 2,
        'total_steps': SAMPLE_SIZE,
        'model_kwargs':model_kwargs,
        "action_sampling_type": "epsilon_greedy",
        "epsilon": EPSILON
    }

    manager = SampleManager(**kwargs)

    # specify where to save results and ensure that the folder exists
    saving_path = Path(os.getcwd() + SAVING_DIRECTORY)
    saving_path.mkdir(parents=True, exist_ok=True)
    saving_path_model = Path(os.getcwd() + SAVING_DIRECTORY + '/model')
    saving_path_model.mkdir(parents=True, exist_ok=True)

    # initialize manager
    optim_keys = ['state', 'action', 'reward', 'state_new', 'not_done']
    manager.initilize_buffer(BUFFER_SIZE, optim_keys)
    aggregator_keys=['loss', 'time_steps', 'reward']
    manager.initialize_aggregator(saving_path, 5, aggregator_keys)

    # initialize the optimizers
    optimizer = Adam(learning_rate=LEARNING_RATE)
Пример #14
0
    # Keyword arguments forwarded to SampleManager below.
    kwargs = {
        "model": TabularQ,
        "environment": GridWorld,
        "num_parallel": 4,
        "total_steps": 1000,
        "model_kwargs": model_kwargs,
        "env_kwargs": env_kwargs,
        # and more: action sampling strategy
        "action_sampling_type": "epsilon_greedy",
        "epsilon": 1,
    }

    # initialize
    ray.init(log_to_driver=False)
    manager = SampleManager(**kwargs)

    # where to save results
    # os.path.join keeps the path portable; a hard-coded backslash is not a
    # separator on POSIX and "\p" is an invalid escape sequence.
    saving_path = os.path.join(os.getcwd(), "progress_tabularq")

    # keys for replay buffer -> what you will need for optimization
    optim_keys = ["state", "action", "reward", "state_new", "not_done"]

    # initialize buffer
    manager.initilize_buffer(buffer_size, optim_keys)

    # initialize progress aggregator
    manager.initialize_aggregator(
        path=saving_path, saving_after=5, aggregator_keys=["loss", "time_steps"]
    )
Пример #15
0
    #tf.keras.backend.clear_session()

    # define kwargs for model
    kwargs = {
        "model": DQN_Model,
        "environment": "CartPole-v0",
        "num_parallel": 2,
        "total_steps": 100,
        "action_sampling_type": "epsilon_greedy",
        "num_episodes": 20,
        "epsilon": 0.90
    }

    # initialize ray, manager, saving path
    ray.init(log_to_driver=False)
    manager = SampleManager(**kwargs)
    # os.path.join picks the platform's separator; makedirs(exist_ok=True)
    # replaces the racy "check, then mkdir" sequence.
    saving_path = os.path.join(os.getcwd(), "progress_test_HW2")
    os.makedirs(saving_path, exist_ok=True)

    # initialize parameters, buffer and aggregator
    gamma = 0.99

    buffer_size = 5000
    test_steps = 1000
    epochs = 20
    sample_size = 1000
    optim_batch_size = 8
    saving_after = 5

    optim_keys = ['state', 'action', 'reward', 'state_new', 'not_done']
Пример #16
0
    }

    # you can also create your environment like this after installation: env = gym.make('gridworld-v0')
    env = GridWorld(**env_kwargs)

    model_kwargs = {"h": env.height, "w": env.width, "action_space": 4}

    kwargs = {
        "model": TabularQ,
        "environment": GridWorld,
        "num_parallel": 2,
        "total_steps": 100,
        "model_kwargs": model_kwargs
        # and more
    }

    # initialize
    ray.init(log_to_driver=False)
    manager = SampleManager(**kwargs)

    print("test before training: ")
    manager.test(
        max_steps=100,
        test_episodes=10,
        render=True,
        do_print=True,
        evaluation_measure="time_and_reward",
    )

    # do the rest!!!!
        "num_parallel": 2,
        "total_steps": 500,
        "model_kwargs": model_kwargs,
        "action_sampling_type": "epsilon_greedy",
        "epsilon": epsilon
    }

    # Initialize the loss function
    loss_function = tf.keras.losses.MeanSquaredError()

    # Initialize the optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate)

    # Initialize
    ray.init(log_to_driver=False)
    manager = SampleManager(**kwargs)

    # Where to save your results to: create this directory in advance!
    saving_path = os.getcwd() + "/progress_CartPole"

    # Initialize buffer
    manager.initilize_buffer(buffer_size)

    # Initialize progress aggregator
    manager.initialize_aggregator(path=saving_path,
                                  saving_after=5,
                                  aggregator_keys=["loss", 'reward'])

    rewards = []

    # Get initial agent
Пример #18
0
    loss_function = tf.keras.losses.MSE
    epsilon = 1

    # Keyword arguments forwarded to SampleManager below.
    kwargs = {
        "model": MyModel,
        "environment": "CartPole-v0",
        "num_parallel": 5,
        "total_steps": 2000,
        "action_sampling_type": "epsilon_greedy",
        "num_episodes": 20,
        "epsilon": epsilon,
    }

    ray.init(log_to_driver=False)

    manager = SampleManager(**kwargs)
    # where to save your results to: create this directory in advance!
    saving_path = os.getcwd() + "/progress_cartpole"

    # keys for replay buffer -> what you will need for optimization
    optim_keys = ["state", "action", "reward", "state_new", "not_done"]

    # initialize buffer
    # NOTE(review): the method name is spelled `initilize_buffer` here; confirm
    # it matches the name exposed by the installed SampleManager.
    manager.initilize_buffer(buffer_size, optim_keys)

    # initialize progress aggregator
    manager.initialize_aggregator(path=saving_path,
                                  saving_after=5,
                                  aggregator_keys=["loss", "time_steps"])

    # initial testing:
if __name__ == "__main__":

    # define kwargs for model
    kwargs = {
        "model": ActorCriticAgent,
        "environment": "LunarLanderContinuous-v2",
        "num_parallel": 2,
        "total_steps": 100,
        "action_sampling_type": "continuous_normal_diagonal",
        "returns": ['monte_carlo', 'value_estimate']
        #"num_episodes": 20
    }

    # initialize ray, manager, saving path
    ray.init(log_to_driver=False)
    manager = SampleManager(**kwargs)
    # os.path.join picks the platform's separator; makedirs(exist_ok=True)
    # replaces the racy "check, then mkdir" sequence.
    saving_path = os.path.join(os.getcwd(), "progress_test_HW3")
    os.makedirs(saving_path, exist_ok=True)

    # initialize parameters and aggregator
    buffer_size = 5000
    test_steps = 100
    epochs = 30
    sample_size = 1000
    optim_batch_size = 1
    saving_after = 5
    training = True

    optim_keys = [
        'state', 'action', 'reward', 'state_new', 'not_done', 'value_estimate'
Пример #20
0
        "total_steps": 420,
        "returns": ['value_estimate', 'log_prob', 'monte_carlo'],
        "model_kwargs": model_kwargs,
        "action_sampling_type": "continuous_normal_diagonal",
        "gamma": gamma
    }

    # Initialize the loss function
    mse_loss = tf.keras.losses.MeanSquaredError()

    # Initialize the optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate)

    # Initialize
    ray.init(log_to_driver=False)
    manager = SampleManager(**kwargs)

    # Where to save your results to: create this directory in advance!
    saving_path = os.getcwd() + "/progress_LunarLanderContinuous"

    # Initialize progress aggregator
    manager.initialize_aggregator(path=saving_path,
                                  saving_after=5,
                                  aggregator_keys=["loss", 'reward', 'time'])

    rewards = []

    # Get initial agent
    agent = manager.get_agent()

    print('TRAINING')
Пример #21
0
        layers = [8, 8]
        k = 8
        # fixed random net used to obtain features
        target_network = ppo_model.TargetNetwork(layers, k, state_dim)
        target_network.trainable = False
        # predictor network we will train to match target network
        predictor = ppo_model.TargetNetwork(layers, k, state_dim)

        pred_optimizer = tf.keras.optimizers.Adam(learning_rate)

    # Instantiate and Initialize Sample Manager
    manager = SampleManager(
        model=ppo_model.A2C,
        environment=env_name,
        num_parallel=3,
        total_steps=420,
        returns=['value_estimate', 'log_prob', 'monte_carlo'],
        model_kwargs=model_kwargs,
        action_sampling_type="continuous_normal_diagonal",
        use_ray=use_ray)

    # ----------------------          IO         ----------------------
    """
    This section saves plots of the training process, writes some details to csv, 
    and allow to continue training from an existing models.
    """
    saving_path = os.getcwd() + "/" + env_name
    manager.initialize_aggregator(path=saving_path,
                                  saving_after=5,
                                  aggregator_keys=["loss", 'reward', 'time'])