def main(args):
    hidden_units = args.hidden_units
    msg_dim = args.msg_dim
    model_path = os.getcwd() + "/" + args.model_dir

    ray.init(log_to_driver=False)

    env_test_instance = gym.make('BipedalWalker-v3')
    if args.baseline:
        from smp.baseline import TD3Net
        action_dimension = copy(env_test_instance.action_space.shape[0])
    else:
        from smp.smp import TD3Net
        action_dimension = 1

    model_kwargs = {
        # action dimension for modular actions
        'action_dimension': action_dimension,
        'min_action': copy(env_test_instance.action_space.low)[0],
        'max_action': copy(env_test_instance.action_space.high)[0],
        'msg_dimension': msg_dim,
        'fix_sigma': True,
        'hidden_units': hidden_units
    }
    del env_test_instance

    manager = SampleManager(TD3Net,
                            'BipedalWalker-v3',
                            num_parallel=(os.cpu_count() - 1),
                            total_steps=150,
                            action_sampling_type="continuous_normal_diagonal",
                            is_tf=True,
                            model_kwargs=model_kwargs)

    manager.load_model(model_path)
    manager.test(200, test_episodes=5, render=True)

    ray.shutdown()
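# A minimal, hypothetical entry point for main(args); the flag names below are
# assumptions inferred from the attributes accessed above (hidden_units,
# msg_dim, model_dir, baseline), not the script's actual argument parser.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Test a trained TD3 agent (SMP or baseline) on BipedalWalker-v3.")
    parser.add_argument('--hidden_units', type=int, nargs='+', default=[256, 256])
    parser.add_argument('--msg_dim', type=int, default=32)
    parser.add_argument('--model_dir', type=str, default='td3_model')
    parser.add_argument('--baseline', action='store_true',
                        help='use the monolithic baseline instead of the shared modular policy')
    main(parser.parse_args())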
    'environment': GridWorld,
    'env_kwargs': env_kwargs,
    'num_parallel': 8,
    'total_steps': 100,
    'action_sampling_type': 'epsilon_greedy',
    'model_kwargs': model_kwargs,
    'input_shape': False,  # no input shape needed for getting first weights
    'weights': weights,
    'num_episodes': 10,
    'epsilon': 0.8,
    'is_tf': False
}

ray.init(log_to_driver=False)
manager = SampleManager(**kwargs)

saving_path = os.getcwd() + '/progress_tabq'
saving_after = 5

# parameters for optimization
buffer_size = 500
test_steps = 100
epochs = 10
sample_size = 500  # training steps per epoch
# discount
gamma = 0.95
learning_rate = 0.2

# keys needed for tabular q
optim_keys = ['state', 'action', 'reward', 'state_new', 'not_done']
        return output

    def get_config(self):
        return super(PPONet, self).get_config()


if __name__ == "__main__":
    # initialize
    ray.init(log_to_driver=False)
    manager = SampleManager(
        PPONet,
        'LunarLanderContinuous-v2',
        num_parallel=3,
        total_steps=150,
        action_sampling_type="continuous_normal_diagonal",
        # todo: check if monte carlo is correct
        # todo: what about gamma?
        returns=['monte_carlo', 'value_estimate', 'log_prob'])

    epochs = 30
    saving_path = os.getcwd() + "/hw3_results"
    saving_after = 5
    sample_size = 150
    optim_batch_size = 8
    gamma = .99
    test_steps = 1000
    # factor of how much the new policy is allowed to differ from the old one
    epsilon = 0.2
    entropy_weight = 0.01
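    # A hypothetical sketch (not this script's actual optimization step, assuming
    # tensorflow is imported as tf) of how epsilon and entropy_weight typically
    # enter a PPO clipped surrogate loss; `ratio` is exp(new_log_prob - old_log_prob)
    # per sampled action, and `advantage` the estimated advantage of that action.
    def ppo_actor_loss(new_log_prob, old_log_prob, advantage, entropy):
        ratio = tf.exp(new_log_prob - old_log_prob)
        clipped = tf.clip_by_value(ratio, 1. - epsilon, 1. + epsilon)
        surrogate = tf.minimum(ratio * advantage, clipped * advantage)
        # maximize surrogate and entropy bonus -> minimize their negatives
        return -tf.reduce_mean(surrogate) - entropy_weight * tf.reduce_mean(entropy)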
env = GridWorld(**env_kwargs)
model_kwargs = {"h": env.height, "w": env.width, "action_space": 4}

kwargs = {
    "model": TabularQ,
    "environment": GridWorld,
    "num_parallel": 2,
    "total_steps": 20,
    "model_kwargs": model_kwargs,
    "env_kwargs": env_kwargs
}

# initializing ray
ray.init(log_to_driver=False)
manager = SampleManager(**kwargs)

saving_path = os.getcwd() + "/progress_test"
epochs = 10
buffer_size = 5000
test_steps = 50
sample_size = 1000
saving_after = 5
alpha = 0.1
gamma = 0.95

optim_keys = ["state", "action", "reward", "state_new", "not_done"]

# initialize buffer
def train_td3(args, model, action_dimension=None):
    print(args)
    tf.keras.backend.set_floatx('float32')
    ray.init(log_to_driver=False)

    # hyperparameters
    buffer_size = args.buffer_size  # 10e6 in their repo, not possible with our RAM
    epochs = args.epochs
    saving_path = os.getcwd() + "/" + args.saving_dir
    saving_after = 5
    sample_size = args.sample_size
    optim_batch_size = args.batch_size
    gamma = args.gamma
    test_steps = 100  # 1000 in their repo
    policy_delay = 2
    rho = .046
    policy_noise = args.policy_noise
    policy_noise_clip = .5
    msg_dim = args.msg_dim  # 32 in their repo
    learning_rate = args.learning_rate

    save_args(args, saving_path)

    env_test_instance = gym.make('BipedalWalker-v3')
    if action_dimension is None:
        action_dimension = copy(env_test_instance.action_space.shape[0])
    model_kwargs = {
        # action dimension for modular actions
        'action_dimension': action_dimension,
        'min_action': copy(env_test_instance.action_space.low)[0],
        'max_action': copy(env_test_instance.action_space.high)[0],
        'msg_dimension': msg_dim,
        'fix_sigma': True,
        'hidden_units': args.hidden_units
    }
    del env_test_instance

    manager = SampleManager(model,
                            'BipedalWalker-v3',
                            num_parallel=(os.cpu_count() - 1),
                            total_steps=150,
                            action_sampling_type="continuous_normal_diagonal",
                            is_tf=True,
                            model_kwargs=model_kwargs)

    optim_keys = ['state', 'action', 'reward', 'state_new', 'not_done']

    manager.initialize_buffer(buffer_size, optim_keys)
    manager.initialize_aggregator(path=saving_path,
                                  saving_after=saving_after,
                                  aggregator_keys=["loss", "reward"])

    agent = manager.get_agent()
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    # fill buffer
    print("Filling buffer before training..")
    while len(manager.buffer.buffer[manager.buffer.keys[0]]) < manager.buffer.size:
        # gives you state-action-reward trajectories
        data = manager.get_data()
        manager.store_in_buffer(data)

    # track time while training
    timer = time.time()
    last_t = timer

    target_agent = manager.get_agent()

    for e in range(epochs):
        # off-policy: sample from the buffer
        sample_dict = manager.sample(sample_size, from_buffer=True)
        print(f"collected data for: {sample_dict.keys()}")

        # cast values to float32 and create data dict
        sample_dict['state'] = tf.cast(sample_dict['state'], tf.float32)
        sample_dict['action'] = tf.cast(sample_dict['action'], tf.float32)
        sample_dict['reward'] = tf.cast(sample_dict['reward'], tf.float32)
        sample_dict['state_new'] = tf.cast(sample_dict['state_new'], tf.float32)
        sample_dict['not_done'] = tf.cast(sample_dict['not_done'], tf.float32)
        data_dict = dict_to_dict_of_datasets(sample_dict, batch_size=optim_batch_size)

        total_loss = 0
        for state, action, reward, state_new, not_done in zip(
                data_dict['state'], data_dict['action'], data_dict['reward'],
                data_dict['state_new'], data_dict['not_done']):
            action_new = target_agent.act(state_new)
            # add clipped noise to action_new (target policy smoothing)
            action_new = action_new + tf.clip_by_value(
                tf.random.normal(action_new.shape, 0., policy_noise),
                -policy_noise_clip, policy_noise_clip)
            # clip action_new to the action space
            action_new = tf.clip_by_value(action_new,
                                          manager.env_instance.action_space.low,
                                          manager.env_instance.action_space.high)

            # calculate target with double-Q-learning
            state_action_new = tf.concat([state_new, action_new], axis=-1)
            q_values0 = target_agent.model.critic0(state_action_new)
            q_values1 = target_agent.model.critic1(state_action_new)
            q_values = tf.concat([q_values0, q_values1], axis=-1)
            q_targets = tf.squeeze(tf.reduce_min(q_values, axis=-1))
            critic_target = reward + gamma * not_done * q_targets

            state_action = tf.concat([state, action], axis=-1)

            # update critic 0
            with tf.GradientTape() as tape:
                q_output = agent.model.critic0(state_action)
                loss = tf.keras.losses.MSE(tf.squeeze(critic_target),
                                           tf.squeeze(q_output))
            total_loss += loss
            gradients = tape.gradient(loss, agent.model.critic0.trainable_variables)
            optimizer.apply_gradients(
                zip(gradients, agent.model.critic0.trainable_variables))

            # update critic 1
            with tf.GradientTape() as tape:
                q_output = agent.model.critic1(state_action)
                loss = tf.keras.losses.MSE(tf.squeeze(critic_target),
                                           tf.squeeze(q_output))
            total_loss += loss
            gradients = tape.gradient(loss, agent.model.critic1.trainable_variables)
            optimizer.apply_gradients(
                zip(gradients, agent.model.critic1.trainable_variables))

            # update actor with delayed policy update
            if e % policy_delay == 0:
                with tf.GradientTape() as tape:
                    actor_output = agent.model.actor(state)
                    action = reparam_action(actor_output,
                                            agent.model.action_dimension,
                                            agent.model.min_action,
                                            agent.model.max_action)
                    state_action = tf.concat([state, action], axis=-1)
                    q_val = agent.model.critic0(state_action)
                    actor_loss = -tf.reduce_mean(q_val)
                total_loss += actor_loss
                actor_gradients = tape.gradient(
                    actor_loss, agent.model.actor.trainable_variables)
                optimizer.apply_gradients(
                    zip(actor_gradients, agent.model.actor.trainable_variables))

        # update agent
        manager.set_agent(agent.get_weights())
        agent = manager.get_agent()

        if e % policy_delay == 0:
            # Polyak averaging
            new_weights = list(rho * np.array(target_agent.get_weights()) +
                               (1. - rho) * np.array(agent.get_weights()))
            target_agent.set_weights(new_weights)

        reward = manager.test(test_steps, evaluation_measure="reward")
        manager.update_aggregator(loss=total_loss, reward=reward)
        print(f"epoch ::: {e}  loss ::: {total_loss}  avg reward ::: {np.mean(reward)}")

        if e % saving_after == 0:
            manager.save_model(saving_path, e)

        # time needed for this epoch and remaining-time estimate
        current_t = time.time()
        time_needed = (current_t - last_t) / 60.
        time_remaining = (current_t - timer) / 60. / (e + 1) * (epochs - (e + 1))
        print('Finished epoch %d of %d. Needed %.1f min for this epoch. '
              'Estimated time remaining: %.1f min' %
              (e + 1, epochs, time_needed, time_remaining))
        last_t = current_t

    manager.load_model(saving_path)
    print("done")
    print("testing optimized agent")
    manager.test(test_steps, test_episodes=10, render=True)

    ray.shutdown()
loss_function = tf.keras.losses.MSE
num_episodes = 1

kwargs = {
    "model": ActorCritic,
    "environment": "LunarLanderContinuous-v2",
    "num_parallel": 1,
    "total_steps": 1000,
    "action_sampling_type": "continuous_normal_diagonal",
    "num_steps": 1000,
    "returns": ['value_estimate', 'log_prob', 'monte_carlo']
}

ray.init(log_to_driver=False)
manager = SampleManager(**kwargs)

# where to save your results to: create this directory in advance!
saving_path = os.getcwd() + "/progress_a2c"

buffer_size = 1000  # not used
test_steps = 500
epochs = 20
sample_size = 1000
optim_batch_size = 1000
saving_after = 10

# keys for replay buffer needed for optimization
optim_keys = [
    "state", "action", "reward", "state_new", "not_done", "monte_carlo"
]
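# A hypothetical sketch (assumed, not taken from this script) of how the sampled
# returns are typically combined into the two A2C losses: the critic regresses
# its value estimate onto the Monte-Carlo return with the MSE above, and the
# actor weights the log-probabilities by the resulting advantage.
def a2c_losses(value_estimate, monte_carlo, log_prob):
    advantage = tf.stop_gradient(monte_carlo - value_estimate)
    critic_loss = loss_function(monte_carlo, value_estimate)
    actor_loss = -tf.reduce_mean(log_prob * advantage)
    return actor_loss, critic_loss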
optimizer = tf.keras.optimizers.Adam(learning_rate=alpha)
epsilon = 1

kwargs = {
    "model": ModelContunous,
    "environment": "LunarLanderContinuous-v2",
    "num_parallel": 4,  # runner boxes
    "total_steps": 2000,  # maximum number of steps for each runner
    "action_sampling_type": "continuous_normal_diagonal",
    "num_episodes": 50,  # num_episodes per runner box
    "epsilon": epsilon,
}

ray.init(log_to_driver=False)
manager = SampleManager(**kwargs)

# where to save your results to: create this directory in advance!
saving_path = os.getcwd() + "/progress_lunar"

# keys for replay buffer -> what you will need for optimization
optim_keys = ["state", "action", "reward", "state_new", "not_done"]

# initialize buffer
manager.initilize_buffer(buffer_size, optim_keys)

# initialize progress aggregator
manager.initialize_aggregator(path=saving_path,
                              saving_after=5,
                              aggregator_keys=["loss", "time_steps"])

# initial testing:
print('Prepare CartPole')
env = gym.make("CartPole-v1")
model_kwargs = {"layers": [16, 16, 16], "num_actions": env.action_space.n}

kwargs = {
    "model": QNet,
    "environment": "CartPole-v1",
    "num_parallel": 1,
    "total_steps": 1000,
    "model_kwargs": model_kwargs,
}

# Initialize
ray.init(log_to_driver=False)
manager = SampleManager(**kwargs)

# Where to load your results from
loading_path = os.getcwd() + "/progress_CartPole"

# Load model
manager.load_model(loading_path)

print("done")
print("testing optimized agent")
manager.test(
    1000,
    test_episodes=10,
    render=True,
    do_print=True,
    evaluation_measure="time_and_reward",
)
model_kwargs = {"h": env.height, "w": env.width, "action_space": 4} kwargs = { "model": TabularQ, "environment": GridWorld, "num_parallel": 2, "total_steps": 100, "model_kwargs": model_kwargs, "env_kwargs": env_kwargs # and more } # initilize ray.init(log_to_driver=False) manager = SampleManager(**kwargs) print("test before training: ") manager.test( max_steps=100, test_episodes=10, render=True, do_print=True, evaluation_measure="time_and_reward", ) # some parameters epochs = 100 gamma = 0.85 learning_rate = 0.2
        hidden = self.d2(hidden)
        hidden = self.d3(hidden)
        q = self.dout(hidden)
        output["q_values"] = q
        return output


if __name__ == "__main__":
    tf.keras.backend.set_floatx('float64')

    # initialize
    ray.init(log_to_driver=False)
    manager = SampleManager(DQN,
                            'CartPole-v0',
                            num_parallel=3,
                            total_steps=100,
                            action_sampling_type="thompson")

    buffer_size = 2000
    epochs = 100
    saving_path = os.getcwd() + "/progress_dqn"
    saving_after = 5
    sample_size = 100
    optim_batch_size = 8
    gamma = .98
    update_interval = 4
    test_steps = 1000
    temperature = 1.5
    temperature_update = 0.98  # new_temp = old_temp * temp_update
    temperature_min = 0.5
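    # A hypothetical sketch (assumed, not this script's actual training loop) of a
    # single DQN update using the hyperparameters above: `gamma` discounts the TD
    # target, and `target_agent` would be a copy refreshed every `update_interval`
    # epochs.
    def dqn_step(agent, target_agent, optimizer, state, action, reward, state_new, not_done):
        q_new = tf.reduce_max(target_agent.model(state_new)["q_values"], axis=-1)
        td_target = reward + gamma * not_done * q_new
        with tf.GradientTape() as tape:
            q_all = agent.model(state)["q_values"]
            q_taken = tf.gather(q_all, tf.cast(action, tf.int32), batch_dims=1)
            loss = tf.reduce_mean(tf.keras.losses.MSE(td_target, q_taken))
        grads = tape.gradient(loss, agent.model.trainable_variables)
        optimizer.apply_gradients(zip(grads, agent.model.trainable_variables))
        return loss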
}

kwargs = {
    "model": TabularQ,
    "environment": GridWorld,
    "num_parallel": 3,
    "total_steps": 2000,
    "model_kwargs": model_kwargs,
    "env_kwargs": env_kwargs,
    "num_episodes": 5
    # and more
}

# Initialize
ray.init(log_to_driver=False)
manager = SampleManager(**kwargs)

print("test before training: ")
manager.test(
    max_steps=100,
    test_episodes=1,
    render=True,
    do_print=True,
    evaluation_measure="time_and_reward",
)

# Do the rest!!!!

# Where to save your results to: create this directory in advance!
saving_path = os.getcwd() + "/progress_TabQ"
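# A hypothetical sketch (not part of the original template, assuming numpy is
# imported as np) of the tabular Q-learning update the "Do the rest" section is
# expected to implement; q_table is assumed to be the agent's table indexed by
# state and action, with learning_rate and gamma as in the other GridWorld snippets.
def tabular_q_update(q_table, state, action, reward, state_new, not_done,
                     learning_rate=0.2, gamma=0.95):
    td_target = reward + gamma * not_done * np.max(q_table[state_new])
    q_table[state, action] += learning_rate * (td_target - q_table[state, action])
    return q_table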
kwargs = {
    "model": TabularQ,
    "environment": GridWorld,
    "num_parallel": 2,
    "total_steps": 100,
    "model_kwargs": model_kwargs,
    "env_kwargs": env_kwargs,
    "action_sampling_type": "epsilon_greedy",
    "epsilon": 1
    # and more
}

# initialize
ray.init(log_to_driver=False)
manager = SampleManager(**kwargs)

saving_path = os.getcwd() + "/progress_test"

print("Testing before training: ")
manager.test(
    max_steps=100,
    test_episodes=10,
    render=True,
    do_print=True,
    evaluation_measure="time_and_reward",
)

episodes = 40
saving_after = 5
max_steps = 20

from collections import deque
model_kwargs = {
    'observation_space': 4,
    'action_space': 2
}

kwargs = {
    'model': VanillaDeepQNetwork,
    'environment': ENV_NAME,
    'num_parallel': 2,
    'total_steps': SAMPLE_SIZE,
    'model_kwargs': model_kwargs,
    "action_sampling_type": "epsilon_greedy",
    "epsilon": EPSILON
}

manager = SampleManager(**kwargs)

# specify where to save results and ensure that the folder exists
saving_path = Path(os.getcwd() + SAVING_DIRECTORY)
saving_path.mkdir(parents=True, exist_ok=True)
saving_path_model = Path(os.getcwd() + SAVING_DIRECTORY + '/model')
saving_path_model.mkdir(parents=True, exist_ok=True)

# initialize manager
optim_keys = ['state', 'action', 'reward', 'state_new', 'not_done']
manager.initilize_buffer(BUFFER_SIZE, optim_keys)
aggregator_keys = ['loss', 'time_steps', 'reward']
manager.initialize_aggregator(saving_path, 5, aggregator_keys)

# initialize the optimizer
optimizer = Adam(learning_rate=LEARNING_RATE)
kwargs = {
    "model": TabularQ,
    "environment": GridWorld,
    "num_parallel": 4,
    "total_steps": 1000,
    "model_kwargs": model_kwargs,
    "env_kwargs": env_kwargs,
    # and more: action sampling strategy
    "action_sampling_type": "epsilon_greedy",
    "epsilon": 1,
}

# initialize
ray.init(log_to_driver=False)
manager = SampleManager(**kwargs)

# where to save results
saving_path = os.getcwd() + "/progress_tabularq"

# keys for replay buffer -> what you will need for optimization
optim_keys = ["state", "action", "reward", "state_new", "not_done"]

# initialize buffer
manager.initilize_buffer(buffer_size, optim_keys)

# initialize progress aggregator
manager.initialize_aggregator(
    path=saving_path, saving_after=5, aggregator_keys=["loss", "time_steps"]
)
# tf.keras.backend.clear_session()

# define kwargs for model
kwargs = {
    "model": DQN_Model,
    "environment": "CartPole-v0",
    "num_parallel": 2,
    "total_steps": 100,
    "action_sampling_type": "epsilon_greedy",
    "num_episodes": 20,
    "epsilon": 0.90
}

# initialize ray, manager, saving path
ray.init(log_to_driver=False)
manager = SampleManager(**kwargs)
saving_path = os.getcwd() + "\\progress_test_HW2"
if not os.path.exists(saving_path):
    os.mkdir(saving_path)

# initialize parameters, buffer and aggregator
gamma = 0.99
buffer_size = 5000
test_steps = 1000
epochs = 20
sample_size = 1000
optim_batch_size = 8
saving_after = 5

optim_keys = ['state', 'action', 'reward', 'state_new', 'not_done']
}

# you can also create your environment like this after installation:
# env = gym.make('gridworld-v0')
env = GridWorld(**env_kwargs)

model_kwargs = {"h": env.height, "w": env.width, "action_space": 4}

kwargs = {
    "model": TabularQ,
    "environment": GridWorld,
    "num_parallel": 2,
    "total_steps": 100,
    "model_kwargs": model_kwargs
    # and more
}

# initialize
ray.init(log_to_driver=False)
manager = SampleManager(**kwargs)

print("test before training: ")
manager.test(
    max_steps=100,
    test_episodes=10,
    render=True,
    do_print=True,
    evaluation_measure="time_and_reward",
)

# do the rest!!!!
"num_parallel": 2, "total_steps": 500, "model_kwargs": model_kwargs, "action_sampling_type": "epsilon_greedy", "epsilon": epsilon } # Initialize the loss function loss_function = tf.keras.losses.MeanSquaredError() # Initialize the optimizer optimizer = tf.keras.optimizers.Adam(learning_rate) # Initialize ray.init(log_to_driver=False) manager = SampleManager(**kwargs) # Where to save your results to: create this directory in advance! saving_path = os.getcwd() + "/progress_CartPole" # Initialize buffer manager.initilize_buffer(buffer_size) # Initialize progress aggregator manager.initialize_aggregator(path=saving_path, saving_after=5, aggregator_keys=["loss", 'reward']) rewards = [] # Get initial agent
loss_function = tf.keras.losses.MSE
epsilon = 1

kwargs = {
    "model": MyModel,
    "environment": "CartPole-v0",
    "num_parallel": 5,
    "total_steps": 2000,
    "action_sampling_type": "epsilon_greedy",
    "num_episodes": 20,
    "epsilon": epsilon,
}

ray.init(log_to_driver=False)
manager = SampleManager(**kwargs)

# where to save your results to: create this directory in advance!
saving_path = os.getcwd() + "/progress_cartpole"

# keys for replay buffer -> what you will need for optimization
optim_keys = ["state", "action", "reward", "state_new", "not_done"]

# initialize buffer
manager.initilize_buffer(buffer_size, optim_keys)

# initialize progress aggregator
manager.initialize_aggregator(path=saving_path,
                              saving_after=5,
                              aggregator_keys=["loss", "time_steps"])

# initial testing:
if __name__ == "__main__": # define kwargs for model kwargs = { "model": ActorCriticAgent, "environment": "LunarLanderContinuous-v2", "num_parallel": 2, "total_steps": 100, "action_sampling_type": "continuous_normal_diagonal", "returns": ['monte_carlo', 'value_estimate'] #"num_episodes": 20 } # initialize ray, manager, saving path ray.init(log_to_driver=False) manager = SampleManager(**kwargs) saving_path = os.getcwd() + "\\progress_test_HW3" if not os.path.exists(saving_path): os.mkdir(saving_path) # initialize parameters and aggregator buffer_size = 5000 test_steps = 100 epochs = 30 sample_size = 1000 optim_batch_size = 1 saving_after = 5 training = True optim_keys = [ 'state', 'action', 'reward', 'state_new', 'not_done', 'value_estimate'
"total_steps": 420, "returns": ['value_estimate', 'log_prob', 'monte_carlo'], "model_kwargs": model_kwargs, "action_sampling_type": "continuous_normal_diagonal", "gamma": gamma } # Initialize the loss function mse_loss = tf.keras.losses.MeanSquaredError() # Initialize the optimizer optimizer = tf.keras.optimizers.Adam(learning_rate) # Initialize ray.init(log_to_driver=False) manager = SampleManager(**kwargs) # Where to save your results to: create this directory in advance! saving_path = os.getcwd() + "/progress_LunarLanderContinuous" # Initialize progress aggregator manager.initialize_aggregator(path=saving_path, saving_after=5, aggregator_keys=["loss", 'reward', 'time']) rewards = [] # Get initial agent agent = manager.get_agent() print('TRAINING')
layers = [8, 8]
k = 8

# fixed random net used to obtain features
target_network = ppo_model.TargetNetwork(layers, k, state_dim)
target_network.trainable = False
# predictor network we will train to match the target network
predictor = ppo_model.TargetNetwork(layers, k, state_dim)
pred_optimizer = tf.keras.optimizers.Adam(learning_rate)

# Instantiate and initialize the sample manager
manager = SampleManager(
    model=ppo_model.A2C,
    environment=env_name,
    num_parallel=3,
    total_steps=420,
    returns=['value_estimate', 'log_prob', 'monte_carlo'],
    model_kwargs=model_kwargs,
    action_sampling_type="continuous_normal_diagonal",
    use_ray=use_ray)

# ---------------------- IO ----------------------
"""
This section saves plots of the training process, writes some details to csv,
and allows continuing training from an existing model.
"""
saving_path = os.getcwd() + "/" + env_name
manager.initialize_aggregator(path=saving_path,
                              saving_after=5,
                              aggregator_keys=["loss", 'reward', 'time'])
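# A hypothetical sketch (assumed, not taken from the original script) of how the
# frozen target_network / trainable predictor pair is typically used for random
# network distillation: the predictor is regressed onto the fixed random features,
# and the per-state prediction error serves as an exploration bonus.
def rnd_bonus_and_update(states):
    target_features = target_network(states)  # fixed random embedding
    with tf.GradientTape() as tape:
        pred_features = predictor(states)
        pred_error = tf.reduce_mean(tf.square(pred_features - target_features), axis=-1)
        pred_loss = tf.reduce_mean(pred_error)
    grads = tape.gradient(pred_loss, predictor.trainable_variables)
    pred_optimizer.apply_gradients(zip(grads, predictor.trainable_variables))
    return pred_error  # intrinsic reward per state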