def learn_and_evaluate(self):
    """Spawn the collecting workers, then (optionally) evaluate every
    model checkpoint the memory server announces.

    Returns the list of average evaluation rewards (empty when
    'do_test' is disabled).
    """
    workers_id = []
    # Split the training episodes evenly across the collecting workers.
    batch_size = self.parms['training_episodes'] // self.parms['workers'][0]
    for _ in range(self.parms['workers'][0]):
        workers_id.append(
            collecting_worker.remote(self.env, self.model_server,
                                     self.memory_server, batch_size))

    all_results = []
    if self.parms['do_test']:
        # BUG FIX: was `env.reset()` (undefined global); use self.env.
        eval_model = DQNModel(len(self.env.reset()), len(ACTION_DICT))
        learn_done, filedir = False, ""
        workers_num = self.parms['workers'][1]
        # Each evaluation worker runs its share of the test trials.
        interval = self.parms['test_interval'] // workers_num
        while not learn_done:
            filedir, learn_done = ray.get(
                self.memory_server.get_evaluate_filedir.remote())
            if not filedir:
                continue  # no new checkpoint yet
            eval_model.load(filedir)
            start_time = time.time()
            eval_workers = []
            for _ in range(workers_num):
                eval_workers.append(
                    evaluation_worker_test2.remote(self.env,
                                                   self.memory_server,
                                                   eval_model, interval))
            avg_reward = sum(ray.get(eval_workers)) / workers_num
            print(filedir, avg_reward, (time.time() - start_time))
            all_results.append(avg_reward)
    return all_results
def __init__(self, env, hyper_params, memory, action_space):
    """Initialise the learner: cache hyper-parameters, build the
    eval/target networks and reset all bookkeeping counters."""
    # --- hyper-parameters -------------------------------------------
    self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
    self.final_epsilon = hyper_params['final_epsilon']
    self.batch_size = hyper_params['batch_size']
    self.update_steps = hyper_params['update_steps']
    self.beta = hyper_params['beta']
    self.model_replace_freq = hyper_params['model_replace_freq']
    self.learning_rate = hyper_params['learning_rate']
    self.training_episodes = hyper_params['training_episodes']
    self.test_interval = hyper_params['test_interval']

    # --- bookkeeping ------------------------------------------------
    self.memory = memory
    self.episode = 0
    self.steps = 0
    self.result_count = 0
    self.next = 0
    # One evaluation slot per completed test interval (plus one spare).
    self.batch_num = self.training_episodes // self.test_interval
    self.results = [0] * (self.batch_num + 1)
    self.previous_q_networks = []
    self.collector_done = False
    self.evaluator_done = False

    # --- networks ---------------------------------------------------
    # Input size is the length of the state vector; output size is the
    # action space.
    observation = env.reset()
    self.eval_model = DQNModel(len(observation), action_space,
                               learning_rate=hyper_params['learning_rate'])
    self.target_model = DQNModel(len(observation), action_space)
def __init__(self, env, hyper_params, action_space=len(ACTION_DICT)):
    """Set up the DQN agent: exploration schedule, networks and replay buffer."""
    self.env = env
    self.max_episode_steps = env._max_episode_steps

    # Epsilon-greedy exploration schedule (linear decay from 1).
    self.beta = hyper_params['beta']
    self.initial_epsilon = 1
    self.final_epsilon = hyper_params['final_epsilon']
    self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']

    # Bookkeeping.
    self.episode = 0
    self.steps = 0
    self.best_reward = 0
    self.learning = True
    self.action_space = action_space

    # Networks: eval predicts actions; target stabilises the Q-targets.
    observation = env.reset()
    self.eval_model = DQNModel(len(observation), action_space,
                               learning_rate=hyper_params['learning_rate'])
    self.use_target_model = hyper_params['use_target_model']
    if self.use_target_model:
        self.target_model = DQNModel(len(observation), action_space)

    # Experience replay and training cadence.
    self.memory = ReplayBuffer(hyper_params['memory_size'])
    self.batch_size = hyper_params['batch_size']
    self.update_steps = hyper_params['update_steps']
    self.model_replace_freq = hyper_params['model_replace_freq']
def __init__(self, env, hyper_params, batch_size, update_steps, memory_size,
             beta, model_replace_freq, learning_rate, use_target_model=True,
             memory=Memory_Server, action_space=2, training_episodes=7000,
             test_interval=50):
    """Initialise the distributed DQN server.

    BUG FIX: `learning_rate`, `beta`, `use_target_model`,
    `update_steps` and `model_replace_freq` were accepted but ignored
    (hard-coded 0.0003 / 0.99 / True were used and the frequencies were
    never stored); the arguments are now honoured.

    `hyper_params` is kept in the signature for compatibility even
    though it is not read here.
    """
    self.batch_size = batch_size
    self.update_steps = update_steps                # was never stored
    self.model_replace_freq = model_replace_freq    # was never stored
    self.memory = memory
    self.beta = beta                                # was hard-coded 0.99
    self.use_target_model = use_target_model        # was hard-coded True

    # Networks sized from the environment's state vector.
    state = env.reset()
    input_len = len(state)
    output_len = action_space
    # BUG FIX: honour the learning_rate argument (was hard-coded 0.0003).
    self.eval_model = DQNModel(input_len, output_len,
                               learning_rate=learning_rate)
    self.target_model = DQNModel(input_len, output_len)

    # Bookkeeping for collection / evaluation.
    self.steps = 0
    self.prev = 0
    self.next = 0
    self.model_dq = deque()
    self.previous_q_networks = []
    self.result_count = 0
    self.learning_episodes = training_episodes
    self.episode = 0
    self.is_collection_completed = False
    self.evaluator_done = False
    self.batch_num = training_episodes // test_interval
    self.result = [0] * (self.batch_num + 1)
    self.test_interval = test_interval
def __init__(self, env, hyper_params, action_space=len(ACTION_DICT)):
    """Build a DQN agent for `env`.

    Hyper-parameters
    ----------------
    beta: discount factor of the Q-value function.
    initial_epsilon / final_epsilon / epsilon_decay_steps:
        the explore/exploit epsilon decreases linearly from
        `initial_epsilon` (1) to `final_epsilon` over
        `epsilon_decay_steps` prediction steps, then stays there.
    batch_size / update_steps / model_replace_freq:
        mini-batch size for training, training frequency, and the
        frequency with which 'target_model' is replaced by 'eval_model'.
    """
    self.env = env
    self.max_episode_steps = env._max_episode_steps

    # Exploration schedule.
    self.beta = hyper_params['beta']
    self.initial_epsilon = 1
    self.final_epsilon = hyper_params['final_epsilon']
    self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']

    # episode: training-episode counter; steps: incremented on every
    # action prediction; learning: on while training, off while testing.
    self.episode = 0
    self.steps = 0
    self.best_reward = 0
    self.learning = True
    self.action_space = action_space

    # eval_model predicts actions for the agent; target_model computes
    # the Q-value of next_state to update eval_model. Network input size
    # equals the state-vector length, output size the action space.
    observation = env.reset()
    input_len = len(observation)
    output_len = action_space
    self.eval_model = DQNModel(input_len, output_len,
                               learning_rate=hyper_params['learning_rate'])
    self.use_target_model = hyper_params['use_target_model']
    if self.use_target_model:
        self.target_model = DQNModel(input_len, output_len)

    # Experience replay storage and sampling.
    self.memory = ReplayBuffer(hyper_params['memory_size'])
    self.batch_size = hyper_params['batch_size']
    self.update_steps = hyper_params['update_steps']
    self.model_replace_freq = hyper_params['model_replace_freq']
    print("agent initialized")
class Model_Server(object):
    """Holds the eval/target networks and trains them from batches
    sampled off a remote memory server.

    CLEANUP: removed the no-op `states = states` / `next_states =
    next_states` assignments and hoisted the identical
    `q_next[batch_index, actions]` indexing out of both branches; the
    math is unchanged.
    """

    def __init__(self, env, hyper_params, memory_server):
        """Build the eval (and optionally target) network sized to the
        environment's state vector and the action space."""
        self.beta = hyper_params['beta']
        state = env.reset()
        action_space = len(ACTION_DICT)
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len, output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)
        self.memory_server = memory_server

    def update_batch(self, batch_size):
        """Sample `batch_size` transitions from the memory server and do
        one TD-learning step on the eval network."""
        batch = ray.get(self.memory_server.sample.remote(batch_size))
        (states, actions, reward, next_states, is_terminal) = batch

        # Mask is 0 for terminal transitions so no future reward is added.
        terminal = FloatTensor([0 if t else 1 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(batch_size, dtype=torch.long)

        # Q(s, a) from the eval network for the actions actually taken.
        _, q_values = self.eval_model.predict_batch(states)
        q_values = q_values[batch_index, actions]

        # max_a' Q(s', a') — from the target network when enabled,
        # otherwise from the eval network itself.
        if self.use_target_model:
            actions, q_next = self.target_model.predict_batch(next_states)
        else:
            actions, q_next = self.eval_model.predict_batch(next_states)
        q_next = q_next[batch_index, actions]

        # TD target: r + beta * max_a' Q(s', a'), zeroed for terminals.
        q_max = q_next * terminal
        q_target = reward + self.beta * q_max

        self.eval_model.fit(q_values, q_target)

    def replace_target(self):
        """Copy the eval network's weights into the target network."""
        self.target_model.replace(self.eval_model)

    def greedy_policy(self, state):
        """Return the greedy action for `state` from the eval network."""
        return self.eval_model.predict(state)
def evaluation_worker(env, mem_server, trials):
    """Repeatedly fetch the latest saved model from the memory server and
    report its average reward over `trials` greedy episodes."""
    eval_model = DQNModel(len(env.reset()), len(ACTION_DICT))
    learn_done = False
    filedir = ""
    while not learn_done:
        filedir, learn_done = ray.get(mem_server.get_evaluate_filedir.remote())
        if not filedir:
            continue  # nothing new to evaluate yet
        eval_model.load(filedir)
        start_time = time.time()  # kept from the original; timing is unused
        total_reward = 0
        for _ in range(trials):
            state = env.reset()
            done = False
            steps = 0
            # Roll out one greedy episode, capped at the env step limit.
            while steps < env._max_episode_steps and not done:
                steps += 1
                state, reward, done, _ = env.step(eval_model.predict(state))
                total_reward += reward
        mem_server.add_results.remote(total_reward / trials)
def __init__(self, env, memory, action_space=2, test_interval=50):
    """Initialise server state from the global CartPole hyper-parameters."""
    self.collector_done = False
    self.evaluator_done = False
    self.env = env
    # self.max_episode_steps = env._max_episode_steps
    self.max_episode_steps = 200  # hard-coded CartPole episode cap

    # Exploration / training hyper-parameters from the global config.
    self.beta = hyperparams_CartPole['beta']
    self.initial_epsilon = 1
    self.final_epsilon = hyperparams_CartPole['final_epsilon']
    self.epsilon_decay_steps = hyperparams_CartPole['epsilon_decay_steps']
    self.batch_size = hyperparams_CartPole['batch_size']

    # Bookkeeping.
    self.episode = 0
    self.steps = 0
    self.best_reward = 0
    self.learning = True
    self.action_space = action_space
    self.previous_q_models = []
    # NOTE(review): sized off batch_size — a batch_num of
    # (episodes // test_interval) looks intended, as in the sibling
    # server classes; confirm before changing.
    self.results = [0] * (self.batch_size + 1)
    # NOTE(review): attribute-name typo kept ('reuslt_count') — other
    # code may reference it.
    self.reuslt_count = 0
    self.test_interval = test_interval
    self.memory = memory

    # Networks sized from the environment's state vector.
    observation = env.reset()
    self.eval_model = DQNModel(len(observation), action_space,
                               learning_rate=hyperparams_CartPole['learning_rate'])
    self.use_target_model = hyperparams_CartPole['use_target_model']
    if self.use_target_model:
        self.target_model = DQNModel(len(observation), action_space)

    self.update_steps = hyperparams_CartPole['update_steps']
    self.model_replace_freq = hyperparams_CartPole['model_replace_freq']
def __init__(self, env, hyper_params, memory_server):
    """Create the eval (and optional target) network for the model server.

    The network's input length equals the state-vector length; the
    output length equals the action space (len(ACTION_DICT)).
    """
    self.beta = hyper_params['beta']
    observation = env.reset()
    n_actions = len(ACTION_DICT)
    self.eval_model = DQNModel(len(observation), n_actions,
                               learning_rate=hyper_params['learning_rate'])
    # The target network stabilises Q-targets when enabled.
    self.use_target_model = hyper_params['use_target_model']
    if self.use_target_model:
        self.target_model = DQNModel(len(observation), n_actions)
    self.memory_server = memory_server
def __init__(self, hyper_params, memory_server, nb_agents, nb_evaluators,
             action_space=len(ACTION_DICT)):
    """Spin up the central learner plus remote collector/evaluator actors."""
    # Exploration and training schedule.
    self.beta = hyper_params['beta']
    self.initial_epsilon = 1
    self.final_epsilon = hyper_params['final_epsilon']
    self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
    self.hyper_params = hyper_params
    self.update_steps = hyper_params['update_steps']
    self.model_replace_freq = hyper_params['model_replace_freq']
    self.action_space = action_space
    self.batch_size = hyper_params['batch_size']
    self.memory_server = memory_server
    self.nb_agents = nb_agents
    self.nb_evaluators = nb_evaluators

    # Networks sized from a probe environment's state vector.
    probe_env = CartPoleEnv()
    observation = probe_env.reset()
    self.eval_model = DQNModel(len(observation), action_space,
                               learning_rate=hyper_params['learning_rate'])
    self.target_model = DQNModel(len(observation), action_space)

    # One remote collecting agent / evaluator per worker, each with its
    # own environment instance.
    self.agents = [
        DQN_agent_remote.remote(CartPoleEnv(), memory_server, hyper_params,
                                action_space, i)
        for i in range(nb_agents)
    ]
    self.evaluators = [
        EvalWorker.remote(self.eval_model, CartPoleEnv(),
                          hyper_params['max_episode_steps'],
                          hyper_params['eval_trials'], i)
        for i in range(nb_evaluators)
    ]
def __init__(self, learning_rate, training_episodes, memory, env,
             test_interval=50, batch_size=32, action_space=len(ACTION_DICT),
             beta=0.99):
    """Initialise the DQN server.

    BUG FIXES:
    * the trailing assignments read an undefined local `hyper_params`
      (NameError on every construction); they now read the
      module-level `hyperparams_CartPole` config used by the sibling
      server classes — TODO confirm this is the intended source.
    * `learning_rate` was accepted but ignored (hard-coded 0.0003).
    """
    self.env = env
    # self.max_episode_steps = env._max_episode_steps
    self.batch_num = training_episodes // test_interval
    self.steps = 0
    self.collector_done = False
    self.evaluator_done = False
    self.training_episodes = training_episodes
    self.episode = 0
    self.batch_size = batch_size
    # NOTE(review): attribute-name typo kept ('privous_q_model') —
    # referenced by sibling methods.
    self.privous_q_model = []
    self.results = [0] * (self.batch_num + 1)
    self.result_count = 0
    self.memory = memory
    self.use_target_model = True

    # Networks sized from the environment's state vector.
    state = env.reset()
    input_len = len(state)
    output_len = action_space
    # BUG FIX: honour the learning_rate argument.
    self.eval_model = DQNModel(input_len, output_len,
                               learning_rate=learning_rate)
    self.target_model = DQNModel(input_len, output_len)

    # BUG FIX: was `hyper_params[...]` (undefined name).
    self.update_steps = hyperparams_CartPole['update_steps']
    self.model_replace_freq = hyperparams_CartPole['model_replace_freq']
def __init__(self, env, hyper_params, memo_server):
    """Model server for the RL agent: networks, schedule and counters."""
    self.memory_server = memo_server
    self.env = env
    self.max_episode_steps = env._max_episode_steps

    # Training schedule.
    self.beta = hyper_params['beta']
    self.training_episodes = hyper_params['training_episodes']
    self.test_interval = hyper_params['test_interval']

    # Bookkeeping.
    action_space = len(ACTION_DICT)
    self.episode = 0
    self.steps = 0
    self.best_reward = 0
    self.learning = True
    self.action_space = action_space

    # Networks sized from the environment's state vector.
    observation = env.reset()
    self.eval_model = DQNModel(len(observation), action_space,
                               learning_rate=hyper_params['learning_rate'])
    self.use_target_model = hyper_params['use_target_model']
    if self.use_target_model:
        self.target_model = DQNModel(len(observation), action_space)

    # Training cadence.
    self.batch_size = hyper_params['batch_size']
    self.update_steps = hyper_params['update_steps']
    self.model_replace_freq = hyper_params['model_replace_freq']

    self.collector_done = False
    self.results = []

    # Epsilon-greedy schedule.
    self.initial_epsilon = 1
    self.final_epsilon = hyper_params['final_epsilon']
    self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
    self.epsilon = 1
    # NOTE(review): attribute-name typo kept ('replace_targe_cnt').
    self.replace_targe_cnt = 0
    self.eval_models_seq = 1  # sequence number for saved model files
def __init__(self, name):
    """Create the RL component.

    :param name: name of the rl_component
    """
    self.name = name
    # Becomes True once the underlying model has been set up.
    self.is_model_init = False
    # ROS service through which activations are requested.
    self._get_activation_service = rospy.Service(
        name + 'GetActivation', GetActivation,
        self._get_activation_state_callback)
    # The learning model backing this component.
    self.model = DQNModel(self.name)
    # Previously seen state (None until the first input arrives).
    self.last_state = None
    # Model dimensions; -1 until the model is initialised.
    self.number_outputs = -1
    self.number_inputs = -1
    self._unregistered = False
    # Cleanup hook — also saves the model on shutdown.
    rospy.on_shutdown(self.unregister)
class DQN_server():
    """Server half of the DQN setup: owns the eval/target networks and
    exposes predict/replace operations to the workers."""

    def __init__(self, env, hyper_params, action_space):
        """Build the networks from the environment's state vector.

        `beta` is the discount factor of the Q-value function; the
        epsilon schedule (1 -> final_epsilon over epsilon_decay_steps)
        lives in the agent, not here.
        """
        self.beta = hyper_params['beta']

        # Input size = state-vector length; output size = action space.
        observation = env.reset()
        self.eval_model = DQNModel(len(observation), action_space,
                                   learning_rate=hyper_params['learning_rate'])
        # target_model computes Q-values of next_state to update
        # eval_model (only when enabled).
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(len(observation), action_space)

        # Mini-batch size for training; the update/replace frequencies
        # are currently managed by the caller.
        self.batch_size = hyper_params['batch_size']
        print("server initialized")

    def replace_target_model(self):
        """Copy the eval network's weights into the target network."""
        self.target_model.replace(self.eval_model)

    def eval_model_predict(self, state):
        """Greedy action for `state` from the eval network."""
        return self.eval_model.predict(state)

    # The batch-update step (called from the main RL loop) should:
    # 1) sample a 'batch_size' batch of experiences from the memory;
    # 2) predict the Q-value from 'eval_model' for (states, actions);
    # 3) predict the Q-value from 'target_model' for next_states and
    #    take the max of each Q-value vector, Q_max;
    # 4) q_target = reward + discount * Q_max unless terminal, in which
    #    case q_target = reward;
    # 5) call fit() to back-propagate on 'eval_model'.
def update_batch(self, memory):
    """Run one TD-learning step on a mini-batch sampled from `memory`.

    Does nothing until the memory holds at least `batch_size` samples.
    """
    if memory.get_current_size() < self.batch_size:
        return

    states, actions, reward, next_states, is_terminal = memory.sample(
        self.batch_size)

    # 1 for terminal transitions; flipped below into a 0/1 mask that
    # zeroes the bootstrapped future reward.
    terminal = FloatTensor([1 if t else 0 for t in is_terminal])
    reward = FloatTensor(reward)
    batch_index = torch.arange(self.batch_size, dtype=torch.long)

    # Q(s, a) for the actions actually taken.
    _, q_values = self.eval_model.predict_batch(states)
    q_values = q_values[batch_index, actions]

    # max_a' Q(s', a'), from the target network when enabled.
    if self.use_target_model:
        best_actions, q_next = self.target_model.predict_batch(next_states)
    else:
        best_actions, q_next = self.eval_model.predict_batch(next_states)
    q_max = q_next[batch_index, best_actions]

    # Zero out the bootstrap term for terminal states.
    q_max *= 1 - terminal
    q_target = reward + self.beta * q_max

    self.eval_model.fit(q_values, q_target)


def save_model(self):
    """Persist the eval network ('result_floder' is a module-level path)."""
    self.eval_model.save(result_floder + '/best_model.pt')


def load_model(self):
    """Restore the eval network from the saved checkpoint."""
    self.eval_model.load(result_floder + '/best_model.pt')
class DQN_server(object):
    """Central DQN learner: trains the eval network from a remote replay
    memory and hands out snapshots of the network for evaluation.

    BUG FIXES:
    * `__init__` read an undefined name `hyper_params` (NameError); the
      update/replace frequencies now come from the module-level
      `hyperparams_CartPole` config — TODO confirm intended source.
    * `learning_rate` was accepted but ignored (hard-coded 0.0003).
    * `update_batch` used bare `beta` and `test_interval` (undefined
      names at method scope); they are now stored on `self` in the
      constructor and read from there.
    """

    def __init__(self, learning_rate, training_episodes, memory, env,
                 test_interval=50, batch_size=32,
                 action_space=len(ACTION_DICT), beta=0.99):
        self.env = env
        # self.max_episode_steps = env._max_episode_steps
        self.batch_num = training_episodes // test_interval
        self.steps = 0
        self.collector_done = False
        self.evaluator_done = False
        self.training_episodes = training_episodes
        self.test_interval = test_interval  # BUG FIX: was never stored
        self.episode = 0
        self.batch_size = batch_size
        # NOTE(review): attribute-name typo kept ('privous_q_model').
        self.privous_q_model = []
        self.results = [0] * (self.batch_num + 1)
        self.result_count = 0
        self.memory = memory
        self.use_target_model = True
        self.beta = beta  # BUG FIX: was never stored

        state = env.reset()
        # BUG FIX: honour learning_rate (was hard-coded 0.0003).
        self.eval_model = DQNModel(len(state), action_space,
                                   learning_rate=learning_rate)
        self.target_model = DQNModel(len(state), action_space)

        # BUG FIX: was `hyper_params[...]` (undefined name).
        self.update_steps = hyperparams_CartPole['update_steps']
        self.model_replace_freq = hyperparams_CartPole['model_replace_freq']

    def get_eval_model(self):
        """Tell the collector whether training is finished."""
        print(self.episode)
        if self.episode >= self.training_episodes:
            self.collector_done = True
        return self.collector_done

    def add_episode(self):
        """Count one finished collection episode."""
        self.episode += 1
        return self.episode

    def update_batch(self):
        """One TD-learning step from the remote replay memory; snapshots
        the eval network once per test interval."""
        if self.collector_done:
            return
        # Wait until enough samples exist and it is an update step.
        if (ray.get(self.memory.__len__.remote()) < self.batch_size
                or self.steps % self.update_steps != 0):
            return

        batch = ray.get(self.memory.sample.remote(self.batch_size))
        (states, actions, reward, next_states, is_terminal) = batch
        self.steps += self.update_steps

        # 1 marks a terminal transition (no bootstrapping).
        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)

        # Q(s, a) for the taken actions.
        _, q_values = self.eval_model.predict_batch(states)
        q_values = q_values[batch_index, actions]

        # Q-values of next states from the target (or eval) network.
        if self.use_target_model:
            _, q_next = self.target_model.predict_batch(next_states)
        else:
            _, q_next = self.eval_model.predict_batch(next_states)

        # TD target: reward for terminal states, otherwise
        # reward + beta * max_a' Q(s', a').
        q_targets = [0] * self.batch_size
        max_next = torch.max(q_next, dim=1).values
        for i in range(self.batch_size):
            if terminal[i] == 1:
                q_targets[i] = reward[i]
            else:
                # BUG FIX: was bare `beta` (undefined name).
                q_targets[i] = reward[i] + self.beta * max_next[i].data
        q_target = FloatTensor(q_targets)

        self.eval_model.fit(q_values, q_target)

        # Snapshot the eval network once per test interval.
        # BUG FIX: was bare `test_interval` (undefined name).
        if self.episode // self.test_interval + 1 > len(self.privous_q_model):
            self.privous_q_model.append(ray.put(self.eval_model))
        return self.steps

    # --- evaluator-facing API -----------------------------------------
    def add_result(self, result, num):
        """Record the evaluation result for snapshot `num`."""
        self.results[num] = result

    def get_results(self):
        return self.results

    def ask_evaluation(self):
        """Hand the next un-evaluated snapshot to an evaluator.

        Returns (model_ref, done, snapshot_index); model_ref is [] and
        the index is None when no new snapshot is available.
        """
        if len(self.privous_q_model) > self.result_count:
            num = self.result_count
            self.result_count += 1
            return self.privous_q_model[num], False, num
        if self.episode >= self.training_episodes:
            self.evaluator_done = True
        return [], self.evaluator_done, None

    def replace(self):
        """Copy eval weights into the target network."""
        self.target_model.replace(self.eval_model)

    def predict(self, state):
        """Greedy action from the eval network."""
        return self.eval_model.predict(state)
class RLComponent(object):
    """The rl component as a class.

    Functions as a bridge between manager and rl-algo. It can also be
    used in a separated node through its service interface.

    BUG FIX: both exception handlers logged `e.message`, which does not
    exist on Python 3 exceptions (and is deprecated on Python 2); they
    now log `str(e)`.
    """

    def __init__(self, name):
        """
        :param name: name of the rl_component
        """
        # name of the rl_component
        self.name = name
        # True if the model was set up
        self.is_model_init = False
        # Service for communicating the activations
        self._get_activation_service = rospy.Service(
            name + 'GetActivation', GetActivation,
            self._get_activation_state_callback)
        # choose appropriate model
        self.model = DQNModel(self.name)
        # save the last state
        self.last_state = None
        # the dimensions of the model
        self.number_outputs = -1
        self.number_inputs = -1
        self._unregistered = False
        rospy.on_shutdown(
            self.unregister)  # cleanup hook also for saving the model.

    def _get_activation_state_callback(self, request_msg):
        """
        answers the RL activation service and responds with the
        activations/reinforcements
        :param request_msg: GetActivation
        :return: Service Response
        """
        input_state = request_msg.input_state
        negative_states = request_msg.negative_states
        try:
            activation_state = self.get_activation_state(
                input_state, negative_states)
            return GetActivationResponse(activation_state)
        except Exception as e:
            rhbplog.logerr(str(e))  # BUG FIX: was e.message (Py2-only)
            return None

    def get_activation_state(self, input_state, negative_states=None):
        """
        Determine the activation/reinforcement for the given input
        states, save the state (combined with last state for training)
        :param input_state:
        :type input_state: InputState
        :param negative_states:
        :return: ActivationState
        """
        if negative_states is None:
            negative_states = []
        try:
            self.check_if_model_is_valid(input_state.num_inputs,
                                         input_state.num_outputs)

            if input_state.last_action:  # only save state if we have a valid prior action.
                # save current input state
                self.save_state(input_state)
                # update the last state, which would also be the starting
                # point for the negative states
                self.last_state = input_state.input_state

                # save negative states if available
                for state in negative_states:
                    self.save_state(state, is_extra_state=True)

                # update the model
                self.model.train_model()

            # transform the input state and get activation
            transformed_input = numpy.array(input_state.input_state).reshape(
                ([1, len(input_state.input_state)]))
            activations = self.model.feed_forward(transformed_input)

            # return the activation via the service
            activations = activations.tolist()[0]
            activation_state = ActivationState(
                **{
                    # sent for sanity check and planner status messages only
                    "name": self.name,
                    "activations": activations,
                })
            return activation_state
        except Exception as e:
            rhbplog.logerr(str(e))  # BUG FIX: was e.message (Py2-only)
            return None

    def save_state(self, input_state, is_extra_state=False):
        """
        save the old_state,new_state,action,reward tuple for batch
        updating of the model
        :param input_state: current state input (positive or negative)
        :type input_state: InputState
        :param is_extra_state: set to True if this is a special extra
            state (e.g. negative states) that is recorded but not
            necessarily has been explored/executed
        """
        if self.last_state is None:
            return
        last = numpy.array(self.last_state).reshape(
            ([1, len(self.last_state)]))
        new = numpy.array(input_state.input_state).reshape(
            ([1, len(input_state.input_state)]))
        reward_tuple = (last, new, input_state.last_action,
                        input_state.reward)
        self.model.add_sample(tuple=reward_tuple,
                              consider_reward=not is_extra_state)

    def check_if_model_is_valid(self, num_inputs, num_outputs):
        """
        checks if the in-/outputs are the same as the current model has.
        If not a new model is started
        :param num_inputs:
        :param num_outputs:
        """
        if not self.is_model_init:
            self.init_model(num_inputs, num_outputs)
        else:
            if (not self.number_outputs == num_outputs) or \
                    (not self.number_inputs == num_inputs):
                self.init_model(num_inputs, num_outputs)

    def init_model(self, num_inputs, num_outputs):
        """
        inits the model with the specified parameters
        :param num_inputs:
        :param num_outputs:
        """
        self.number_inputs = num_inputs
        self.number_outputs = num_outputs
        self.last_state = None
        self.model.start_nn(num_inputs, num_outputs)
        self.is_model_init = True

    def unregister(self):
        # Save the model at most once, even if called from both the
        # shutdown hook and __del__.
        if not self._unregistered:
            self._unregistered = True
            if self.model:
                self.model.save_model()

    def __del__(self):
        self.unregister()
class DQN_agent(object):
    """Epsilon-greedy DQN agent (training hooks left unimplemented)."""

    def __init__(self, env, hyper_params, action_space=len(ACTION_DICT)):
        """Cache the environment, hyper-parameters, networks and replay buffer."""
        self.env = env
        self.max_episode_steps = env._max_episode_steps

        # Linear epsilon schedule.
        self.beta = hyper_params['beta']
        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']

        # Counters / flags.
        self.episode = 0
        self.steps = 0
        self.best_reward = 0
        self.learning = True
        self.action_space = action_space

        # Networks.
        observation = env.reset()
        self.eval_model = DQNModel(len(observation), action_space,
                                   learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(len(observation), action_space)

        # Replay buffer and training cadence.
        self.memory = ReplayBuffer(hyper_params['memory_size'])
        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']

    def linear_decrease(self, initial_value, final_value, curr_steps,
                        final_decay_steps):
        """Linear interpolation from initial to final value, clamped once
        curr_steps passes final_decay_steps."""
        decay_rate = min(curr_steps / final_decay_steps, 1)
        return initial_value - (initial_value - final_value) * decay_rate

    def explore_or_exploit_policy(self, state):
        """With probability epsilon take a random action, else the greedy one."""
        p = uniform(0, 1)
        # Epsilon decreased linearly along with self.steps.
        epsilon = self.linear_decrease(self.initial_epsilon,
                                       self.final_epsilon, self.steps,
                                       self.epsilon_decay_steps)
        if p < epsilon:
            return randint(0, self.action_space - 1)  # explore
        return self.greedy_policy(state)  # exploit

    def greedy_policy(self, state):
        """Best action for `state` according to the eval network."""
        return self.eval_model.predict(state)

    def update_batch(self):
        pass

    def learn(self):
        pass
class RLAgent_model_server():
    """Model server: owns the networks, performs TD updates from the
    remote memory server and periodically saves snapshots for evaluation.

    BUG FIXES:
    * update_and_replace_model trained only when steps % update_steps
      != 0 (inverted test — cf. the in-process agent's learn loop which
      updates on == 0); it now trains when the counter IS a multiple of
      update_steps.
    * ask_evaluate read self.eval_models, which was never initialised;
      the constructor now creates it empty.
    """

    def __init__(self, env, hyper_params, memo_server):
        self.memory_server = memo_server
        self.env = env
        self.max_episode_steps = env._max_episode_steps

        # Training schedule.
        self.beta = hyper_params['beta']
        self.training_episodes = hyper_params['training_episodes']
        self.test_interval = hyper_params['test_interval']

        # Bookkeeping.
        action_space = len(ACTION_DICT)
        self.episode = 0
        self.steps = 0
        self.best_reward = 0
        self.learning = True
        self.action_space = action_space

        # Networks sized from the environment's state vector.
        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len, output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)

        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']

        self.collector_done = False
        self.results = []
        self.eval_models = []  # BUG FIX: used by ask_evaluate, was never set

        # Epsilon-greedy schedule.
        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
        # NOTE(review): attribute-name typo kept ('replace_targe_cnt').
        self.replace_targe_cnt = 0
        self.epsilon = 1
        self.eval_models_seq = 1  # sequence number for saved model files

    def update_batch(self):
        """One TD step on a batch sampled from the remote memory server."""
        batch = ray.get(self.memory_server.sample.remote(self.batch_size))
        if not batch:
            return
        (states, actions, reward, next_states, is_terminal) = batch

        # Mask is 0 for terminal transitions so no future reward is added.
        terminal = FloatTensor([0 if t else 1 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)

        # Q(s, a) for the taken actions.
        _, q_values = self.eval_model.predict_batch(states)
        q_values = q_values[batch_index, actions]

        # max_a' Q(s', a') from the target (or eval) network.
        if self.use_target_model:
            _, q_next = self.target_model.predict_batch(next_states)
        else:
            _, q_next = self.eval_model.predict_batch(next_states)
        max_q_next, _ = torch.max(q_next, dim=1)
        q_target = reward + self.beta * max_q_next * terminal

        self.eval_model.fit(q_values, q_target)

    def replace_target_model(self):
        """Sync target <- eval every model_replace_freq steps."""
        if self.use_target_model and self.steps % self.model_replace_freq == 0:
            self.target_model.replace(self.eval_model)

    def evaluate_result(self):
        """Count a finished episode; snapshot the model every test interval."""
        self.episode += 1
        if self.episode % self.test_interval == 0:
            self.save_model()

    def save_model(self):
        """Save the eval network and tell the memory server where it is."""
        filename = "/best_model{0}.pt".format(self.eval_models_seq)
        self.eval_model.save(result_floder + filename)
        self.memory_server.add_evamodel_dir.remote(result_floder + filename)
        self.eval_models_seq += 1

    def ask_evaluate(self):
        """Pop the next queued snapshot; report whether training is done."""
        if len(self.eval_models) == 0:
            return None, self.episode >= self.training_episodes
        eval_model, is_done = self.eval_models.pop(0)
        return eval_model, is_done

    def get_collector_done(self):
        return self.episode >= self.training_episodes

    def linear_decrease(self, initial_value, final_value, curr_steps,
                        final_decay_steps):
        """Linearly interpolate, clamping once curr_steps passes the horizon."""
        decay_rate = curr_steps / final_decay_steps
        if decay_rate > 1:
            decay_rate = 1
        return initial_value - (initial_value - final_value) * decay_rate

    def explore_or_exploit_policy(self, state):
        """Epsilon-greedy action selection with linearly decayed epsilon."""
        self.epsilon = self.linear_decrease(self.initial_epsilon,
                                            self.final_epsilon, self.steps,
                                            self.epsilon_decay_steps)
        if uniform(0, 1) < self.epsilon:
            return randint(0, self.action_space - 1)
        return self.greedy_policy(state)

    def greedy_policy(self, state):
        return self.eval_model.predict(state)

    def add_results(self, result):
        self.results.append(result)

    def get_reuslts(self):  # NOTE(review): method-name typo kept; callers use it
        return self.results

    def update_and_replace_model(self):
        """Advance the step counter, train on schedule, maybe sync target.

        BUG FIX: the update condition was inverted (`!= 0`), so the
        model trained on every step that was NOT on the schedule.
        """
        self.steps += 1
        if self.steps % self.update_steps == 0:
            self.update_batch()
        self.replace_target_model()
class DQN_agent(object):
    """Single-process DQN agent.

    Collects experience with an epsilon-greedy policy, trains from a replay
    buffer on a fixed step schedule, periodically syncs a target network, and
    evaluates with the greedy policy.
    """

    def __init__(self, env, hyper_params, action_space=len(ACTION_DICT)):
        self.env = env
        self.max_episode_steps = env._max_episode_steps
        self.beta = hyper_params['beta']  # discount factor
        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
        self.episode = 0
        self.steps = 0
        self.best_reward = 0
        self.learning = True
        self.action_space = action_space
        # Network input size is derived from the environment's state vector.
        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len, output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)
        self.memory = ReplayBuffer(hyper_params['memory_size'])
        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']

    def linear_decrease(self, initial_value, final_value, curr_steps, final_decay_steps):
        """Linear interpolation from initial to final value, clamped after decay steps."""
        decay_rate = curr_steps / final_decay_steps
        if decay_rate > 1:
            decay_rate = 1
        return initial_value - (initial_value - final_value) * decay_rate

    def explore_or_exploit_policy(self, state):
        """Random action with probability epsilon (linearly decayed), else greedy."""
        p = uniform(0, 1)
        epsilon = self.linear_decrease(self.initial_epsilon, self.final_epsilon,
                                       self.steps, self.epsilon_decay_steps)
        if p < epsilon:
            return randint(0, self.action_space - 1)
        return self.greedy_policy(state)

    def greedy_policy(self, state):
        return self.eval_model.predict(state)

    def update_batch(self):
        """One DQN update from a sampled minibatch.

        No-op when the buffer does not yet hold a full batch or the step
        counter is off the update schedule.
        """
        if len(self.memory) < self.batch_size or self.steps % self.update_steps != 0:
            return
        batch = self.memory.sample(self.batch_size)
        (states, actions, reward, next_states, is_terminal) = batch
        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)
        # Q(s, a) for the actions actually taken.
        _, q_values = self.eval_model.predict_batch(states)
        q_values = q_values[batch_index, actions]
        # Next-state value at its own best action (target net when enabled).
        if self.use_target_model:
            best_actions, q_next = self.target_model.predict_batch(next_states)
        else:
            best_actions, q_next = self.eval_model.predict_batch(next_states)
        q_next = q_next[batch_index, best_actions]
        # FIX: the original built q_target with a per-element Python loop over
        # the batch; this vectorized form yields the same values — reward alone
        # for terminal transitions, reward + beta * Q_next otherwise.
        q_target = reward + self.beta * q_next * (1 - terminal)
        self.eval_model.fit(q_values, q_target)

    def learn_and_evaluate(self, training_episodes, test_interval):
        """Alternate learning and evaluation; return the list of average rewards."""
        test_number = training_episodes // test_interval
        all_results = []
        for i in range(test_number):
            self.learn(test_interval)
            all_results.append(self.evaluate())
        return all_results

    def learn(self, test_interval):
        """Run test_interval training episodes, updating model and target net on schedule."""
        for episode in tqdm(range(test_interval), desc="Training"):
            state = self.env.reset()
            done = False
            steps = 0
            while steps < self.max_episode_steps and not done:
                action = self.explore_or_exploit_policy(state)
                next_state, reward, done, _ = self.env.step(action)
                self.memory.add(state, action, reward, next_state, done)
                # update_batch() re-checks the schedule internally as well.
                if self.steps % self.update_steps == 0:
                    self.update_batch()
                if self.use_target_model and self.steps % self.model_replace_freq == 0:
                    self.target_model.replace(self.eval_model)
                state = next_state
                steps += 1
                self.steps += 1

    def evaluate(self, trials=30):
        """Run greedy rollouts, log the average reward, checkpoint on a new best."""
        total_reward = 0
        for _ in tqdm(range(trials), desc="Evaluating"):
            state = self.env.reset()
            done = False
            steps = 0
            while steps < self.max_episode_steps and not done:
                steps += 1
                action = self.greedy_policy(state)
                state, reward, done, _ = self.env.step(action)
                total_reward += reward
        avg_reward = total_reward / trials
        print(avg_reward)
        # FIX: context manager guarantees the file is closed even if the write
        # fails (original used bare open()/close()).
        with open(result_file, "a+") as f:
            f.write(str(avg_reward) + "\n")
        if avg_reward >= self.best_reward:
            self.best_reward = avg_reward
            self.save_model()
        return avg_reward

    # save model
    def save_model(self):
        self.eval_model.save(result_floder + '/best_model.pt')

    # load model
    def load_model(self):
        self.eval_model.load(result_floder + '/best_model.pt')
def train_main(exp_prefix="",
               fc_units=[128, 64, 64],
               env_list=[],
               num_envs=10,
               num_obstacls_ratio=[0.2, 0.3, 0.3, 0.2],
               n_step=1,
               max_episodes=10000,
               max_steps=120,
               per_num_envs=8,
               replay_buffer_len=400,
               no_replay=False,
               batch_size=64,
               learning_rate=1e-4,
               epsilon_min=0.05,
               epsilon_max=0.10,
               gamma=0.98,
               without_map_info=False,
               save_interval=1000,
               show=False):
    """TensorFlow DQN training driver with n-step targets.

    Repeatedly rolls out `per_num_envs` episodes across randomly chosen
    environments, pushes them (with TD-error priorities) into a replay buffer,
    then runs several gradient updates per round. Logs to TensorBoard,
    evaluates every 100 episodes, and checkpoints weights periodically.

    NOTE(review): the mutable default arguments (fc_units, env_list,
    num_obstacls_ratio) are shared across calls — safe only if never mutated.
    `show` is accepted but never used here.
    """
    # create envs (only when the caller did not supply any)
    if len(env_list) == 0:
        env_list = create_or_load_envs(num_envs, num_obstacls_ratio)
    # create model — input size depends on whether map features are included
    if without_map_info:
        state_dims = 2 + 1
    else:
        state_dims = 4 * (2 + 2) + 6 + 2 + 2
    act_dims = 5
    model = DQNModel(state_dims=state_dims, act_dims=act_dims, fc_units=fc_units)
    print("create model done")
    # optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)  # previous choice
    optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    # create replay buffer
    buffer = ReplayBuffer(replay_buffer_len)
    print("create buffer done")
    # construct save path suffix
    weight_dir = os.path.join("weights", exp_prefix)
    dir_util.mkpath(weight_dir)
    log_dir = os.path.join("logs", exp_prefix)
    dir_util.mkpath(log_dir)
    summary_writer = tf.summary.create_file_writer(log_dir)
    # run simulations
    mean_loss_vals = []
    mean_ep_rewards = []
    last_save_ep_idx = 0
    for ep in range(max_episodes // per_num_envs):
        if no_replay:
            buffer.clear()  # on-policy mode: discard old experience each round
        num_new_samples = 0
        ep_rewards = []
        # randomly select envs and run rollouts
        # NOTE(review): `envs` is computed but never used — env_indices drives
        # the loop below.
        envs = np.random.choice(env_list, size=(per_num_envs))
        env_indices = np.random.randint(len(env_list), size=(per_num_envs))
        for roll_idx, env_idx in enumerate(env_indices):
            env = env_list[env_idx]
            episode_index = ep * per_num_envs + roll_idx
            # epsilon decays linearly from epsilon_max towards epsilon_min
            epsilon = epsilon_max - (
                epsilon_max - epsilon_min) / max_episodes * episode_index
            ship_state_trace, input_states, action_list, reward_list, done_list, is_random_act_list, qvals = run_one_episodes(
                env, model, epsilon, max_steps, without_map_info)
            # n-step TD errors used as priorities by the buffer
            # td_errors = (reward_list + qvals[1:] * gamma) - qvals[:-1]
            td_errors = get_n_step_estimated_qvals(reward_list, qvals[1:],
                                                   gamma, n_step) - qvals[:-1]
            buffer.add_items(input_states, action_list, reward_list, done_list,
                             td_errors)
            num_new_samples += len(input_states)
            ep_rewards.append(np.sum(reward_list))
            print(
                "episode {:4d}, env-{:03d}, epsilon: {:4.2f}, episode length: {:3d}, ep_reward: {:8.2f}"
                .format(episode_index, env_idx, epsilon, len(input_states),
                        np.sum(reward_list)))
            tot_ep_reward = np.sum(reward_list)
            avg_ep_reward = np.mean(reward_list)
            with summary_writer.as_default():
                tf.summary.scalar('tot_ep_reward_trn', tot_ep_reward,
                                  step=episode_index)
                tf.summary.scalar('avg_ep_reward_trn', avg_ep_reward,
                                  step=episode_index)
            if episode_index % 100 == 0:
                # run a greedy (epsilon=0) evaluation episode
                (eval_ship_state_trace, eval_input_states, eval_action_list,
                 eval_reward_list, eval_done_list, eval_is_random_act_list,
                 eval_qval_list) = run_one_episodes(env, model, 0, max_steps,
                                                    without_map_info)
                # log episode reward
                with summary_writer.as_default():
                    eval_tot_ep_reward = np.sum(eval_reward_list)
                    eval_avg_ep_reward = np.mean(eval_reward_list)
                    tf.summary.scalar('tot_ep_reward_evl', eval_tot_ep_reward,
                                      step=episode_index)
                    tf.summary.scalar('avg_ep_reward_evl', eval_avg_ep_reward,
                                      step=episode_index)
                # eval the loss: bootstrap from next states, masked on done
                eval_states_curr = np.array(eval_input_states[:-1])
                eval_states_next = np.array(eval_input_states[1:])
                eval_qvals_next = model(eval_states_next, training=False).numpy()
                eval_qvals_next_max = np.amax(
                    eval_qvals_next, axis=1) * (1 - np.array(eval_done_list))
                eval_qvals_esti = get_n_step_estimated_qvals(
                    eval_reward_list, eval_qvals_next_max, gamma, n_step)
                # to tensor
                eval_states_curr = tf.convert_to_tensor(
                    eval_states_curr, tf.float32)
                eval_action_list_tf = tf.convert_to_tensor(eval_action_list)
                eval_qvals_esti = tf.convert_to_tensor(eval_qvals_esti,
                                                       tf.float32)
                # eval to get loss
                eval_loss = eval_step_v0(model, eval_states_curr,
                                         eval_action_list_tf,
                                         eval_qvals_esti).numpy()
                with summary_writer.as_default():
                    tf.summary.scalar('loss_evl', eval_loss, step=episode_index)
                # draw map and state trace
                env.show(eval_ship_state_trace, np.sum(eval_reward_list),
                         eval_loss, eval_action_list, eval_is_random_act_list,
                         save_path="pictures", prefix=exp_prefix,
                         count=episode_index)
        # run update
        avg_ep_reward = float(np.mean(ep_rewards))
        mean_ep_rewards.append(avg_ep_reward)
        curr_update_loss_vals = []
        if no_replay:
            num_updates = 1
        else:
            # roughly one pass over the fresh samples, capped by buffer size
            num_updates = max(
                1, min(num_new_samples, replay_buffer_len) // batch_size)
        for _ in range(num_updates):
            # get qvals of next states
            if no_replay:
                # NOTE(review): rebinds the batch_size parameter for the rest
                # of this call — intentional in on-policy mode, but sticky.
                batch_size = max(1, int(num_new_samples * 0.8))  # overwrite batch_size
            states_curr, states_next, actions, rewards, dones = buffer.sample(
                batch_size)
            states_next = tf.convert_to_tensor(states_next, tf.float32)
            qvals_next = model(states_next, training=False).numpy()
            qvals_next = np.amax(qvals_next, axis=1) * (1 - dones)
            qvals_esti = get_n_step_estimated_qvals(rewards, qvals_next, gamma,
                                                    n_step)
            # to tensor
            states_curr = tf.convert_to_tensor(states_curr, tf.float32)
            actions = tf.convert_to_tensor(actions)
            qvals_esti = tf.convert_to_tensor(qvals_esti, tf.float32)
            # do an update
            loss_trn = train_step_v0(model, optimizer, states_curr, actions,
                                     qvals_esti).numpy()
            with summary_writer.as_default():
                # NOTE(review): episode_index here is from the LAST rollout of
                # the inner loop above.
                tf.summary.scalar('loss_trn', loss_trn, step=episode_index)
            curr_update_loss_vals.append(loss_trn)
            print("episode {:4d}, bs: {:4d}, loss_trn: {:6.2f}".format(
                episode_index, batch_size, loss_trn))
        mean_loss_vals.append(float(np.mean(curr_update_loss_vals)))
        # draw loss every 10 rounds
        if ep > 0 and ep % 10 == 0:
            draw_vals(mean_ep_rewards, mean_loss_vals, per_num_envs,
                      exp_prefix=exp_prefix)
            # save to file for further use
            # NOTE(review): the file handle from open() is never closed here.
            json.dump([mean_loss_vals, mean_ep_rewards],
                      open("logs/{}_logs_info.json".format(exp_prefix), "w"))
        # Save the weights using the `checkpoint_path` format
        if (episode_index - last_save_ep_idx) > save_interval:
            save_path = os.path.join(
                weight_dir, "weights_{:05d}.ckpt".format(episode_index))
            model.save_weights(save_path)
            last_save_ep_idx = episode_index
            print("episode-{}, save weights to: {}".format(
                episode_index, save_path))
class DQN_Model_Server():
    """Ray actor-style DQN model server: trains the eval network from a remote
    memory server and queues snapshots of it for evaluation workers.

    NOTE(review): several constructor parameters (hyper_params, update_steps,
    memory_size, beta, model_replace_freq, learning_rate, use_target_model)
    are accepted but then overridden by hard-coded values below — confirm
    which are authoritative.
    """

    def __init__(self, env, hyper_params, batch_size, update_steps, memory_size,
                 beta, model_replace_freq, learning_rate, use_target_model=True,
                 memory=Memory_Server, action_space=2, training_episodes=7000,
                 test_interval=50):
        # super().__init__(update_steps, memory_size, model_replace_freq, learning_rate, beta=0.99, batch_size = 32, use_target_model=True)
        self.batch_size = batch_size
        # Network input size is derived from the environment's state vector.
        state = env.reset()
        input_len = len(state)
        output_len = action_space
        # NOTE(review): learning_rate parameter ignored — 0.0003 hard-coded.
        self.eval_model = DQNModel(input_len, output_len, learning_rate=0.0003)
        self.target_model = DQNModel(input_len, output_len)
        self.steps = 0
        self.memory = memory  # remote memory server handle
        # self.memory = ReplayBuffer(hyper_params['memory_size'])
        self.prev = 0
        self.next = 0
        self.model_dq = deque()
        # One result slot per test interval (plus one spare).
        self.result = [0] * ((training_episodes // test_interval) + 1)
        self.previous_q_networks = []  # ray object refs of model snapshots
        self.result_count = 0
        self.learning_episodes = training_episodes
        self.episode = 0
        self.is_collection_completed = False
        self.evaluator_done = False
        self.batch_num = training_episodes // test_interval
        # NOTE(review): use_target_model/beta parameters ignored — hard-coded.
        self.use_target_model = True
        self.beta = 0.99
        self.test_interval = test_interval

    def get_evaluation_model(self):
        # Flips (and returns) the completion flag once enough episodes ran.
        if self.episode >= self.learning_episodes:
            self.is_collection_completed = True
        return self.is_collection_completed

    def replace(self):
        # Sync target network from the eval network.
        self.target_model.replace(self.eval_model)

    def get_total_steps(self):
        return self.steps

    def predict_next(self, state, e_model):
        # Predict with a caller-supplied model (e.g. a snapshot).
        return e_model.predict(state)

    def get_predict(self, state):
        return self.eval_model.predict(state)

    def set_collect_count(self):
        self.next += 1

    def set_collector_count(self):
        self.episode += 1

    def get_evaluation_count(self):
        return self.result_count

    def get_evaluator_count(self):
        return self.episode

    def ask_evaluation(self):
        """Hand out the next queued model snapshot for evaluation.

        Returns (model_ref, done_flag, slot_index); ([], done, None) when no
        snapshot is pending.
        """
        if len(self.previous_q_networks) > self.result_count:
            num = self.result_count
            evluation_q_network = self.previous_q_networks[num]
            self.result_count += 1
            # NOTE(review): episode is advanced by 50 per handed-out snapshot
            # here — presumably one test_interval's worth; confirm intent.
            self.episode += 50
            return evluation_q_network, False, num
        else:
            if self.episode >= self.learning_episodes:
                self.evaluator_done = True
            return [], self.evaluator_done, None

    def update_batch(self):
        """One DQN update from a remote minibatch; snapshots the model once
        per test interval. Returns the (coarse) step counter."""
        # NOTE(review): steps advances by 10 per call, not 1 — confirm.
        self.steps += 10
        if ray.get(self.memory.__len__.remote()) < self.batch_size:
            # or self.steps % self.update_steps != 0:
            return
        if self.is_collection_completed:
            return
        batch = ray.get(self.memory.sample.remote(self.batch_size))
        (states, actions, reward, next_states, is_terminal) = batch
        states = states  # no-op, kept as in original
        next_states = next_states  # no-op, kept as in original
        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)
        # Q(s, a) for the actions actually taken.
        _, q_values = self.eval_model.predict_batch(states)
        q_values = q_values[batch_index, actions]
        if self.use_target_model:
            actions, q_next = self.target_model.predict_batch(next_states)
        else:
            actions, q_next = self.eval_model.predict_batch(next_states)  # dont need though
        # Per-element target: reward only on terminal transitions.
        # NOTE(review): torch.max(q_next, 1) is recomputed on every loop
        # iteration — could be hoisted.
        q_targets = []
        for i in range(0, len(terminal), 1):
            if terminal[i] == 1:
                q_targets.append(reward[i])
            else:
                q_targets.append(reward[i] + (self.beta * torch.max(q_next, 1).values[i].data))
        q_target = FloatTensor(q_targets)
        self.eval_model.fit(q_values, q_target)
        # Snapshot the eval model once per test interval for evaluators.
        if self.episode // self.test_interval + 1 > len(self.previous_q_networks):
            model_id = ray.put(self.eval_model)
            self.previous_q_networks.append(model_id)
        return self.steps

    def set_results(self, result, num):
        self.result[num] = result

    def get_results(self):
        return self.result
class DQN_agent(object):
    """Single-process DQN agent (epsilon-greedy collection, replay training,
    optional target network, greedy evaluation).

    NOTE(review): this file defines a second class with the same name
    DQN_agent earlier; at import time this later definition wins.
    """

    def __init__(self, env, hyper_params, action_space=len(ACTION_DICT)):
        self.env = env
        self.max_episode_steps = env._max_episode_steps

        """
            beta: The discounted factor of Q-value function
            (epsilon): The explore or exploit policy epsilon.
            initial_epsilon: When the 'steps' is 0, the epsilon is initial_epsilon, 1
            final_epsilon: After the number of 'steps' reach 'epsilon_decay_steps',
                The epsilon set to the 'final_epsilon' determinately.
            epsilon_decay_steps: The epsilon will decrease linearly along with the steps
                from 0 to 'epsilon_decay_steps'.
        """
        self.beta = hyper_params['beta']
        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']

        """
            episode: Record training episode
            steps: Add 1 when predicting an action
            learning: The trigger of agent learning. It is on while training agent.
                It is off while testing agent.
            action_space: The action space of the current environment, e.g 2.
        """
        self.episode = 0
        self.steps = 0
        self.best_reward = 0
        self.learning = True
        self.action_space = action_space

        """
            input_len: The input length of the neural network. It equals to
                the length of the state vector.
            output_len: The output length of the neural network. It is equal
                to the action space.
            eval_model: The model for predicting action for the agent.
            target_model: The model for calculating Q-value of next_state to
                update 'eval_model'.
            use_target_model: Trigger for turn 'target_model' on/off
        """
        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len, output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)

        # memory: Store and sample experience replay.
        self.memory = ReplayBuffer(hyper_params['memory_size'])

        """
            batch_size: Mini batch size for training model.
            update_steps: The frequence of traning model
            model_replace_freq: The frequence of replacing 'target_model'
                by 'eval_model'
        """
        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']
        print("agent initialized")

    # Linear decrease function for epsilon
    def linear_decrease(self, initial_value, final_value, curr_steps,
                        final_decay_steps):
        decay_rate = curr_steps / final_decay_steps
        if decay_rate > 1:
            decay_rate = 1
        return initial_value - (initial_value - final_value) * decay_rate

    def explore_or_exploit_policy(self, state):
        """Random action with probability epsilon (linearly decayed), else greedy."""
        p = uniform(0, 1)
        # Get decreased epsilon
        epsilon = self.linear_decrease(self.initial_epsilon,
                                       self.final_epsilon, self.steps,
                                       self.epsilon_decay_steps)
        if p < epsilon:
            # return random action
            return randint(0, self.action_space - 1)
        else:
            # return greedy action
            return self.greedy_policy(state)

    def greedy_policy(self, state):
        return self.eval_model.predict(state)

    # Update the neural network given a batch of experience:
    # 1) Sample a 'batch_size' batch of experiences from the memory.
    # 2) Predict the Q-value from the 'eval_model' based on (states, actions)
    # 3) Predict the Q-value from the 'target_model' based on (next_states),
    #    and take the max of each Q-value vector, Q_max
    # 4) If is_terminal == 1, q_target = reward, otherwise
    #    q_target = reward + discounted factor * Q_max
    # 5) Call fit() to do the back-propagation for 'eval_model'.
    def update_batch(self):
        # Skip when the buffer is not full enough or off the update schedule.
        if len(self.memory) < self.batch_size or self.steps % self.update_steps != 0:
            return
        batch = self.memory.sample(self.batch_size)
        (states, actions, reward, next_states, is_terminal) = batch
        states = states  # no-op, kept as in original
        next_states = next_states  # no-op, kept as in original
        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)
        # Current Q Values for the taken actions
        _, q_values = self.eval_model.predict_batch(states)
        # q_values = q_values[np.arange(self.batch_size), actions]
        q_values = q_values[batch_index, actions]
        # Calculate target: next-state value at its own best action
        if self.use_target_model:
            best_actions, q_next = self.target_model.predict_batch(next_states)
        else:
            best_actions, q_next = self.eval_model.predict_batch(next_states)
        q_max = q_next[batch_index, best_actions]
        # Invert terminal flags so terminal transitions mask the bootstrap.
        terminal = 1 - terminal
        q_max *= terminal
        q_target = reward + self.beta * q_max
        # update model
        self.eval_model.fit(q_values, q_target)

    def learn_and_evaluate(self, training_episodes, test_interval):
        """Alternate learning and evaluation; return the average rewards."""
        test_number = training_episodes // test_interval
        all_results = []
        for i in range(test_number):
            # learn
            self.learn(test_interval)
            # evaluate
            avg_reward = self.evaluate()
            all_results.append(avg_reward)
        return all_results

    def learn(self, test_interval):
        """Run test_interval episodes of epsilon-greedy collection + training."""
        for episode in tqdm(range(test_interval), desc="Training"):
            state = self.env.reset()
            done = False
            steps = 0
            while steps < self.max_episode_steps and not done:
                # add experience from explore-exploit policy to memory
                action = self.explore_or_exploit_policy(state)
                next_state, reward, done, info = self.env.step(action)
                self.memory.add(state, action, reward, next_state, done)
                # update the model (update_batch gates on update_steps itself)
                self.update_batch()
                # replace the target network every model_replace_freq steps
                if self.use_target_model and (self.steps % self.model_replace_freq == 0):
                    self.target_model.replace(self.eval_model)
                self.steps += 1
                steps += 1
                state = next_state

    def evaluate(self, trials=30):
        """Greedy rollouts; append the average reward to the results file and
        checkpoint when it matches or beats the best so far."""
        total_reward = 0
        for _ in tqdm(range(trials), desc="Evaluating"):
            state = self.env.reset()
            done = False
            steps = 0
            while steps < self.max_episode_steps and not done:
                steps += 1
                action = self.greedy_policy(state)
                state, reward, done, _ = self.env.step(action)
                total_reward += reward
        avg_reward = total_reward / trials
        print(avg_reward)
        f = open(result_file, "a+")
        f.write(str(avg_reward) + "\n")
        f.close()
        if avg_reward >= self.best_reward:
            self.best_reward = avg_reward
            self.save_model()
        return avg_reward

    # save model
    def save_model(self):
        self.eval_model.save(result_floder + '/best_model.pt')

    # load model
    def load_model(self):
        self.eval_model.load(result_floder + '/best_model.pt')
# NOTE(review): this chunk begins mid-function — the enclosing `def` header
# (apparently a tic-tac-toe board printer looping over `row`, building `s`)
# was lost upstream of this line. Tokens preserved as-is; `row`, `s` and
# `env` are defined in the missing part.
for col in range(3):
    if env.board[row, col] == 0:
        # empty cell: show its move index (0-8)
        s += (str(row * 3 + col))
    elif env.board[row, col] == 1:
        s += 'X'
    else:
        s += 'O'
    if col < 2:
        s += '|'  # column separator
print(s)

# --- interactive play script: human vs. trained model ---
e = TTT()
e.reset()
model = DQNModel.load('awesome2.pb')
is_finished = False
output = None
winner = None  # NOTE(review): assigned but never used below
# NOTE(review): is_finished is never updated from `output`, so this loop does
# not terminate on game over as written — presumably the result of e.step()
# was meant to set it; confirm against TTT.step's return value.
while not is_finished:
    print_board(e)
    user_input = input('\nChoose your move ')
    output = e.step(int(user_input))
    if not is_finished:
        # show the model's Q-value for every cell, then let it move
        predicted_qs = model.predict(e.board)[0]
        for index, q in enumerate(predicted_qs):
            print('{}: {}'.format(index, q))
        ai_action = model.get_top_action(e.board)
        output = e.step(ai_action)
class DQN_model_server(object):
    """Ray-backed DQN model server for CartPole: trains the eval network from
    a remote memory actor and queues model snapshots for evaluators.

    NOTE(review): hyperparameters come from the module-level
    hyperparams_CartPole dict, not from a constructor argument.
    """

    def __init__(self, env, memory, action_space=2, test_interval=50):
        self.collector_done = False
        self.evaluator_done = False
        self.env = env
        # self.max_episode_steps = env._max_episode_steps
        self.max_episode_steps = 200  # hard-coded CartPole episode cap
        self.beta = hyperparams_CartPole['beta']  # discount factor
        self.initial_epsilon = 1
        self.final_epsilon = hyperparams_CartPole['final_epsilon']
        self.epsilon_decay_steps = hyperparams_CartPole['epsilon_decay_steps']
        self.batch_size = hyperparams_CartPole['batch_size']
        self.episode = 0
        self.steps = 0
        self.best_reward = 0
        self.learning = True
        self.action_space = action_space
        self.previous_q_models = []  # ray object refs of model snapshots
        # NOTE(review): sized by batch_size + 1, while the sibling servers in
        # this file size their result list by (episodes // test_interval) + 1;
        # this may be too small — confirm.
        self.results = [0] * (self.batch_size + 1)
        self.reuslt_count = 0  # (sic) misspelled attribute kept — used below
        self.episode = 0  # duplicate assignment kept as in original
        self.test_interval = test_interval
        self.memory = memory  # remote memory actor handle
        # Network input size is derived from the environment's state vector.
        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len, output_len,
                                   learning_rate=hyperparams_CartPole['learning_rate'])
        self.use_target_model = hyperparams_CartPole['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)
        # # memory: Store and sample experience replay.
        # self.memory = ReplayBuffer(hyper_params['memory_size'])
        self.batch_size = hyperparams_CartPole['batch_size']  # re-assigned, same value
        self.update_steps = hyperparams_CartPole['update_steps']
        self.model_replace_freq = hyperparams_CartPole['model_replace_freq']

    def get_steps(self):
        return self.steps

    def update_batch(self):
        """One DQN update from a remote minibatch; snapshots the model once
        per test interval. Returns the (coarse) step counter."""
        # if len(memory) < self.batch_size or self.steps % self.update_steps != 0:
        #     return
        # print(len(self.memory.remote()))
        batch = self.memory.sample.remote(self.batch_size)
        (states, actions, reward, next_states, is_terminal) = ray.get(batch)
        states = states  # no-op, kept as in original
        next_states = next_states  # no-op, kept as in original
        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)
        # Current Q Values for the taken actions
        # NOTE(review): q_values/q_next/q_target are stored on self although
        # they are only used within this call.
        _, self.q_values = self.eval_model.predict_batch(states)
        self.q_values = self.q_values[batch_index, actions]
        # Calculate target: next-state value at its own best action
        if self.use_target_model:
            actions, self.q_next = self.target_model.predict_batch(next_states)
            self.q_next = self.q_next[batch_index, actions]
        else:
            actions, self.q_next = self.eval_model.predict_batch(next_states)
            self.q_next = self.q_next[batch_index, actions]
        # Per-element target: reward only on terminal transitions.
        self.q_target = []
        for i in range(len(reward)):
            if terminal[i] == 1:
                self.q_target.append(reward[i])
            else:
                self.q_target.append(reward[i] + self.beta * self.q_next[i])
        self.q_target = FloatTensor(self.q_target)
        # update model
        self.eval_model.fit(self.q_values, self.q_target)
        # ~1%-probability debug print of a sample prediction/target pair
        if(np.random.randint(100)==4):
            print("==========",self.q_values[0],self.q_target[0])
        # print("..................................................", self.evaluate())
        # score = self.evaluate()
        # f_results = open("./results_8_4.txt", "a+")
        # f_results.write(str(score) + "\n")
        # f_results.close()
        # Snapshot the eval model once per test interval for evaluators.
        if self.episode // self.test_interval + 1 > len(self.previous_q_models):
            model_id = ray.put(self.eval_model)
            self.previous_q_models.append(model_id)
        # NOTE(review): steps advances by 10 per call, not 1 — confirm.
        self.steps += 10
        return self.steps

    def greedy_policy(self, state):
        return self.eval_model.predict(state)

    def replace_target(self):
        # Sync target network from the eval network.
        return self.target_model.replace(self.eval_model)

    # evalutor
    # def add_result(self, result):
    #     self.results[num] = result

    def get_reuslts(self):  # (sic) misspelled name kept — callers use this spelling
        return self.results

    def ask_evaluation(self):
        """Hand out the next queued model snapshot for evaluation.

        Returns (model_ref, done_flag, slot_index); ([], done, None) when no
        snapshot is pending.
        NOTE(review): training_episodes below is a module-level global, not an
        attribute of this class.
        """
        if len(self.previous_q_models) > self.reuslt_count:
            num = self.reuslt_count
            evluation_q_model = self.previous_q_models[num]
            self.reuslt_count += 1
            return evluation_q_model, False, num
        else:
            if self.episode >= training_episodes:
                self.evaluator_done = True
            return [], self.evaluator_done, None

    def add_episode(self):
        self.episode += 1
def train(environment, starting_model_path=None, episodes=15000):
    """Train a DQN policy on `environment` with a target network.

    Starts from the checkpoint at `starting_model_path` when given, otherwise
    from freshly initialised weights. Experience is kept in a bounded replay
    list; training, target-network syncs, evaluation and checkpointing all run
    on schedules taken from the module-level `hparams` dict.
    """
    # Initialise policy/target networks — from checkpoint or from scratch.
    if starting_model_path:
        policy_model = DQNModel.load(starting_model_path)
        target_model = DQNModel.load(starting_model_path)
        print('loaded model {}'.format(starting_model_path))
    else:
        print('starting model from scratch')
        policy_model = DQNModel()
        target_model = DQNModel()
        target_model.set_weights(policy_model.get_weights())

    print('Begin training...')
    replay_memory = []
    # NOTE(review): starting at 0.0 means max(min_epsilon, 0*decay) pins
    # epsilon at min_epsilon forever — the decay schedule is effectively dead.
    # Intentional only if this is fine-tuning a pre-trained model; confirm.
    epsilon = 0.0

    for episode_i in range(episodes):
        # Collect one episode and keep only the newest max_mem_size samples.
        replay_memory.extend(play_out_episode(policy_model, environment, epsilon))
        replay_memory = replay_memory[-hparams['max_mem_size']:]
        epsilon = max(hparams['min_epsilon'], epsilon * hparams['epsilon_decay'])

        # Train only once the warm-up threshold is reached.
        if len(replay_memory) < hparams['min_mem_size']:
            continue

        minibatch = random.sample(replay_memory, hparams['batch_size'])
        do_training_step(policy_model, target_model, minibatch)

        # Periodically sync the target network with the policy network.
        if episode_i % hparams['target_model_update_every'] == 0:
            target_model.set_weights(policy_model.get_weights())

        # Periodically evaluate and checkpoint.
        if episode_i % hparams['evaluation_every'] == 0:
            info = evaluate_model(policy_model, environment)
            print('===================== episode {}, epsilon {}'.format(episode_i, epsilon))
            print(info)
            print('======================================')
            policy_model.save('checkpoint-{}'.format(episode_i))
class Model_Server():
    """Ray-backed DQN model server: trains the eval network from a remote
    memory actor and queues model snapshots for evaluation workers."""

    def __init__(self, env, hyper_params, memory, action_space):
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
        self.final_epsilon = hyper_params['final_epsilon']
        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.beta = hyper_params['beta']  # discount factor
        self.model_replace_freq = hyper_params['model_replace_freq']
        self.learning_rate = hyper_params['learning_rate']
        self.training_episodes = hyper_params['training_episodes']
        self.test_interval = hyper_params['test_interval']
        self.memory = memory  # remote memory actor handle
        self.episode = 0
        self.steps = 0
        self.result_count = 0
        self.next = 0
        self.batch_num = self.training_episodes // self.test_interval
        # Network input size is derived from the environment's state vector.
        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len, output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.target_model = DQNModel(input_len, output_len)
        # One result slot per test interval (plus one spare).
        self.results = [0] * (self.batch_num + 1)
        self.previous_q_networks = []  # ray object refs of model snapshots
        self.collector_done = False
        self.evaluator_done = False

    def ask_evaluation(self):
        """Hand out the next queued model snapshot for evaluation.

        Returns (model_ref, done_flag, slot_index); ([], done, None) when no
        snapshot is pending.
        """
        if len(self.previous_q_networks) > self.result_count:
            num = self.result_count
            evaluation_q_network = self.previous_q_networks[num]
            self.result_count += 1
            return evaluation_q_network, False, num
        else:
            if self.episode >= self.training_episodes:
                self.evaluator_done = True
            return [], self.evaluator_done, None

    def get_evaluation_model(self):
        # Returns the live eval model plus a collection-finished flag.
        if self.episode >= self.training_episodes:
            self.collector_done = True
        return self.eval_model, self.collector_done

    def replace_with_eval_model(self):
        # Sync target network from the eval network.
        self.target_model.replace(self.eval_model)

    def get_model_steps(self):
        return self.steps

    def predict_next_eval(self, state, eval_model):
        # Predict with a caller-supplied model (e.g. a snapshot).
        return eval_model.predict(state)

    def get_predict(self, state):
        return self.eval_model.predict(state)

    def increment_episode(self):
        self.episode += 1

    def increment_model_steps(self):
        self.steps += 1
        return self.steps

    def update_batch(self):
        """One DQN update from a remote minibatch; snapshots the model once
        per test interval. Returns the step counter.

        NOTE(review): steps advances by update_steps per call, before the
        early-out checks — confirm that is the intended accounting.
        """
        self.steps += self.update_steps
        if ray.get(self.memory.__len__.remote()) < self.batch_size:
            # or self.steps % self.update_steps != 0:
            return
        if self.collector_done:
            return
        batch = ray.get(self.memory.sample.remote(self.batch_size))
        (states, actions, reward, next_states, is_terminal) = batch
        states = states  # no-op, kept as in original
        next_states = next_states  # no-op, kept as in original
        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)
        # Current Q Values for the taken actions
        _, q_values = self.eval_model.predict_batch(states)
        q_values = q_values[batch_index, actions]
        # Calculate target: max next-state Q from the target network
        actions, q_next = self.target_model.predict_batch(next_states)
        q_max, indices = torch.max(q_next, dim=1)
        # Per-element target: reward only on terminal transitions.
        q_targets = []
        for i, is_term in enumerate(terminal):
            if is_term == 1:
                q_targets.append(reward[i])
            else:
                q_targets.append(reward[i] + self.beta * q_max[i])
        q_targets_tensor = FloatTensor(q_targets)
        # update model
        self.eval_model.fit(q_values, q_targets_tensor)
        # Snapshot the eval model once per test interval for evaluators.
        if self.episode // self.test_interval + 1 > len(self.previous_q_networks):
            model_id = ray.put(self.eval_model)
            self.previous_q_networks.append(model_id)
        return self.steps

    def add_result(self, reward, num):
        self.results[num] = reward

    def get_results(self):
        return self.results
class ModelServer():
    """Central DQN trainer coordinating remote collector agents and
    evaluation workers via Ray."""

    def __init__(self, hyper_params, memory_server, nb_agents, nb_evaluators,
                 action_space=len(ACTION_DICT)):
        self.beta = hyper_params['beta']  # discount factor
        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
        self.hyper_params = hyper_params
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']
        self.action_space = action_space
        self.batch_size = hyper_params['batch_size']
        self.memory_server = memory_server
        self.nb_agents = nb_agents
        self.nb_evaluators = nb_evaluators
        # Throwaway env used only to size the network input.
        env = CartPoleEnv()
        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len, output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.target_model = DQNModel(input_len, output_len)
        # Remote collector agents, each with its own environment instance.
        self.agents = [
            DQN_agent_remote.remote(CartPoleEnv(), memory_server, hyper_params,
                                    action_space, i)
            for i in range(nb_agents)
        ]
        # Remote evaluation workers sharing the eval model.
        self.evaluators = [
            EvalWorker.remote(self.eval_model, CartPoleEnv(),
                              hyper_params['max_episode_steps'],
                              hyper_params['eval_trials'], i)
            for i in range(nb_evaluators)
        ]

    # Linear decrease function for epsilon
    def linear_decrease(self, initial_value, final_value, curr_steps,
                        final_decay_steps):
        decay_rate = curr_steps / final_decay_steps
        if decay_rate > 1:
            decay_rate = 1
        return initial_value - (initial_value - final_value) * decay_rate

    def update_batch(self):
        """One DQN update from a minibatch sampled off the memory server."""
        batch = self.memory_server.sample.remote(self.batch_size)
        (states, actions, reward, next_states, is_terminal) = ray.get(batch)
        if len(states) < self.batch_size:
            return  # memory not yet filled
        # beta for non-terminal transitions, 0 for terminal (masks bootstrap).
        nonterminal_x_beta = FloatTensor(
            [0 if t else self.beta for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)
        # Current Q Values for the taken actions
        _, q_values = self.eval_model.predict_batch(states)
        q_values = q_values[batch_index, actions]
        # Calculate target from the target network's max next-state Q
        actions, q_next = self.target_model.predict_batch(next_states)
        q_targets = reward + nonterminal_x_beta * torch.max(q_next, 1).values
        # update model
        self.eval_model.fit(q_values, q_targets)

    def learn(self, test_interval, epsilon):
        """Dispatch a collection round to currently-idle collector agents."""
        # determine which collectors are idle
        ready_ids, _ = ray.wait(
            [agent.pingback.remote() for agent in self.agents], num_returns=1)
        # NOTE(review): this assumes pingback() returns the agent's own index,
        # used to select from self.agents below — confirm against the agent.
        ready_agents = ray.get(ready_ids)
        # send eval model to idle collectors, initiate collection
        for agent_id in ready_agents:
            self.agents[agent_id].collect.remote(self.eval_model,
                                                 test_interval, epsilon)

    def evaluate(self, all_results):
        """Collect average rewards from currently-idle evaluators into all_results."""
        # determine which evaluators are idle
        ready_ids, _ = ray.wait(
            [evaluator.pingback.remote() for evaluator in self.evaluators],
            num_returns=1)
        # NOTE(review): same pingback-returns-index assumption as in learn().
        ready_evaluators = ray.get(ready_ids)
        # send eval model to idle evaluators, get results
        for evaluator_id in ready_evaluators:
            avg_reward = ray.get(
                self.evaluators[evaluator_id].evaluate.remote())
            all_results.append(avg_reward)

    def learn_and_evaluate(self, training_episodes, test_interval):
        """Main driver: alternate collection, training, target sync, and
        evaluation; return the accumulated evaluation results.

        NOTE(review): self.steps is first assigned here, not in __init__.
        """
        test_number = training_episodes // test_interval
        all_results = []
        for i in range(test_number):
            self.steps = i * test_interval
            # Get decreased epsilon
            epsilon = self.linear_decrease(self.initial_epsilon,
                                           self.final_epsilon, self.steps,
                                           self.epsilon_decay_steps)
            # send eval model to collectors, have them collect experience
            self.learn(test_interval, epsilon)
            # batch-update the eval model on schedule
            if self.steps % self.update_steps == 0:
                self.update_batch()
            # replace target model on schedule
            if self.steps % self.model_replace_freq == 0:
                self.target_model.replace(self.eval_model)
            # send eval model to evaluators, record results
            self.evaluate(all_results)
        return all_results