state = tetris.get_state()
episodeReward = 0
step = 0
while step < maxStepPerEpisode:
    # if render and numSteps % renderStepDuration == 0:
    #     image = tetris.get_printed_state()
    #     plt.imshow(image)
    #     plt.savefig(episodeDirectory + "s%d.jpg" % step)
    action = select_action(state)
    step += 1
    next_state, reward, done = tetris.step(action)
    if done:
        next_state = None
    memory.add((state, action, reward, next_state))
    state = next_state
    episodeReward += reward
    train()
    numSteps += 1
    if numSteps % numStepPerUpdate == 0:
        targetNet.load_state_dict(policyNet.state_dict())
    if done:
        break
# if render:
#     image = tetris.get_printed_state()
#     plt.imshow(image)
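
# The episode loop above relies on a `select_action` helper defined elsewhere. Below is a
# minimal epsilon-greedy sketch of such a helper, not the source's implementation: the fixed
# `eps` default, the global `numActions`, and acting greedily w.r.t. `policyNet` are assumptions.
import random
import torch

def select_action(state, eps=0.1):
    # with probability eps explore with a random action, otherwise act greedily w.r.t. policyNet
    if random.random() < eps:
        return random.randrange(numActions)
    with torch.no_grad():
        q_values = policyNet(torch.as_tensor(state, dtype=torch.float32).unsqueeze(0))
        return int(q_values.argmax(dim=1).item())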
def train_dqn(env, num_steps, *, replay_size, batch_size, exploration, gamma,
              train_freq=1, print_freq=100, target_network_update_freq=500,
              t_learning_start=1000):
    """
    DQN algorithm.

    Compared to previous training procedures, we train for a given number of
    time-steps rather than a given number of episodes. The number of time-steps
    is on the order of millions, which still results in many episodes being executed.

    Args:
        - env: The OpenAI Gym environment
        - num_steps: Total number of steps to be used for training
        - replay_size: Maximum size of the ReplayMemory
        - batch_size: Number of experiences in a batch
        - exploration: An ExponentialSchedule
        - gamma: The discount factor

    Returns: (dqn_model, returns, lengths, losses)
        - dqn_model: The trained DQN model
        - returns: Numpy array containing the return of each training episode
        - lengths: Numpy array containing the length of each training episode
        - losses: Numpy array containing the loss of each training batch
    """
    # check that environment states are compatible with our DQN representation
    assert (isinstance(env.observation_space, gym.spaces.Box)
            and len(env.observation_space.shape) == 1)

    # get the state_size from the environment
    state_size = env.observation_space.shape[0]

    # initialize the DQN and DQN-target models
    dqn_model = DQN(state_size, env.action_space.n)
    dqn_target = DQN.custom_load(dqn_model.custom_dump())

    # initialize the optimizer
    optimizer = torch.optim.Adam(dqn_model.parameters(), lr=5e-4)

    # initialize the replay memory
    memory = ReplayMemory(replay_size, state_size)

    # initiate lists to store returns, lengths and losses
    rewards = []
    returns = []
    lengths = []
    losses = []
    last_100_returns = deque(maxlen=100)
    last_100_lengths = deque(maxlen=100)

    # initiate structures to store the models at different stages of training
    saved_models = {}

    i_episode = 0
    t_episode = 0

    state = env.reset()

    # iterate for a total of `num_steps` steps
    for t_total in range(num_steps):
        # use t_total to indicate the time-step from the beginning of training
        if t_total >= t_learning_start:
            eps = exploration.value(t_total - t_learning_start)
        else:
            eps = 1.0

        action = select_action_epsilon_greedy(dqn_model, state, eps, env)
        next_state, reward, done, _ = env.step(action)
        memory.add(state, action, reward, next_state, done)
        rewards.append(reward)
        state = next_state

        if t_total >= t_learning_start and t_total % train_freq == 0:
            batch = memory.sample(batch_size)
            loss = train_dqn_batch(optimizer, batch, dqn_model, dqn_target, gamma)
            losses.append(loss)

        # update the target network
        if t_total >= t_learning_start and t_total % target_network_update_freq == 0:
            dqn_target.load_state_dict(dqn_model.state_dict())

        if done:
            # calculate the discounted episode return
            G = 0
            for i in range(len(rewards)):
                G += rewards[i] * pow(gamma, i)

            # collect results
            lengths.append(t_episode + 1)
            returns.append(G)
            last_100_returns.append(G)
            last_100_lengths.append(t_episode + 1)

            if i_episode % print_freq == 0:
                logger.record_tabular("time step", t_total)
                logger.record_tabular("episodes", i_episode)
                logger.record_tabular("step", t_episode + 1)
                logger.record_tabular("return", G)
                logger.record_tabular("mean reward", np.mean(last_100_returns))
                logger.record_tabular("mean length", np.mean(last_100_lengths))
                logger.record_tabular("% time spent exploring", int(100 * eps))
                logger.dump_tabular()

            # end of episode, so reset the episode clock and the rewards list
            t_episode = 0
            rewards = []

            # the environment terminated, so reset it
            state = env.reset()

            # increment the episode index
            i_episode += 1
        else:
            t_episode += 1

    return (
        dqn_model,
        np.array(returns),
        np.array(lengths),
        np.array(losses),
    )
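
# A hedged usage sketch (not from the source): the CartPole environment and the
# ExponentialSchedule constructor arguments are assumptions; adapt them to the actual
# helpers defined in this codebase.
import gym

env = gym.make('CartPole-v1')
# assumed signature: ExponentialSchedule(start_eps, end_eps, num_decay_steps)
exploration = ExponentialSchedule(1.0, 0.05, 100_000)
dqn_model, returns, lengths, losses = train_dqn(
    env,
    num_steps=500_000,
    replay_size=200_000,
    batch_size=64,
    exploration=exploration,
    gamma=0.99,
)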
class Learner(object):
    def __init__(self, params, param_set_id, status_dict, shared_state, remote_mem):
        self.params = params
        self.param_set_id = param_set_id
        self.status_dict = status_dict
        self.shared_state = shared_state
        self.remote_mem = remote_mem

        gpu = 0
        torch.cuda.set_device(gpu)

        ep = params['env']
        ap = params['actor']
        lp = params['learner']
        rmp = params["replay_memory"]

        # the model and optimizer classes are built from configuration strings via eval below
        model_formula = f'model.{lp["model"]}(self.state_shape, self.action_dim).to(self.device)'
        optimizer_formula = lp["optimizer"].format('self.Q.parameters()')

        self.conn = psycopg2.connect(params["db"]["connection_string"])
        self.conn.autocommit = True
        self.cur = self.conn.cursor()

        self.device = torch.device("cuda:{}".format(gpu) if 0 <= gpu and torch.cuda.is_available() else "cpu")
        self.state_shape = ep['state_shape']
        self.batch_size = lp['replay_sample_size']
        self.action_dim = ep['action_dim']
        self.q_target_sync_freq = lp['q_target_sync_freq']
        self.num_q_updates = 0
        self.take_offsets = (torch.arange(self.batch_size) * self.action_dim).to(self.device)
        self.Q = eval(model_formula)
        self.Q_target = eval(model_formula)  # target Q network, a slow-moving replica of self.Q
        self.optimizer = eval(optimizer_formula)

        self.replay_memory = ReplayMemory(rmp)
        self.train_num = 0
        self.model_file_name = lp['load_saved_state']
        if self.model_file_name and os.path.isfile(self.model_file_name):
            print(f'Loading {self.model_file_name}')
            saved_state = torch.load(self.model_file_name)
            self.Q.load_state_dict(saved_state['module'])
            self.optimizer.load_state_dict(saved_state['optimizer'])
            self.train_num = saved_state['train_num']

        self.shared_state['Q_state_dict'] = (self.state_dict_to_cpu(self.Q.state_dict()),
                                             self.state_dict_to_cpu(self.Q_target.state_dict()))
        self.status_dict['Q_state_dict_stored'] = True
        self.last_Q_state_dict_id = 1
        self.status_dict['Q_state_dict_id'] = self.last_Q_state_dict_id
        self.status_dict['train_num'] = self.train_num

        self.gamma_n = params['actor']['gamma'] ** params['actor']['num_steps']

    def state_dict_to_cpu(self, state_dict):
        d = OrderedDict()
        for k, v in state_dict.items():
            d[k] = v.cpu()
        return d

    def add_experience_to_replay_mem(self):
        while self.remote_mem.qsize():
            priorities, batch = self.remote_mem.get()
            self.replay_memory.add(priorities, batch)

    def compute_loss_and_priorities(self, batch_size):
        indices, n_step_transition_batch, before_priorities = self.replay_memory.sample(batch_size)

        s = n_step_transition_batch[0].to(self.device)
        a = n_step_transition_batch[1].to(self.device)
        r = n_step_transition_batch[2].to(self.device)
        a_latest = n_step_transition_batch[3].to(self.device)
        s_latest = n_step_transition_batch[4].to(self.device)
        terminal = n_step_transition_batch[5].to(self.device)

        q = self.Q(s)
        q_a = q.take(self.take_offsets + a).squeeze()

        with torch.no_grad():
            self.Q_target.eval()
            Gt = r + (1.0 - terminal) * self.gamma_n * self.Q_target(s_latest).take(self.take_offsets + a_latest).squeeze()
            td_error = Gt - q_a

        loss = F.smooth_l1_loss(q_a, Gt)
        # loss = td_error**2 / 2

        # compute the new priorities of the experience
        after_priorities = td_error.data.abs().cpu().numpy()
        self.replay_memory.set_priorities(indices, after_priorities)

        return loss, q, before_priorities, after_priorities, indices

    def update_Q(self, loss):
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.num_q_updates += 1

        if self.num_q_updates % self.q_target_sync_freq == 0:
            self.Q_target.load_state_dict(self.Q.state_dict())
            print('Target Q synchronized.')
            return True
        else:
            return False

    def learn(self):
        t = tables.LearnerData()
        record_type = t.get_record_type()
        record_insert = t.get_insert()
        cur = self.cur
        param_set_id = self.param_set_id
        now = datetime.datetime.now

        step_num = 0
        target_sync_num = 0
        send_param_num = 0
        min_replay_mem_size = self.params['learner']["min_replay_mem_size"]

        print('learner waiting for replay memory.')
        while self.replay_memory.size() <= min_replay_mem_size:
            self.add_experience_to_replay_mem()
            time.sleep(0.01)

        step_num = 0
        print('learner start')
        while not self.status_dict['quit']:
            self.add_experience_to_replay_mem()
            # 4. Sample a prioritized batch of transitions
            # 5. & 7. Apply the double-Q learning rule, compute the loss and the experience priorities
            # 8. Update priorities
            loss, q, before_priorities, after_priorities, indices = self.compute_loss_and_priorities(self.batch_size)
            if step_num % 10 == 0:
                print(f'loss : {loss}')
            # print("\nLearner: step_num=", step_num, "loss:", loss, "RPM.size:", self.replay_memory.size(), end='\r')

            # 6. Update the parameters of the Q network(s)
            if self.update_Q(loss):
                target_sync_num += 1
            if step_num % 5 == 0:
                self.shared_state['Q_state_dict'] = (self.state_dict_to_cpu(self.Q.state_dict()),
                                                     self.state_dict_to_cpu(self.Q_target.state_dict()))
                self.last_Q_state_dict_id += 1
                self.status_dict['Q_state_dict_id'] = self.last_Q_state_dict_id
                print('Send params to actors.')
                send_param_num += 1

            # 9. Periodically remove old experience from the replay memory
            step_num += 1
            self.train_num += 1
            self.status_dict['train_num'] = self.train_num

            # insert the training record into the database
            r = record_type(param_set_id, now(), self.train_num, step_num, loss.item(), q[0].tolist(),
                            before_priorities.tolist(), after_priorities.tolist(), indices.tolist(),
                            target_sync_num, send_param_num)
            record_insert(cur, r)

        print('learner end')

        state_dict = {'module': self.Q.state_dict(), 'optimizer': self.optimizer.state_dict(),
                      'train_num': self.train_num}
        torch.save(state_dict, self.model_file_name)
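
# A hedged launch sketch (not from the source): one way the Learner could be wired to the shared
# structures its constructor expects. The use of a multiprocessing Manager dict and Queue, and the
# contents of `params` (env/actor/learner/replay_memory/db sections), are assumptions; the real
# pipeline builds these objects elsewhere.
import torch.multiprocessing as mp

def run_learner(params, param_set_id):
    mgr = mp.Manager()
    status_dict = mgr.dict({'quit': False})   # actors/learner coordinate through this dict
    shared_state = mgr.dict()                  # holds CPU copies of the Q / Q_target state dicts
    remote_mem = mp.Queue()                    # actors push (priorities, batch) tuples here
    learner = Learner(params, param_set_id, status_dict, shared_state, remote_mem)
    learner.learn()                            # runs until status_dict['quit'] is set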
def train(sess, environment, actor, critic, embeddings, history_length, ra_length, buffer_size,
          batch_size, discount_factor, nb_episodes, filename_summary, nb_rounds, **env_args):
    ''' Algorithm 3 in the article. '''

    # Set up summary operators
    def build_summaries():
        episode_reward = tf.Variable(0.)
        tf.summary.scalar('reward', episode_reward)
        episode_max_Q = tf.Variable(0.)
        tf.summary.scalar('max_Q_value', episode_max_Q)
        critic_loss = tf.Variable(0.)
        tf.summary.scalar('critic_loss', critic_loss)
        summary_vars = [episode_reward, episode_max_Q, critic_loss]
        summary_ops = tf.summary.merge_all()
        return summary_ops, summary_vars

    summary_ops, summary_vars = build_summaries()
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(filename_summary, sess.graph)

    # '2: Initialize target network f′ and Q′'
    actor.init_target_network()
    critic.init_target_network()

    # '3: Initialize the capacity of replay memory D'
    replay_memory = ReplayMemory(buffer_size)  # memory D in the article
    replay = False

    start_time = time.time()
    for i_session in range(nb_episodes):  # '4: for session = 1, M do'
        session_reward = 0
        session_Q_value = 0
        session_critic_loss = 0

        # '5: Reset the item space I' is a no-op here because the item space is unchanged.
        nb_env = 10
        envs = np.asarray([Environment(**env_args) for i in range(nb_env)])
        # u = [e.current_user for e in envs]
        # print(u)
        # input()
        states = np.array([env.current_state for env in envs])  # '6: Initialize state s_0 from previous sessions'

        # if (i_session + 1) % 10 == 0:  # Update average parameters every 10 episodes
        #     environment.groups = environment.get_groups()

        exploration_noise = OrnsteinUhlenbeckNoise(history_length * embeddings.size())

        for t in range(nb_rounds):  # '7: for t = 1, T do'
            # '8: Stage 1: Transition Generating Stage'

            # '9: Select an action a_t = {a_t^1, ..., a_t^K} according to Algorithm 2'
            actions, item_idxes = actor.get_recommendation_list(
                ra_length,
                states.reshape(nb_env, -1),  # TODO + exploration_noise.get().reshape(1, -1),
                embeddings)

            # '10: Execute action a_t and observe the reward list {r_t^1, ..., r_t^K} for each item in a_t'
            for env, state, action, items in zip(envs, states, actions, item_idxes):
                sim_results, rewards, next_state = env.step(action, items)

                # '19: Store transition (s_t, a_t, r_t, s_t+1) in D'
                replay_memory.add(state.reshape(history_length * embeddings.size()),
                                  action.reshape(ra_length * embeddings.size()),
                                  [rewards],
                                  next_state.reshape(history_length * embeddings.size()))

                state = next_state  # '20: Set s_t = s_t+1'
                session_reward += rewards

            # '21: Stage 2: Parameter Updating Stage'
            if replay_memory.size() >= batch_size * nb_env:  # Experience replay
                replay = True
                replay_Q_value, critic_loss = experience_replay(replay_memory, batch_size,
                                                                actor, critic, embeddings, ra_length,
                                                                history_length * embeddings.size(),
                                                                ra_length * embeddings.size(),
                                                                discount_factor)
                session_Q_value += replay_Q_value
                session_critic_loss += critic_loss

        summary_str = sess.run(summary_ops,
                               feed_dict={summary_vars[0]: session_reward,
                                          summary_vars[1]: session_Q_value,
                                          summary_vars[2]: session_critic_loss})
        writer.add_summary(summary_str, i_session)

        '''
        print(state_to_items(embeddings.embed(data['state'][0]), actor, ra_length, embeddings),
              state_to_items(embeddings.embed(data['state'][0]), actor, ra_length, embeddings, True))
        '''

        str_loss = str('Loss=%0.4f' % session_critic_loss)
        print(('Episode %d/%d Reward=%d Time=%ds ' + (str_loss if replay else 'No replay'))
              % (i_session + 1, nb_episodes, session_reward, time.time() - start_time))
        start_time = time.time()

    writer.close()
    tf.train.Saver().save(sess, 'models.h5', write_meta_graph=False)
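
# The exploration noise used in `train` is only referenced, not defined here. Below is a minimal
# Ornstein-Uhlenbeck process sketch that matches the call sites (constructor takes a size, `get()`
# returns a sample); the mu/theta/sigma defaults are assumptions, not the source's values.
import numpy as np

class OrnsteinUhlenbeckNoise:
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.state = self.mu.copy()

    def get(self):
        # mean-reverting random walk: dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state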