Example #1
    # Body of one training episode: interact with the Tetris environment, store
    # transitions in the replay memory, train the policy network, and periodically
    # sync the target network with the policy network.
    state = tetris.get_state()
    episodeReward = 0
    step = 0
    while step < maxStepPerEpisode:

        # if render and numSteps % renderStepDuration == 0:
        #     image = tetris.get_printed_state()
        #     plt.imshow(image)
        #     plt.savefig(episodeDirectory + "s%d.jpg" % step)

        action = select_action(state)
        step += 1
        next_state, reward, done = tetris.step(action)
        if done:
            next_state = None
        memory.add((state, action, reward, next_state))
        state = next_state
        episodeReward += reward

        train()

        numSteps += 1
        if numSteps % numStepPerUpdate == 0:
            targetNet.load_state_dict(policyNet.state_dict())

        if done:
            break

    # if render:
    #     image = tetris.get_printed_state()
    #     plt.imshow(image)
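
The loop above relies on helpers that are not shown here (select_action, train, the replay memory, and the policyNet/targetNet models). Below is a minimal sketch of an epsilon-greedy select_action, assuming policyNet is a torch.nn.Module that maps a 1-D state tensor to per-action Q-values; the epsilon parameter and the tensor conversion are illustrative, not the original implementation.

import random

import torch


def select_action(state, epsilon=0.1):
    # Q-values from the policy network (policyNet is assumed to be the same
    # network referenced in the training loop above).
    with torch.no_grad():
        q_values = policyNet(torch.as_tensor(state, dtype=torch.float32).unsqueeze(0))
    if random.random() < epsilon:
        return random.randrange(q_values.shape[1])  # explore: uniform random action
    return int(q_values.argmax(dim=1).item())       # exploit: greedy action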
def train_dqn(env,
              num_steps,
              *,
              replay_size,
              batch_size,
              exploration,
              gamma,
              train_freq=1,
              print_freq=100,
              target_network_update_freq=500,
              t_learning_start=1000):
    """
    DQN algorithm.

    Compared to previous training procedures, we will train for a given number
    of time-steps rather than a given number of episodes.  The number of
    time-steps will be in the range of millions, which still results in many
    episodes being executed.

    Args:
        - env: The OpenAI Gym environment
        - num_steps: Total number of steps to be used for training
        - replay_size: Maximum size of the ReplayMemory
        - batch_size: Number of experiences in a batch
        - exploration: An ExponentialSchedule for the exploration rate epsilon
        - gamma: The discount factor
        - train_freq: Number of time-steps between gradient updates
        - print_freq: Number of episodes between logging calls
        - target_network_update_freq: Number of time-steps between target-network syncs
        - t_learning_start: Number of warm-up time-steps collected before learning starts

    Returns: (dqn_model, returns, lengths, losses)
        - dqn_model: The trained DQN model
        - returns: Numpy array containing the return of each training episode
        - lengths: Numpy array containing the length of each training episode
        - losses: Numpy array containing the loss of each training batch
    """
    # check that environment states are compatible with our DQN representation
    assert (isinstance(env.observation_space, gym.spaces.Box)
            and len(env.observation_space.shape) == 1)

    # get the state_size from the environment
    state_size = env.observation_space.shape[0]

    # initialize the DQN and DQN-target models
    dqn_model = DQN(state_size, env.action_space.n)
    dqn_target = DQN.custom_load(dqn_model.custom_dump())

    # initialize the optimizer
    optimizer = torch.optim.Adam(dqn_model.parameters(), lr=5e-4)

    # initialize the replay memory
    memory = ReplayMemory(replay_size, state_size)

    # initialize lists to store per-step rewards, episode returns, episode lengths and batch losses
    rewards = []
    returns = []
    lengths = []
    losses = []

    last_100_returns = deque(maxlen=100)
    last_100_lengths = deque(maxlen=100)

    # structure to store models at different stages of training (not populated below)
    saved_models = {}

    i_episode = 0
    t_episode = 0

    state = env.reset()

    # iterate for a total of `num_steps` steps
    for t_total in range(num_steps):
        # use t_total to indicate the time-step from the beginning of training

        if t_total >= t_learning_start:
            eps = exploration.value(t_total - t_learning_start)
        else:
            eps = 1.0
        action = select_action_epsilon_greedy(dqn_model, state, eps, env)
        next_state, reward, done, _ = env.step(action)
        memory.add(state, action, reward, next_state, done)

        rewards.append(reward)
        state = next_state

        if t_total >= t_learning_start and t_total % train_freq == 0:
            batch = memory.sample(batch_size)
            loss = train_dqn_batch(optimizer, batch, dqn_model, dqn_target,
                                   gamma)
            losses.append(loss)

        # update target network
        if t_total >= t_learning_start and t_total % target_network_update_freq == 0:
            dqn_target.load_state_dict(dqn_model.state_dict())

        if done:

            # Calculate episode returns
            G = 0
            for i in range(len(rewards)):
                G += rewards[i] * pow(gamma, i)

            # Collect results
            lengths.append(t_episode + 1)
            returns.append(G)

            last_100_returns.append(G)
            last_100_lengths.append(t_episode + 1)

            if i_episode % print_freq == 0:
                logger.record_tabular("time step", t_total)

                logger.record_tabular("episodes", i_episode)
                logger.record_tabular("step", t_episode + 1)
                logger.record_tabular("return", G)
                logger.record_tabular("mean reward", np.mean(last_100_returns))
                logger.record_tabular("mean length", np.mean(last_100_lengths))

                logger.record_tabular("% time spent exploring", int(100 * eps))
                logger.dump_tabular()

            # End of episode so reset time, reset rewards list
            t_episode = 0
            rewards = []

            # Environment terminated so reset it
            state = env.reset()

            # Increment the episode index
            i_episode += 1

        else:
            t_episode += 1

    return (
        dqn_model,
        np.array(returns),
        np.array(lengths),
        np.array(losses),
    )
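
train_dqn delegates the gradient step to train_dqn_batch, which is not shown in this example. The sketch below implements a standard one-step TD update; the batch attribute names (states, actions, rewards, next_states, dones) are an assumed layout for the sampled batch, not the original ReplayMemory contract.

import torch
import torch.nn.functional as F


def train_dqn_batch(optimizer, batch, dqn_model, dqn_target, gamma):
    # Q(s, a) for the actions that were actually taken.
    q_values = dqn_model(batch.states).gather(
        1, batch.actions.long().view(-1, 1)).squeeze(1)

    # Bootstrapped target r + gamma * max_a' Q_target(s', a'), zeroed at terminal states.
    with torch.no_grad():
        next_q = dqn_target(batch.next_states).max(dim=1).values
        targets = batch.rewards + gamma * (1.0 - batch.dones.float()) * next_q

    loss = F.mse_loss(q_values, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()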
Example #3
class Learner(object):

	def __init__(self, params, param_set_id, status_dict, shared_state, remote_mem):
		self.params = params
		self.param_set_id = param_set_id
		self.status_dict = status_dict
		self.shared_state = shared_state
		self.remote_mem = remote_mem

		gpu = 0
		if torch.cuda.is_available():
			torch.cuda.set_device(gpu)

		ep = params['env']
		ap = params['actor']
		lp = params['learner']
		rmp = params["replay_memory"]

		# The model class and the optimizer are specified as strings in the config and
		# instantiated via eval() once the required attributes are set below.
		model_formula = f'model.{lp["model"]}(self.state_shape, self.action_dim).to(self.device)'
		optimizer_formula = lp["optimizer"].format('self.Q.parameters()')

		self.conn = psycopg2.connect(params["db"]["connection_string"])
		self.conn.autocommit = True
		self.cur = self.conn.cursor()

		self.device = torch.device("cuda:{}".format(gpu) if 0 <= gpu and torch.cuda.is_available() else "cpu")
		self.state_shape = ep['state_shape']
		self.batch_size = lp['replay_sample_size']
		self.action_dim = ep['action_dim']
		self.q_target_sync_freq = lp['q_target_sync_freq']
		self.num_q_updates = 0
		self.take_offsets = (torch.arange(self.batch_size) * self.action_dim).to(self.device)
		self.Q = eval(model_formula)
		self.Q_target = eval(model_formula) # Target Q network which is slow moving replica of self.Q
		self.optimizer = eval(optimizer_formula)
		self.replay_memory = ReplayMemory(rmp)

		self.train_num = 0
		self.model_file_name = lp['load_saved_state']
		if self.model_file_name and os.path.isfile(self.model_file_name):
			print(f'Loading {self.model_file_name}')
			saved_state = torch.load(self.model_file_name)
			self.Q.load_state_dict(saved_state['module'])
			self.optimizer.load_state_dict(saved_state['optimizer'])
			self.train_num = saved_state['train_num']

		self.shared_state['Q_state_dict'] = self.state_dict_to_cpu(self.Q.state_dict()), self.state_dict_to_cpu(
		    self.Q_target.state_dict())
		self.status_dict['Q_state_dict_stored'] = True

		self.last_Q_state_dict_id = 1
		self.status_dict['Q_state_dict_id'] = self.last_Q_state_dict_id
		self.status_dict['train_num'] = self.train_num

		self.gamma_n = params['actor']['gamma']**params['actor']['num_steps']

	def state_dict_to_cpu(self, state_dict):
		d = OrderedDict()
		for k, v in state_dict.items():
			d[k] = v.cpu()
		return d

	def add_experience_to_replay_mem(self):
		while self.remote_mem.qsize():
			priorities, batch = self.remote_mem.get()
			self.replay_memory.add(priorities, batch)

	def compute_loss_and_priorities(self, batch_size):
		indices, n_step_transition_batch, before_priorities = self.replay_memory.sample(batch_size)

		s = n_step_transition_batch[0].to(self.device)
		a = n_step_transition_batch[1].to(self.device)
		r = n_step_transition_batch[2].to(self.device)
		a_latest = n_step_transition_batch[3].to(self.device)
		s_latest = n_step_transition_batch[4].to(self.device)
		terminal = n_step_transition_batch[5].to(self.device)

		q = self.Q(s)
		q_a = q.take(self.take_offsets + a).squeeze()

		with torch.no_grad():
			self.Q_target.eval()
			Gt = r + (1.0 - terminal) * self.gamma_n * self.Q_target(s_latest).take(self.take_offsets + a_latest).squeeze()
			td_error = Gt - q_a

		loss = F.smooth_l1_loss(q_a, Gt)
		# loss = td_error**2 / 2

		# Compute the new priorities of the experience
		after_priorities = td_error.data.abs().cpu().numpy()
		self.replay_memory.set_priorities(indices, after_priorities)

		return loss, q, before_priorities, after_priorities, indices

	def update_Q(self, loss):
		self.optimizer.zero_grad()
		loss.backward()
		self.optimizer.step()
		self.num_q_updates += 1

		if self.num_q_updates % self.q_target_sync_freq == 0:
			self.Q_target.load_state_dict(self.Q.state_dict())
			print('Target Q synchronized.')
			return True
		else:
			return False

	def learn(self):
		t = tables.LearnerData()
		record_type = t.get_record_type()
		record_insert = t.get_insert()
		cur = self.cur
		param_set_id = self.param_set_id
		now = datetime.datetime.now
		step_num = 0
		target_sync_num = 0
		send_param_num = 0
		min_replay_mem_size = self.params['learner']["min_replay_mem_size"]

		print('learner waiting for replay memory.')
		while self.replay_memory.size() <= min_replay_mem_size:
			self.add_experience_to_replay_mem()
			time.sleep(0.01)
		step_num = 0
		print('learner start')
		while not self.status_dict['quit']:
			self.add_experience_to_replay_mem()
			# 4. Sample a prioritized batch of transitions
			# 5. & 7. Apply double-Q learning rule, compute loss and experience priorities
			# 8. Update priorities
			loss, q, before_priorities, after_priorities, indices = self.compute_loss_and_priorities(self.batch_size)
			if step_num % 10 == 0:
				print(f'loss : {loss}')
			#print("\nLearner: step_num=", step_num, "loss:", loss, "RPM.size:", self.replay_memory.size(), end='\r')
			# 6. Update parameters of the Q network(s)
			if self.update_Q(loss):
				target_sync_num += 1
			if step_num % 5 == 0:
				self.shared_state['Q_state_dict'] = self.state_dict_to_cpu(self.Q.state_dict()), self.state_dict_to_cpu(
				    self.Q_target.state_dict())
				self.last_Q_state_dict_id += 1
				self.status_dict['Q_state_dict_id'] = self.last_Q_state_dict_id
				print('Send params to actors.')
				send_param_num += 1

			# 9. Periodically remove old experience from replay memory
			step_num += 1
			self.train_num += 1
			self.status_dict['train_num'] = self.train_num

			# Insert the training record into the DB
			r = record_type(param_set_id, now(), self.train_num,
			                step_num, loss.item(), q[0].tolist(), before_priorities.tolist(), after_priorities.tolist(),
			                indices.tolist(), target_sync_num, send_param_num)
			record_insert(cur, r)

		print('learner end')

		if self.model_file_name:
			state_dict = {'module': self.Q.state_dict(), 'optimizer': self.optimizer.state_dict(), 'train_num': self.train_num}
			torch.save(state_dict, self.model_file_name)
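
The Learner above assumes a prioritized ReplayMemory exposing add, sample, set_priorities and size. A minimal proportional-prioritization sketch with that interface follows; the 'capacity' config key, the alpha exponent, and the internal storage are assumptions, and the only contract it tries to honor is that sample returns (indices, batch of 6 stacked tensors, priorities) in the layout that compute_loss_and_priorities unpacks.

import numpy as np
import torch


class ReplayMemory:
	def __init__(self, params):
		# 'capacity' is an assumed config key; adjust to the real replay_memory params.
		self.capacity = int(params.get('capacity', 1_000_000)) if isinstance(params, dict) else int(params)
		self.transitions = []  # each entry: (s, a, r, a_latest, s_latest, terminal)
		self.priorities = []   # one priority per stored transition
		self.alpha = 0.6       # prioritization exponent (illustrative value)

	def size(self):
		return len(self.transitions)

	def add(self, priorities, batch):
		# `batch` is assumed to be an iterable of transitions with matching priorities.
		for p, transition in zip(priorities, batch):
			if len(self.transitions) >= self.capacity:
				self.transitions.pop(0)
				self.priorities.pop(0)
			self.transitions.append(transition)
			self.priorities.append(float(p))

	def sample(self, batch_size):
		probs = np.asarray(self.priorities, dtype=np.float64) ** self.alpha
		probs /= probs.sum()
		indices = np.random.choice(len(self.transitions), batch_size, p=probs)
		# Stack each of the 6 transition fields across the sampled rows.
		fields = list(zip(*(self.transitions[i] for i in indices)))
		batch = [torch.stack([torch.as_tensor(x) for x in field]) for field in fields]
		before_priorities = np.asarray([self.priorities[i] for i in indices])
		return indices, batch, before_priorities

	def set_priorities(self, indices, priorities):
		# Note: indices become stale if older entries were evicted in the meantime.
		for i, p in zip(indices, priorities):
			self.priorities[int(i)] = float(p)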
def train(sess, environment, actor, critic, embeddings, history_length, ra_length, buffer_size, batch_size,
	  discount_factor, nb_episodes, filename_summary, nb_rounds, **env_args):
	''' Algorithm 3 in article. '''

	# Set up summary operators
	def build_summaries():
		episode_reward = tf.Variable (0.)
		tf.summary.scalar ('reward', episode_reward)
		episode_max_Q = tf.Variable (0.)
		tf.summary.scalar ('max_Q_value', episode_max_Q)
		critic_loss = tf.Variable (0.)
		tf.summary.scalar ('critic_loss', critic_loss)

		summary_vars = [episode_reward, episode_max_Q, critic_loss]
		summary_ops = tf.summary.merge_all ()
		return summary_ops, summary_vars

	summary_ops, summary_vars = build_summaries ()
	sess.run (tf.global_variables_initializer ())
	writer = tf.summary.FileWriter (filename_summary, sess.graph)

	# '2: Initialize target network f′ and Q′'
	actor.init_target_network ()
	critic.init_target_network ()

	# '3: Initialize the capacity of replay memory D'
	replay_memory = ReplayMemory(buffer_size)  # Memory D in article
	replay = False

	start_time = time.time ()
	for i_session in range (nb_episodes):  # '4: for session = 1, M do'
		session_reward = 0
		session_Q_value = 0
		session_critic_loss = 0

		# '5: Reset the item space I' is a no-op here because the item space never changes.
		nb_env = 10
		envs = np.asarray([Environment(**env_args) for i in range(nb_env)])
# 		u = [e.current_user for e in envs]
# 		print(u)
# 		input()
		states = np.array([env.current_state for env in envs])  # '6: Initialize state s_0 from previous sessions'
		
	#         if (i_session + 1) % 10 == 0:  # Update average parameters every 10 episodes
	#             environment.groups = environment.get_groups ()

		exploration_noise = OrnsteinUhlenbeckNoise (history_length * embeddings.size ())

		for t in range (nb_rounds):  # '7: for t = 1, T do'
			# '8: Stage 1: Transition Generating Stage'

			# '9: Select an action a_t = {a_t^1, ..., a_t^K} according to Algorithm 2'
			actions, item_idxes = actor.get_recommendation_list (
				ra_length,
				states.reshape (nb_env, -1),  # TODO + exploration_noise.get().reshape(1, -1),
				embeddings)

			# '10: Execute action a_t and observe the reward list {r_t^1, ..., r_t^K} for each item in a_t'
			next_states = []
			for env, state, action, items in zip(envs, states, actions, item_idxes):
				sim_results, rewards, next_state = env.step (action, items)

				# '19: Store transition (s_t, a_t, r_t, s_t+1) in D'
				replay_memory.add (state.reshape (history_length * embeddings.size ()),
								   action.reshape (ra_length * embeddings.size ()),
								   [rewards],
								   next_state.reshape (history_length * embeddings.size ()))

				next_states.append (next_state)

				session_reward += rewards

			# '20: Set s_t = s_t+1' for every parallel environment (assigning to the loop
			# variable `state` would not update `states`)
			states = np.array (next_states)

			# '21: Stage 2: Parameter Updating Stage'
			if replay_memory.size () >= batch_size * nb_env:  # Experience replay
				replay = True
				replay_Q_value, critic_loss = experience_replay (replay_memory, batch_size,
																 actor, critic, embeddings, ra_length,
																 history_length * embeddings.size (),
																 ra_length * embeddings.size (), discount_factor)
				session_Q_value += replay_Q_value
				session_critic_loss += critic_loss

			summary_str = sess.run (summary_ops,
									feed_dict = {summary_vars[0]: session_reward,
												 summary_vars[1]: session_Q_value,
												 summary_vars[2]: session_critic_loss})

			writer.add_summary (summary_str, i_session)

			'''
			print(state_to_items(embeddings.embed(data['state'][0]), actor, ra_length, embeddings),
				  state_to_items(embeddings.embed(data['state'][0]), actor, ra_length, embeddings, True))
			'''

		str_loss = 'Loss=%0.4f' % session_critic_loss
		print (('Episode %d/%d Reward=%d Time=%ds ' + (str_loss if replay else 'No replay')) % (i_session + 1, nb_episodes, session_reward, time.time () - start_time))
		start_time = time.time ()

	writer.close ()
	tf.train.Saver ().save (sess, 'models.h5', write_meta_graph = False)
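
train() constructs an OrnsteinUhlenbeckNoise exploration helper whose definition is not included (its output is only referenced in the TODO next to get_recommendation_list). A minimal Ornstein-Uhlenbeck process sketch is given below; the mu, theta, sigma and dt defaults are illustrative, not the values used in the original experiments.

import numpy as np


class OrnsteinUhlenbeckNoise:
	# Temporally correlated noise: dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
	def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
		self.size = size
		self.mu = mu * np.ones(size)
		self.theta = theta
		self.sigma = sigma
		self.dt = dt
		self.reset()

	def reset(self):
		# Restart the process from its long-run mean.
		self.x = np.copy(self.mu)

	def get(self):
		# One Euler step of the OU process; returns the current noise vector.
		dx = self.theta * (self.mu - self.x) * self.dt + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size)
		self.x = self.x + dx
		return self.x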