def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes),
                                  methods="q-learning")
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in tqdm(range(num_episodes)):
        state = env.reset()
        action_prob = policy(state)
        action = np.random.choice(np.arange(env.action_space.n), p=action_prob)

        for t in itertools.count():
            next_state, reward, done, _ = env.step(action)
            next_action_prob = policy(next_state)
            next_action = np.random.choice(np.arange(env.action_space.n),
                                           p=next_action_prob)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            td_target = reward + discount_factor * np.max(Q[next_state])
            td_error = td_target - Q[state][action]
            Q[state][action] += alpha * td_error

            if done:
                break
            state = next_state
            action = next_action

    return Q, stats
def sarsa_control_epsilon_greedy(env, n_episodes, epsilon, discount_factor, alpha):
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    final_policy = defaultdict(lambda: np.zeros(env.action_space.n))
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(n_episodes),
                                  episode_rewards=np.zeros(n_episodes))

    for i in range(n_episodes):
        state = env.reset()
        done = False
        prob = epsilon_greedy_policy(state, Q, epsilon, env.action_space.n)
        action = np.random.choice(np.arange(len(prob)), p=prob)

        while not done:
            next_state, reward, done, _ = env.step(action)
            next_prob = epsilon_greedy_policy(next_state, Q, epsilon, env.action_space.n)
            next_action = np.random.choice(np.arange(len(next_prob)), p=next_prob)

            Q[state][action] += alpha * (reward + discount_factor * Q[next_state][next_action]
                                         - Q[state][action])
            stats.episode_rewards[i] += reward
            stats.episode_lengths[i] += 1

            state = next_state
            action = next_action

    # Extract the greedy policy (epsilon = 0) for every state seen during training.
    for _state in Q:
        final_policy[_state] = epsilon_greedy_policy(_state, Q, 0.0, env.action_space.n)

    return Q, final_policy, stats
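The tabular snippets here call make_epsilon_greedy_policy and epsilon_greedy_policy without defining them. A minimal sketch of both helpers, assuming Q maps a state to a NumPy array with one value per action and that the returned vector is the action-probability distribution consumed by np.random.choice:

import numpy as np

def make_epsilon_greedy_policy(Q, epsilon, nA):
    """Return a function state -> action-probability vector for an epsilon-greedy policy."""
    def policy_fn(state):
        probs = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[state])
        probs[best_action] += 1.0 - epsilon  # remaining probability mass on the greedy action
        return probs
    return policy_fn

def epsilon_greedy_policy(state, Q, epsilon, nA):
    """Same idea, computing the probabilities for a single state directly."""
    probs = np.ones(nA, dtype=float) * epsilon / nA
    probs[np.argmax(Q[state])] += 1.0 - epsilon
    return probs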
def __init__(self, sess, env, saver, q_estimator, target_estimator, state_processor, config):
    self.sess = sess
    self.env = env
    self.saver = saver
    self.q_estimator = q_estimator
    self.target_estimator = target_estimator
    self.params = config
    self.replay_memory = []
    self.state_processor = state_processor
    self.stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(config.num_episodes),
        episode_rewards=np.zeros(config.num_episodes))

    latest_checkpoint = tf.train.latest_checkpoint(config.checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        self.saver.restore(sess, latest_checkpoint)

    self.total_t = sess.run(tf.contrib.framework.get_global_step())
    self.epsilons = np.linspace(config.epsilon_start, config.epsilon_end,
                                config.epsilon_decay_steps)
    self.policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))
def gibbs_sampling(self, num_episodes):
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):
        self.temperature = 1
        self.reset()
        # self.learning_rate = self.learning_rate * 0.9
        self.temperature = self.temperature - 0.9 / 50

        for i in itertools.count():
            self._update_hiddens()
            self.update_action(update=True)
            free_energy_1 = self.get_free_energy()
            reward_ = self._update_states()
            # print('New State', self.old_state)

            stats.episode_rewards[i_episode] += reward_
            stats.episode_lengths[i_episode] = i

            if self.old_state == 15 or self.old_state == 0:
                print('Episode, episode length:', i_episode, i)
                break

            free_energy_2 = self.update_action(update=True)
            diff = reward_ + self.discount_factor * free_energy_2 - free_energy_1
            self._update_action_weights(diff)
            self._update_state_weights(diff)

    return stats
def run_sarsa(self, max_number_of_episodes=100, interactive=False, display_frequency=1): # repeat for each episode for episode_number in range(max_number_of_episodes): # initialize state state = self.env.reset() done = False # used to indicate terminal state R = 0 # used to display accumulated rewards for an episode t = 0 # used to display accumulated steps for an episode i.e episode length # choose action from state using policy derived from Q action = self.agent.act(state) # repeat for each step of episode, until state is terminal while not done: t += 1 # increase step counter - for display # take action, observe reward and next state next_state, reward, done, _ = self.env.step(action) # choose next action from next state using policy derived from Q next_action = self.agent.act(next_state) # agent learn (SARSA update) self.agent.learn(state, action, reward, next_state, next_action) # state <- next state, action <- next_action state = next_state action = next_action R += reward # accumulate reward - for display # if interactive display, show update for each step if interactive: self.update_display_step() self.episode_length = np.append( self.episode_length, t) # keep episode length - for display self.episode_reward = np.append( self.episode_reward, R) # keep episode reward - for display # if interactive display, show update for the episode if interactive: self.update_display_episode() # if not interactive display, show graph at the end if not interactive: self.fig.clf() stats = plotting.EpisodeStats( episode_lengths=self.episode_length, episode_rewards=self.episode_reward, episode_running_variance=np.zeros(max_number_of_episodes)) plotting.plot_episode_stats(stats, display_frequency)
def qLearning(env, num_episodes, discount_factor=1.0, alpha=0.6, epsilon=0.1): """ Q-Learning algorithm: Off-policy TD control. Finds the optimal greedy policy while improving following an epsilon-greedy policy""" # Action value function # A nested dictionary that maps # state -> (action -> action-value). Q = defaultdict(lambda: np.zeros(env.no_actions)) # Keeps track of useful statistics stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes)) # Create an epsilon greedy policy function # appropriately for environment action space policy = createEpsilonGreedyPolicy(Q, epsilon, env.no_actions) # For every episode for ith_episode in range(num_episodes): # Reset the environment and pick the first action state = env.reset() for i in range(env.number_of_vehicles): # get probabilities of all actions from current state action_probabilities = policy(state) # choose action according to # the probability distribution action = np.random.choice(np.arange(len(action_probabilities)), p=action_probabilities) # take action and get reward, transit to next state reward, next_state, done = env.step(action) # print(state) # print(action) # print(next_state) # print("___________________________________________") # Update statistics stats.episode_rewards[ith_episode] += reward stats.episode_lengths[ith_episode] = i # TD Update best_next_action = np.argmax(Q[str(next_state)]) td_target = reward + discount_factor * Q[str( next_state)][best_next_action] td_delta = td_target - Q[str(state)][action] Q[str(state)][action] += alpha * td_delta # done is True if episode terminated if done: break state = next_state return Q, stats
def g_learning(self, num_episodes, max_ep_steps=10000, discount=1.0, epsilon=0.1): """ The G-learning algorithm. Args: num_episodes: Number of episodes to run. max_ep_steps: Maximum time steps allocated to one episode. discount: Standard discount factor, usually denoted as \gamma. epsilon: Probability of taking random actions during exploration. Returns: A tuple (G, stats) of the G-values and statistics, which should be plotted and thoroughly analyzed. """ cum_t = 0 stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes), kl_divergence=np.zeros(num_episodes)) for i_episode in tqdm(range(num_episodes)): state = env.reset() # Run this episode until we finish as indicated by the environment. for t in range(1, max_ep_steps + 1): # Uses exploration policy to take a step. action = self.policy_exploration(state, epsilon) next_state, reward, done, _ = env.step(action) # cost = -reward # print(reward) # Collect statistics (cum_t currently not used). stats.episode_rewards[i_episode] += reward stats.episode_lengths[i_episode] = t stats.kl_divergence[i_episode] = self.kl_divergence() self.N[state, action] += 1 cum_t += 1 # Intermediate terms for the G-learning update. alpha = self.alpha_schedule(t, state, action) beta = self.beta_schedule(t) temp = np.sum(self.rho[next_state, :] * np.exp(-beta * self.G[next_state, :])) # Official G-learning update at last. Equation 18 in the paper. td_target = -reward - (discount / beta) * np.log(temp) td_delta = td_target - self.G[state, action] self.G[state, action] += (alpha * td_delta) if done: break state = next_state print return self.G, stats
def simulate(self, pi, n_trial, n_episode, return_stats=False): """TODO: Docstring for simulate Parameters ---------- pi : behavior policy Returns ------- D: a collection of transition samples """ stats = plotting.EpisodeStats(episode_lengths=np.zeros(n_episode), episode_rewards=np.zeros(n_episode)) D = [] env = self._env for trial_i in range(n_trial): #D_t = D[trial_i] for epi_i in range(n_episode): last_reward = stats.episode_rewards[epi_i - 1] sys.stdout.flush() #D_e = D_t[epi_i] traj = [] s = env.reset() for t in count(): a = pi.choose_action(s) s_next, r, done, _ = env.step(a) stats.episode_rewards[epi_i] += r stats.episode_lengths[epi_i] = t logging.debug("s {} a {} s_next {} r {} done {}".format( s, a, r, s_next, done)) transition = T(s=s, a=a, r=r, s_next=s_next, done=done) traj.append(transition) s = s_next if done: logging.debug("done after {} steps".format(t)) break print("\rStep {} @ Episode {}/{} ({})".format( t, epi_i + 1, n_episode, last_reward), end="") D.append(traj) if return_stats: return D, stats else: return D
def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.6, epsilon=0.1): """ Q-Learning algorithm: Off-policy TD control. Finds the optimal greedy policy while improving following an epsilon-greedy policy """ # Action value function # A nested dictionary that maps # state -> (action -> action-value). q = defaultdict(lambda: np.zeros(env.action_space.n)) # Keeps track of useful statistics stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes)) # Create an epsilon greedy policy function # appropriate for environment action space policy = create_epsilon_greedy_policy(q, epsilon, env.action_space.n) # For every episode for ith_episode in range(num_episodes): # Reset the environment and pick the first action state = env.reset() for t in itertools.count(): # get probabilities of all functions from current state actions_probabilities = policy(state) # choose action according to # the probability distribution action = np.random.choice(np.arange(len(actions_probabilities)), p=actions_probabilities) # take action and get reward, transit to next state next_state, reward, done, _ = env.step(action) # Update statistics stats.episode_rewards[ith_episode] += reward stats.episode_lengths[ith_episode] = t # TD Update best_next_action = np.argmax(q[next_state]) td_target = reward + discount_factor * q[next_state][ best_next_action] td_delta = td_target - q[state][action] q[state][action] += alpha * td_delta # done is TRUE if episode terminated if done: break state = next_state return q, stats
def Q_learning(env, num_episodes, discount_factor=0.4, alpha=0.9, epsilon=0.5): """ Q-Learning algorithm: Off-policy TD control. Finds the optimal greedy policy while improving following an epsilon-greedy policy""" # Action value function # A nested dictionary that maps # state -> (action -> action-value). Q = defaultdict(lambda: np.zeros(env.action_space.n)) # Keeps track of useful statistics stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes)) # Create an epsilon greedy policy function # appropriately for environment action space policy = createEpsilonGreedyPolicy(Q, epsilon, env.action_space.n) # For every episode for ith_episode in range(num_episodes): #reset the enviroment and pick the 1st action state = env.reset() for t in itertools.count(): # get probabilities of all actions from current state action_probabilities = policy(state) # choose action according to # the probability distribution action = np.random.choice(np.arange(len(action_probabilities)), p=action_probabilities) # take action and get reward, transit to next state next_state, reward, done, _ = env.step(action) # Update statistics stats.episode_rewards[ith_episode] += reward stats.episode_lengths[ith_episode] = t # TD Update best_next_action = np.argmax(Q[next_state]) TD = reward + discount_factor * Q[next_state][ best_next_action] - Q[state][action] Q[state][action] += alpha * TD # env.render() # done is True if episode terminated if done: # env.render() break state = next_state return Q, stats
def q_learning(self, num_episodes, max_ep_steps=10000, discount=1.0, epsilon=0.1): """ The Q-learning algorithm. Args: num_episodes: Number of episodes to run. max_ep_steps: Maximum time steps allocated to one episode. discount: Standard discount factor, usually denoted as \gamma. epsilon: Probability of taking random actions during exploration. Returns: A tuple (Q, stats) of the Q-values and statistics, which should be plotted and thoroughly analyzed. """ cum_t = 0 stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes), kl_divergence=np.zeros(num_episodes)) for i_episode in tqdm(range(num_episodes)): state = env.reset() # Run this episode until we finish as indicated by the environment. for t in range(1, max_ep_steps + 1): # Uses exploration policy to take a step. action = self.policy_exploration(state, epsilon) next_state, reward, done, _ = env.step(action) # Collect statistics (cum_t currently not used). stats.episode_rewards[i_episode] += reward stats.episode_lengths[i_episode] = t stats.kl_divergence[i_episode] = self.kl_divergence() self.N[state, action] += 1 cum_t += 1 # The official Q-learning update. alpha = self.alpha_schedule(t, state, action) best_next_action = np.argmax(self.Q[next_state, :]) td_target = reward + discount * self.Q[next_state, best_next_action] td_delta = td_target - self.Q[state, action] self.Q[state, action] += (alpha * td_delta) if done: break state = next_state print("") return self.Q, stats
def n_step_sarsa(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1, n=5): """ args: n: definite the n-step """ Q = defaultdict(lambda: np.zeros(env.action_space.n)) stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes), methods="n_step_sarsa") policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n) for i_episode in tqdm(range(num_episodes)): state = env.reset() action_prob = policy(state) action = np.random.choice(np.arange(env.action_space.n), p=action_prob) state_store = [] action_store = [] reward_store = [] T = 100000 for t in itertools.count(): if t < T: next_state, reward, done, _ = env.step(action) state_store.append(next_state) reward_store.append(reward) stats.episode_rewards[i_episode] += reward stats.episode_lengths[i_episode] = t if done: T = t + 1 action_store.append(action) else: next_action_prob = policy(next_state) next_action = np.random.choice(np.arange(env.action_space.n), p=next_action_prob) action_store.append(next_action) state = next_state action = next_action tao = t - n + 1 if tao >= 0: G = 1.0 for i in range(tao+1, min(tao+n, T)): G += (discount_factor ** (i-tao-1))*reward_store[i] if tao + n < T: G += (discount_factor ** n) * Q[state_store[tao+n-1]][action_store[tao+n-1]] Q[state_store[tao]][action_store[tao]] += alpha*(G - Q[state_store[tao]][action_store[tao]]) if tao == T - 1: break return Q, stats
def __init__(self, name, lock, batch, env, actor, critic, gamma, max_episodes, episodes_to_train):
    threading.Thread.__init__(self)
    self.name = name
    self.env = env
    self.batch = batch
    self.actor = actor
    self.critic = critic
    self.lock = lock
    self.gamma = gamma
    self.max_episodes = max_episodes
    self.episodes_to_train = episodes_to_train
    self.stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(10000),
        episode_rewards=np.zeros(10000))
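Every snippet in this collection builds a plotting.EpisodeStats record holding per-episode arrays. In the plotting module this pattern comes from it is typically a namedtuple; several snippets pass extra keyword fields (methods, kl_divergence, episode_transbag, episode_running_variance), which implies those projects define extended variants. A minimal sketch, assuming only the two core fields:

from collections import namedtuple
import numpy as np

# Two arrays indexed by episode number; add fields if your project tracks more.
EpisodeStats = namedtuple("EpisodeStats", ["episode_lengths", "episode_rewards"])

stats = EpisodeStats(episode_lengths=np.zeros(100), episode_rewards=np.zeros(100))
stats.episode_rewards[0] += 1.0  # accumulate reward for episode 0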
def sarsa(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1, exp=False):
    """
    Args:
        env: environment
        num_episodes: number of episodes to run
        discount_factor: gamma in the update equation
        alpha: learning rate
        epsilon: probability of exploration
        exp: whether to use Expected SARSA or not
    """
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    method = "Expected SARSA" if exp else "SARSA"
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes),
                                  methods=method)
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in tqdm(range(num_episodes)):
        state = env.reset()
        action_prob = policy(state)
        action = np.random.choice(np.arange(env.action_space.n), p=action_prob)

        for t in itertools.count():
            next_state, reward, done, _ = env.step(action)
            next_action_prob = policy(next_state)
            next_action = np.random.choice(np.arange(env.action_space.n),
                                           p=next_action_prob)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if exp:
                # Expected SARSA: bootstrap from the expectation of Q under the policy.
                td_target = reward + discount_factor * np.dot(Q[next_state], next_action_prob)
            else:
                # SARSA: bootstrap from the action actually selected next.
                td_target = reward + discount_factor * Q[next_state][next_action]
            td_error = td_target - Q[state][action]
            Q[state][action] += alpha * td_error

            if done:
                break
            action = next_action
            state = next_state

    return Q, stats
def q_learning_testing_rewards(env, estimator, reward_fn, num_episodes, discount_factor=1.0, epsilon=0.0, epsilon_decay=1.0, render=False, ep_details=False): ''' Given the reward function, The RL agent learns the best policy. (Used for generating final results to compare with learning with default handcrafted rewards.) ''' # Statistics during learning process stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes)) for i in tqdm(range(num_episodes)): state = env.reset() done = False d = 0 while not done and d <= 2000: prob = epsilon_greedy_policy(state, estimator, epsilon * epsilon_decay**i, env.action_space.n) action = np.random.choice(np.arange(len(prob)), p=prob) step = env.step(action) next_state = step[0] done = step[2] reward = reward_fn(state) if render: env.render() stats.episode_rewards[i] += reward stats.episode_lengths[i] += 1 q_values_next = estimator.predict(next_state) td_target = reward + discount_factor * np.max(q_values_next) estimator.update(state, action, td_target) state = next_state d += 1 if ep_details: print("Episode {} completed in {} timesteps".format(i, d)) return stats
def sarsa(env, estimator, num_episodes, discount_factor=1.0, epsilon=0.1, epsilon_decay=1.0): stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes)) for i_episode in range(num_episodes): policy = make_epsilon_greedy_policy(estimator, epsilon * epsilon_decay**i_episode, env.action_space.n) state = env.reset() action_probs = policy(state) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) for t in itertools.count(): next_state, reward, end, _ = env.step(action) next_action_probs = policy(next_state) next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs) stats.episode_rewards[i_episode] += reward stats.episode_lengths[i_episode] = t q_values_next = estimator.predict(next_state) td_target = reward + discount_factor * q_values_next[next_action] estimator.update(state, action, td_target) if i_episode % 10 == 0: print("\rStep {} @ Episode {}/{} ({})".format( t, i_episode + 1, num_episodes, reward)) if end: break state = next_state action = next_action return stats
def q_learning(env, estimator, num_episodes, discount_factor=1.0, epsilon=0.1, epsilon_decay=1.0):
    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):
        # The policy we're following
        policy = make_epsilon_greedy_policy(estimator,
                                            epsilon * epsilon_decay**i_episode,
                                            env.action_space.n)

        # Print out which episode we're on, useful for debugging.
        # Also print reward for last episode
        last_reward = stats.episode_rewards[i_episode - 1]
        print("\rEpisode {}/{} ({})".format(i_episode + 1, num_episodes, last_reward), end="")
        sys.stdout.flush()

        done = False
        state = env.reset()

        while not done:
            probs = policy(state)
            action = np.random.choice(np.arange(len(probs)), p=probs)
            nextstate, reward, done, _ = env.step(action)

            target = reward + discount_factor * np.max(estimator.predict(nextstate))
            estimator.update(state, action, target)

            stats.episode_lengths[i_episode] += 1
            stats.episode_rewards[i_episode] += reward
            state = nextstate

    return stats
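The function-approximation snippets (those taking an estimator) use a different make_epsilon_greedy_policy that queries the estimator rather than a Q table. A sketch under the assumption that estimator.predict(state) returns one Q-value per action:

import numpy as np

def make_epsilon_greedy_policy(estimator, epsilon, nA):
    """Return a function state -> action probabilities, greedy w.r.t. the estimator."""
    def policy_fn(state):
        probs = np.ones(nA, dtype=float) * epsilon / nA
        q_values = estimator.predict(state)  # assumed: one Q-value per action
        probs[np.argmax(q_values)] += 1.0 - epsilon
        return probs
    return policy_fn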
def initialize(self):
    self.step_idx = 0
    self.done_episodes = 0
    self.states, self.actions, self.adv, self.not_done_idx, self.last_states = [], [], [], [], []
    self.discounted_rewards = []
    self.epochs = 0
    self.done = False
    self.show = False
    self.video = 0
    self.solved = False
    self.episodes = 0
    self.video_index = 0
    self.last_rewards = collections.deque()
    self.state = self.env.reset()
    self.stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(10000),
        episode_rewards=np.zeros(10000))

    if self.save_rendering:
        self.rec = gym.wrappers.monitoring.video_recorder.VideoRecorder(
            self.env, path=self.videopath + "/video1.mp4")

    self.actor_model = self.actor.build_model()
    self.critic_model = self.critic.build_model()
def expected_sarsa(env, estimator, num_episodes, discount_factor=1.0, epsilon=0.015, epsilon_decay=1.0): stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes)) rlist = [] for i_episode in range(num_episodes): policy = make_epsilon_greedy_policy(estimator, epsilon * epsilon_decay**i_episode, env.action_space.n) #last_reward = stats.episode_rewards[i_episode - 1] state = env.reset() next_action = None for j in itertools.count(): cum_reward = 0 if next_action is None: action_probs = policy(state) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) else: action = next_action env.render() next_state, reward, done, _ = env.step(action) cum_reward += reward stats.episode_rewards[i_episode] += reward stats.episode_lengths[i_episode] = j Q_val = estimator.predict(next_state) action_probs = policy(next_state) td_target = reward + discount_factor * sum(action_probs * Q_val) estimator.update(state, action, td_target) if done: print('Episode no', i_episode) rlist.append(cum_reward) break state = next_state return stats
def sarsa(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # The policy we're following
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        if (i_episode + 1) % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes), end="")
            sys.stdout.flush()

        done = False
        t = 0
        state = env.reset()
        props = policy(state)
        action = np.random.choice(np.arange(len(props)), p=props)

        while not done:
            nextstate, reward, done, _ = env.step(action)
            props = policy(nextstate)
            nextaction = np.random.choice(np.arange(len(props)), p=props)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            Q[state][action] += alpha * (
                reward + discount_factor * Q[nextstate][nextaction] - Q[state][action])

            state = nextstate
            action = nextaction
            t += 1

    return Q, stats
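For reference, a usage sketch for the tabular routine above. It assumes the classic gym API (env.step returns a 4-tuple, as these snippets expect, i.e. gym versions before 0.26) and the built-in CliffWalking environment:

import gym

env = gym.make("CliffWalking-v0")
Q, stats = sarsa(env, num_episodes=500, discount_factor=1.0, alpha=0.5, epsilon=0.1)
print("mean reward over last 50 episodes:", stats.episode_rewards[-50:].mean())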
def q_learning_best_policy(env, estimator, num_episodes, discount_factor=1.0, epsilon=0.0, epsilon_decay=1.0, print_ep_lens=False): ''' ** RL Code for the learning part of the expert agent. This does not take the reward function. It uses the default environment reward function. ''' # Statistics during learning process stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes)) for i in tqdm(range(num_episodes)): state = env.reset() done = False d = 0 while not done: prob = epsilon_greedy_policy(state, estimator, epsilon * epsilon_decay**i, env.action_space.n) action = np.random.choice(np.arange(len(prob)), p=prob) next_state, reward, done, _ = env.step(action) stats.episode_rewards[i] += reward stats.episode_lengths[i] += 1 q_values_next = estimator.predict(next_state) td_target = reward + discount_factor * np.max(q_values_next) estimator.update(state, action, td_target) state = next_state d += 1 if print_ep_lens: print("Episode {} completed in {} timesteps".format(i, d)) return stats
def fa_test(env, estimator, num_episodes): stats_test = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes), episode_transbag=np.zeros(num_episodes)) for i_episode in range(num_episodes): # The policy we're following # Print out which episode we're on, useful for debugging. # Also print reward for last episode last_reward = stats_test.episode_transbag[i_episode - 1] sys.stdout.flush() # Reset the environment and pick the first action state = env.reset_test() # One step in the environment for t in itertools.count(): q_values = estimator.predict(state) best_action = np.argmax(q_values) next_state, reward, done, data_overflow = env.step(best_action) # Update statistics stats_test.episode_rewards[i_episode] += reward stats_test.episode_lengths[i_episode] = t stats_test.episode_transbag[i_episode] += data_overflow if done: break if t >= 1000: break state = next_state print("\r@ Episode {}/{} ({})".format(i_episode + 1, num_episodes, last_reward), end="") return stats_test
inputs = Input(shape=(env.observation_space.shape[0], ))
x = Dense(128, activation='relu')(inputs)
predictions = Dense(env.action_space.n, activation='softmax')(x)
model = Model(inputs, predictions)

total_rewards = []
step_idx = 0
done_episodes = 0
batch_episodes = 0
batch_states, batch_actions, batch_scales = [], [], []
cur_rewards = []
gamma = 0.99

stats = plotting.EpisodeStats(episode_lengths=np.zeros(TOTAL_EPISODES),
                              episode_rewards=np.zeros(TOTAL_EPISODES))

model.reset_states()
discounted_reward = 0
step_idx = 0

for i in range(1, TOTAL_EPISODES):
    state = env.reset()
    epochs, penalties, reward = 0, 0, 0
    done = False
    while not done:
        step_idx += 1
        action = np.random.choice(
            [a for a in range(env.action_space.n)],
RANDOM_incentives = []
POPULAR_rewards = []
POPULAR_serveratio = []
POPULAR_incentives = []
MIN_rewards = []
MIN_serveratio = []
MIN_incentives = []
rewards = []

# Number of trials (episodes)
no_episodes = 50
stats = plotting.EpisodeStats(episode_lengths=np.zeros(no_episodes),
                              episode_rewards=np.zeros(no_episodes))

T = 2000
number_of_contents = 10
myenv = MyEnv(density=density, T=T, number_of_contents=number_of_contents)

if (RL == False):
    RL = DeepQNetwork(myenv.no_actions,
                      myenv.observation_length,
                      learning_rate=0.001,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=5000,
                      memory_size=2000,
def deep_q_learning(sess, env, q_estimator, target_estimator, state_processor, num_episodes, experiment_dir, replay_memory_size=500000, replay_memory_init_size=50000, update_target_estimator_every=10000, discount_factor=0.99, epsilon_start=1.0, epsilon_end=0.1, epsilon_decay_steps=500000, batch_size=32, record_video_every=50): """ Q-Learning algorithm for fff-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: sess: Tensorflow Session object env: OpenAI environment q_estimator: Estimator object used for the q values target_estimator: Estimator object used for the targets state_processor: A StateProcessor object num_episodes: Number of episodes to run for experiment_dir: Directory to save Tensorflow summaries in replay_memory_size: Size of the replay memory replay_memory_init_size: Number of random experiences to sampel when initializing the reply memory. update_target_estimator_every: Copy parameters from the Q estimator to the target estimator every N steps discount_factor: Lambda time discount factor epsilon_start: Chance to sample a random action when taking an action. Epsilon is decayed over time and this is the start value epsilon_end: The final minimum value of epsilon after decaying is done epsilon_decay_steps: Number of steps to decay epsilon over batch_size: Size of batches to sample from the replay memory record_video_every: Record a video every N episodes Returns: An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards. """ print('replay_memory_size = ' + str(replay_memory_size)) print('replay_memory_init_size = ' + str(replay_memory_init_size)) print('update_target_estimator_every = ' + str(update_target_estimator_every)) print('epsilon_decay_steps = ' + str(epsilon_decay_steps)) print('batch_size = ' + str(batch_size)) print('numBlocks = ' + str(numBlocks)) print('n_steps = ' + str(n_steps)) print('n_hidden = ' + str(n_hidden)) # dict = {'replay_memory_size': str(replay_memory_size)} dict = { 'replay_memory_size': replay_memory_size, 'replay_memory_init_size': replay_memory_init_size, 'update_target_estimator_every': update_target_estimator_every, 'epsilon_decay_steps': epsilon_decay_steps, 'batch_size': batch_size, 'numBlocks': numBlocks, n_steps: 'n_steps', 'n_hidden': n_hidden } file = open(experiment_dir + '/dump.txt', 'wb') pickle.dump(str.encode(str(dict)), file) file.close() #Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"]) Transition = namedtuple( "Transition", ["state", "action", "reward", "next_state", "done", "prev_states"]) # The replay memory replay_memory = [] # Make model copier object estimator_copy = ModelParametersCopier(q_estimator, target_estimator) # Keeps track of useful statistics stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes)) # For 'system/' summaries, usefull to check if currrent process looks healthy current_process = psutil.Process() # Create directories for checkpoints and summaries checkpoint_dir = os.path.join(experiment_dir, "checkpoints") checkpoint_path = os.path.join(checkpoint_dir, "model") monitor_path = os.path.join(experiment_dir, "monitor") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) if not os.path.exists(monitor_path): os.makedirs(monitor_path) saver = tf.train.Saver() # Load a previous checkpoint if we find one latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir) if latest_checkpoint: 
print("Loading model checkpoint {}...\n".format(latest_checkpoint)) saver.restore(sess, latest_checkpoint) # Get the current time step total_t = sess.run(tf.contrib.framework.get_global_step()) # The epsilon decay schedule epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps) # The policy we're following policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS)) # The following code will show all the trainable variables #variables_names = [v.name for v in tf.trainable_variables()] #values = sess.run(variables_names) #for k, v in zip(variables_names, values): # print ("Variable: ", k) # print ("Shape: ", v.shape) # print (v) #input ('Press any key') # Populate the replay memory with initial experience print("Populating replay memory...") state = env.reset() state = state_processor.process(sess, state) # print (state.shape) # print (state) # state = np.stack([state] * 4, axis=2) t = 0 for i in range(replay_memory_init_size): #New version follows the existing policy, only useful if the estimator has been pre-loaded prev_states_current, prev_states_next = computePreviousStates( replay_memory, n_steps) action_probs = policy(sess, prev_states_current, epsilons[min(total_t, epsilon_decay_steps - 1)], True) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) #print ("\nTaking action " + str(VALID_ACTIONS[action])) #Old version without following the policy #action = np.random.choice(VALID_ACTIONS) next_state, reward, done, _ = env.step(VALID_ACTIONS[action]) next_state = state_processor.process(sess, next_state) #next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2) prev_states_current, prev_states_next = computePreviousStates( replay_memory, n_steps) # Incorrect code, delete #if (t!=0): # print ('Inside loop t not zero') # print (state) # state[numBlocks:]=numBlocks+1 # print (state) # Move the following line inside the if #replay_memory.append(Transition(state, action, reward, next_state, done,prev_states_next)) # Trick to consider the case where we end the episode, the next state will change # because we reset the environment orig_state = state if done: t = 0 state = env.reset() state = state_processor.process(sess, state) # Note that we use state as the next_state because we have reset the environment replay_memory.append( Transition(orig_state, action, reward, state, done, prev_states_next)) #state = np.stack([state] * 4, axis=2) else: replay_memory.append( Transition(state, action, reward, next_state, done, prev_states_next)) state = next_state t = t + 1 # Record videos # Add env Monitor wrapper #env = Monitor(env, directory=monitor_path, video_callable=lambda count: count % record_video_every == 0, resume=True) for i_episode in range(num_episodes): # Save the current checkpoint saver.save(tf.get_default_session(), checkpoint_path) # Reset the environment state = env.reset() env.render() state = state_processor.process(sess, state) #state = np.stack([state] * 4, axis=2) loss = None print('\n******** NEW EPISODE ***********************************\n') # One step in the environment for t in itertools.count(): # Epsilon for this time step epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)] # Maybe update the target estimator if total_t % update_target_estimator_every == 0: estimator_copy.make(sess) print("\nCopied model parameters to target network.") # Print out which step we're on, useful for debugging. 
print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format( t, total_t, i_episode + 1, num_episodes, loss), end="") sys.stdout.flush() # Take a step prev_states_current, prev_states_next = computePreviousStates( replay_memory, n_steps) #action_probs = policy(sess, state, epsilon) action_probs = policy(sess, prev_states_current, epsilon, True) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) print("\nTaking action " + str(VALID_ACTIONS[action])) next_state, reward, done, _ = env.step(VALID_ACTIONS[action]) #env.render() next_state = state_processor.process(sess, next_state) # next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2) # If our replay memory is full, pop the first element if len(replay_memory) == replay_memory_size: replay_memory.pop(0) # Save transition to replay memory # Here we should consider that when the episode is done next_state should be replaced # by the first state in the next episode? replay_memory.append( Transition(state, action, reward, next_state, done, prev_states_next)) #replay_memory.append(Transition(state, action, reward, next_state, done)) # Update statistics stats.episode_rewards[i_episode] += reward stats.episode_lengths[i_episode] = t # Sample a minibatch from the replay memory samples = random.sample(replay_memory, batch_size) states_batch, action_batch, reward_batch, next_states_batch, done_batch, prev_next_states_batch = map( np.array, zip(*samples)) # Shape of next_states_batch is batch_size*n_input #print (next_states_batch.shape) #print (prev_next_states_batch.shape) # Calculate q values and targets # q_values_next = target_estimator.predict(sess, next_states_batch) #print (prev_next_states_batch[0]) # Shape of prev_next_states_batch is batch_size*n_steps*n_output prev_next_states_batch = np.reshape(prev_next_states_batch, (-1, n_steps, numBlocks * 2)) #print (prev_next_states_batch[0]) #print (prev_next_states_batch.shape) q_values_next = target_estimator.predict(sess, prev_next_states_batch, False) #print ('Q values next') #print (q_values_next.shape) #print (q_values_next) targets_batch = reward_batch + np.invert(done_batch).astype( np.float32) * discount_factor * np.amax(q_values_next, axis=1) #print ('Targets') #print (targets_batch.shape) #print (targets_batch) # Perform gradient descent update #states_batch = np.array(states_batch) states_batch = np.array(prev_next_states_batch) loss = q_estimator.update(sess, states_batch, action_batch, targets_batch) if done: print('PROBLEM SOLVED') print('\n*******************************************\n') #We just make this call because we want to store the last state of the episode prev_states_current, prev_states_next = computePreviousStates( replay_memory, n_steps) action_probs = policy(sess, prev_states_current, epsilon, True) break #if (loss>500): # break state = next_state total_t += 1 # Add summaries to tensorboard episode_summary = tf.Summary() episode_summary.value.add(simple_value=epsilon, tag="episode/epsilon") episode_summary.value.add( simple_value=stats.episode_rewards[i_episode], tag="episode/reward") episode_summary.value.add( simple_value=stats.episode_lengths[i_episode], tag="episode/length") episode_summary.value.add(simple_value=current_process.cpu_percent(), tag="system/cpu_usage_percent") episode_summary.value.add( simple_value=current_process.memory_percent(memtype="vms"), tag="system/v_memeory_usage_percent") q_estimator.summary_writer.add_summary(episode_summary, i_episode) q_estimator.summary_writer.flush() yield total_t, 
    plotting.EpisodeStats(
        episode_lengths=stats.episode_lengths[:i_episode + 1],
        episode_rewards=stats.episode_rewards[:i_episode + 1])

return stats
def actor_critic(env, actor, critic, num_episodes, num_timesteps=5000, discount_factor=1.0): """ Actor Critic Algorithm. Optimizes the policy function approximator using policy gradient. Args: env: My self created environment, specified above. actor: Policy Function to be optimized critic: Value function approximator num_episodes: Number of episodes to run for discount_factor: Time-discount factor Returns: An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards. """ # Keeps track of useful statistics stats = plotting.EpisodeStats( episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes)) Transition = collections.namedtuple("Transition", ["state", "action", "reward", "next_state", "done"]) funds_wealth_all_episodes = [] funds_return_all_ep = [] learning_fund_stats = np.zeros((num_episodes, num_timesteps, 6)) for i_episode in range(num_episodes): # Reset everything prices = [] funds_wealth = [] funds_returns = [] # Create our learning_fund learning_fund = LearningFund() # Create the funds (DynamicFund class is described in thurner_model.py) number_of_funds = 10 funds = [DynamicFund((i+1)*5) for i in range(number_of_funds)] # Add our learning fund funds.append(learning_fund) # Reset the environment env.reset() episode = [] # One step in the environment for t in range(num_timesteps): # get the demand of the learning fund # (via getting demand from actor) demand = learning_fund.get_demand(env.p_t) state = learning_fund.get_state(env.p_t) # Simulate a step in the environment, # record the wealth of all funds in current_wealth current_wealth, current_returns = env.step(funds) # record the wealth of all funds and the current price funds_wealth.append(current_wealth) funds_returns.append(current_returns) prices.append(env.p_t) # only update learning if learning fund is not bankrupt if learning_fund.is_active(): # we assume one learning fund for the moment next_state = learning_fund.get_state(env.p_t) reward = learning_fund.ret # experiment: high negative reward if learning_fund goes bankrupt #if learning_fund.activation_delay == 100: # reward = -100 # Keep track of the transition episode.append(Transition(state=state, action=demand, reward=reward, next_state=next_state, done=env.done)) # Update statistics stats.episode_rewards[i_episode] += reward stats.episode_lengths[i_episode] = t # Calculate TD Target value_next = critic.predict(next_state) td_target = reward + discount_factor * value_next td_error = td_target - critic.predict(state) # Update the value estimator critic.update(state, td_target) # Update the policy estimator # using the td error as our advantage estimate actor.update(state, td_error, demand) learning_fund_stats[i_episode][t] = np.array([env.p_t, demand, learning_fund.get_wealth(env.p_t), learning_fund.cash, learning_fund.shares, learning_fund.ret]) # Print out which step we're on, useful for debugging. print("\rt: {} @ Episode {}/{} ({})".format( t, i_episode + 1, num_episodes, stats.episode_rewards[i_episode - 1]), end="") # env.done is True if one fund increases its wealth 50-fold #if env.done: # break state = next_state # After each episode, record the wealth of all funds funds_wealth_all_episodes.append(funds_wealth) funds_return_all_ep.append(funds_returns) # Save the variables to disk. 
checkpoint = "./checkpoints/{}-ep{}".format(experiment_name,i_episode) save_path = saver.save(sess,checkpoint) print("\nModel saved in path: {}\n".format(save_path)) return stats, funds_wealth_all_episodes, funds_return_all_ep, learning_fund_stats
def deep_q_learning(sess, env, q_estimator, target_estimator, state_processor, num_episodes, experiment_dir, replay_memory_size=500000, replay_memory_init_size=50000, update_target_estimator_every=10000, discount_factor=0.99, epsilon_start=1.0, epsilon_end=0.0, epsilon_decay_steps=500000, batch_size=32, record_video_every=50): ''' The function that will train our network to play Breakout Args: sess: tensorflow session env: game environment q_estimator: q-value network target_estimator: target network state_processor: ProcessState class which will process the states num_episodes: number of episodes to run for experiment_dir: Directory to save tensorflow summaries in replay_memory_size: size of the replay memory replay_memory_init_size: number of random experiences to sample when initializing the reply memory. update_target_estimator_every_: after these iterations upadate the network discount_factor: discount factor epsilon_start: starting value of the epsilon epsilon_end: ending value of epsilon epsilon_decay_steps: number of steps over which to decay the epsilon value batch_size: size of batch during training record_video_every: record video after these episodes Returns: a tuple with two numpy arrays one for episode_lengths and other for episode_rewards ''' Transition = namedtuple( 'Transition', ['state', 'action', 'reward', 'next_state', 'done']) # replay memory replay_memory = [] # make model copier estimator_copy = ModelParametersCopy(q_estimator, target_estimator) # keep statistics of the important stuff stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes)) # for 'system/' summaries, usefull to chec if current process looks healthy current_process = psutil.Process() # Create directories for checkpoints and summaries checkpoint_dir = os.path.join(experiment_dir, "checkpoints") checkpoint_path = os.path.join(checkpoint_dir, "model") monitor_path = os.path.join(experiment_dir, "monitor") # create directories if not exist if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) if not os.path.exists(monitor_path): os.makedirs(monitor_path) # saver saver = tf.train.Saver() # get to the current time step total_t = sess.run(tf.contrib.framework.get_global_step()) # epsilon decay schedule epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps) # policy policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS)) # populate the replay memory with initial experience print("Populating replay memory...") # start new game state = env.reset() state = state_processor.process(sess, state) state = np.stack([state] * 4, axis=2) # run the loop to gte values for i in range(replay_memory_init_size): # get action probabilities action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps - 1)]) # select action randomly action = np.random.choice(np.arange(len(action_probs)), p=action_probs) # perform one step next_state, reward, done, _ = env.step(VALID_ACTIONS[action]) # process the state next_state = state_processor.process(sess, next_state) # ?? 
next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2) # add the state to replay memory replay_memory.append( Transition(state, action, reward, next_state, done)) # if done reset if done: state = env.reset() state = state_processor.process(sess, state) state = np.stack([state] * 4, axis=2) else: state = next_state # record videos env = Monitor(env, directory=monitor_path, video_callable=lambda count: count % record_video_every == 0, resume=True) # run the model for num_episodes for i_episode in range(num_episodes): # save the current checkpoint saver.save(tf.get_default_session(), checkpoint_path) # reset the environment state = env.reset() state = state_processor.process(sess, state) state = np.stack([state] * 4, axis=2) loss = None # one step in the environment for t in itertools.count(): # epsilon at the current time step epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)] # update the target network if required if total_t % update_target_estimator_every == 0: estimator_copy.make(sess) print("[!]Copying model parameters...\n") # print out which step we're on, helps in debugging print('[!]Step {} ({}) @ Episode {}/{}, loss:'.format( t, total_t, i_episode + 1, num_episodes, loss), end=" ") sys.stdout.flush() # take a step action_probs = policy(sess, state, epsilon) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) next_state, reward, done, _ = env.step(VALID_ACTIONS[action]) # process the next state next_state = state_processor.process(sess, next_state) next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2) # if replay memory is full delete the first element if len(replay_memory) == replay_memory_size: del replay_memory[0] # Save the transition to replay memory replay_memory.append( Transition(state, actio, reward, next_state, done)) # update the statistics stats.episode_rewards[i_episode] += reward stats.episode_lengths[i_episode] = t # sample a minibatch from replay memory samples = random.sample(replay_memory, batch_size) states_batch, action_batch, reward_batch, next_states_batch, done_batch = map( np.array, zip(*samples)) # calculate q_values and targets q_values_next = target_estimator.predict(sess, next_states_batch) targets_batch = reward_batch + np.invert(done_batch).astype( np.float32) * discount_factor * np.argmax(q_values_next, axis=1) # perform gradient descent update states_batch = np.array(states_batch) loss = q_estimator.update(sess, states_batch, action_batch, targets_batch) # if done if done: break # else update the state and go on state = next_state total_t += 1 # add summaries to the tensorboard episode_summary = tf.Summary() # add values episode_summary.value.add(simple_value=epsilon, tag='episode/epsilon') episode_summary.value.add( simple_value=stats.episode_rewards[i_episode], tag='episode/reward') episode_summary.value.add( simple_value=state.episode_lengths[i_episode], tag='episode/length') episode_summary.value.add(simple_value=current_process.cpu_percent(), tag='system/cpu_usage_percent') episode_summary.value.add( simple_value=current_process.memory_percent(memtype='vms'), tag='system/v_memory_usage_percent') q_estimator.summary_writer.add_summary(episode_summary, i_episode) q_estimator.summary_write.flush() # yield yield total_t, plotting.EpisodeStats( episode_lengths=stats.episode_lengths[:i_episode + 1], episode_rewards=stats.episode_rewards[:i_episode + 1]) return stats
def deep_q_learning(sess, env, q_estimator, target_estimator, num_episodes, experiment_dir, replay_memory_size=500000, replay_memory_init_size=50000, update_target_estimator_every=10000, discount_factor=0.99, epsilon_start=1.0, epsilon_end=0.1, epsilon_decay_steps=500000, batch_size=32, record_video_every=50): """ Q-Learning algorithm for off-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: sess: Tensorflow Session object env: OpenAI environment q_estimator: Estimator object used for the q values target_estimator: Estimator object used for the targets num_episodes: Number of episodes to run for experiment_dir: Directory to save Tensorflow summaries in replay_memory_size: Size of the replay memory replay_memory_init_size: Number of random experiences to sampel when initializing the reply memory. update_target_estimator_every: Copy parameters from the Q estimator to the target estimator every N steps discount_factor: Gamma discount factor epsilon_start: Chance to sample a random action when taking an action. Epsilon is decayed over time and this is the start value epsilon_end: The final minimum value of epsilon after decaying is done epsilon_decay_steps: Number of steps to decay epsilon over batch_size: Size of batches to sample from the replay memory record_video_every: Record a video every N episodes Returns: An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards. """ Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"]) # The replay memory replay_memory = [] # Make model copier object estimator_copy = ModelParametersCopier(q_estimator, target_estimator) # Keeps track of useful statistics stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes)) # For 'system/' summaries, usefull to check if currrent process looks healthy current_process = psutil.Process() # Create directories for checkpoints and summaries checkpoint_dir = os.path.join(experiment_dir, "checkpoints") checkpoint_path = os.path.join(checkpoint_dir, "model") monitor_path = os.path.join(experiment_dir, "monitor") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) if not os.path.exists(monitor_path): os.makedirs(monitor_path) saver = tf.train.Saver() # Load a previous checkpoint if we find one latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir) if latest_checkpoint: print("Loading model checkpoint {}...\n".format(latest_checkpoint)) saver.restore(sess, latest_checkpoint) # Get the current time step total_t = sess.run(tf.contrib.framework.get_global_step()) # The epsilon decay schedule epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps) # The policy we're following policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS)) # Populate the replay memory with initial experience print("Populating replay memory...") state = env.reset() state = one_hot_encode_environment_state(state) # state_processor.process(sess, state) state = np.stack([state] * 4, axis=2) for i in range(replay_memory_init_size): action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps-1)]) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) next_state, reward, done = env.step(VALID_ACTIONS[action]) next_state = one_hot_encode_environment_state(next_state) # state_processor.process(sess, next_state) next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2) 
replay_memory.append(Transition(state, action, reward, next_state, done)) if done: state = env.reset() state = one_hot_encode_environment_state(state) # state_processor.process(sess, state) state = np.stack([state] * 4, axis=2) else: state = next_state # Record videos # Add env Monitor wrapper # env = Monitor(env, directory=monitor_path, video_callable=lambda count: count % record_video_every == 0, resume=True) for i_episode in range(num_episodes): # Save the current checkpoint saver.save(tf.get_default_session(), checkpoint_path) # Reset the environment state = env.reset() state = one_hot_encode_environment_state(state) # state_processor.process(sess, state) state = np.stack([state] * 4, axis=2) loss = None # One step in the environment for t in itertools.count(): # Epsilon for this time step epsilon = epsilons[min(total_t, epsilon_decay_steps-1)] # Maybe update the target estimator if total_t % update_target_estimator_every == 0: estimator_copy.make(sess) print("\nCopied model parameters to target network.") # Print out which step we're on, useful for debugging. print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(t, total_t, i_episode + 1, num_episodes, loss), end="") sys.stdout.flush() # Take a step action_probs = policy(sess, state, epsilon) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) next_state, reward, done = env.step(VALID_ACTIONS[action]) next_state = one_hot_encode_environment_state(next_state) # state_processor.process(sess, next_state) next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2) # If our replay memory is full, pop the first element if len(replay_memory) == replay_memory_size: replay_memory.pop(0) # Save transition to replay memory replay_memory.append(Transition(state, action, reward, next_state, done)) # Update statistics stats.episode_rewards[i_episode] += reward stats.episode_lengths[i_episode] = t # Sample a minibatch from the replay memory samples = random.sample(replay_memory, batch_size) states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples)) # Calculate q values and targets q_values_next = target_estimator.predict(sess, next_states_batch) targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * discount_factor * np.amax(q_values_next, axis=1) # Perform gradient descent update states_batch = np.array(states_batch) loss = q_estimator.update(sess, states_batch, action_batch, targets_batch) if done: break state = next_state total_t += 1 # Add summaries to tensorboard episode_summary = tf.Summary() episode_summary.value.add(simple_value=epsilon, tag="episode/epsilon") episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], tag="episode/reward") episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], tag="episode/length") episode_summary.value.add(simple_value=current_process.cpu_percent(), tag="system/cpu_usage_percent") episode_summary.value.add(simple_value=current_process.memory_percent(memtype="vms"), tag="system/v_memeory_usage_percent") q_estimator.summary_writer.add_summary(episode_summary, i_episode) q_estimator.summary_writer.flush() yield total_t, plotting.EpisodeStats(episode_lengths=stats.episode_lengths[:i_episode+1], episode_rewards=stats.episode_rewards[:i_episode+1]) return stats
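The DQN snippets rely on a ModelParametersCopier (also spelled ModelParametersCopy above) helper that copies weights from the online Q-network into the target network via estimator_copy.make(sess). A sketch for TF1-style graphs, assuming each estimator exposes a scope attribute naming its variable scope:

import tensorflow as tf

class ModelParametersCopier:
    """Build assign ops once, then run them to copy estimator1's weights into estimator2."""
    def __init__(self, estimator1, estimator2):
        e1_params = sorted(
            [v for v in tf.trainable_variables() if v.name.startswith(estimator1.scope)],
            key=lambda v: v.name)
        e2_params = sorted(
            [v for v in tf.trainable_variables() if v.name.startswith(estimator2.scope)],
            key=lambda v: v.name)
        self.update_ops = [e2_v.assign(e1_v) for e1_v, e2_v in zip(e1_params, e2_params)]

    def make(self, sess):
        sess.run(self.update_ops)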
def deep_q_learning(sess, env, q_estimator, target_estimator, num_episodes, experiment_dir, replay_memory_size=500000, replay_memory_init_size=50000, update_target_estimator_every=10000, discount_factor=0.99, epsilon_start=1.0, epsilon_end=0.1, epsilon_decay_steps=500000, batch_size=32, record_video_every=50): """ DQN algorithm with fff-policy Temporal Differnce control returns EpisodeStats object with 2 numpy arrays for episode_lengths and episode_rewards """ Transition = namedtuple( "Transition", ["state", "action", "reward", "next_state", "done"]) replay_memory = [] # useful statistics stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes)) # directories for checkpoints and summaries checkpoint_dir = os.path.join(experiment_dir, "checkpoints") checkpoint_path = os.path.join(checkpoint_dir, "model") monitor_path = os.path.join(experiment_dir, "monitor") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) if not os.path.exists(monitor_path): os.makedirs(monitor_path) saver = tf.train.Saver() # Load a previous checkpoint if we find one latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir) if latest_checkpoint: print("Loading model checkpoint {}...\n".format(latest_checkpoint)) saver.restore(sess, latest_checkpoint) # get current time step total_t = sess.run(tf.train.get_global_step()) # epsilon decay schedule epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps) # q policy we are following policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS)) action_mapping = compose_action_from_id() # load initial experience into replay memory print("Populating replay memory...") state = env.reset() state, _, _ = env.step(actions.FunctionCall(_SELECT_ARMY, [_SELECT_ADD])) # make the minimap data the state. #state = state[0].observation["rgb_minimap"].astype(np.uint8) #state = state_processor.process(sess, state) #state = np.stack([state] * 4, axis=2) for i in range(replay_memory_init_size): if i % 1000 == 0: print("iteration " + str(i)) # according to policy, create a action probability array action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps - 1)]) # randomly select an action according to action probs from policy action = np.random.choice(np.arange(len(VALID_ACTIONS)), p=action_probs) # action = action_mapping[action] action = action_mapping[0] # openAI gym take a step in action space next_state, reward, done = env.step(action) # process image data #next_state = state_processor.process(sess, next_state) #next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2) # add action to replay memory replay_memory.append( Transition(state, action, reward, next_state, done)) if done: # if found goal, start over state = env.reset() state, _, _ = env.step( actions.FunctionCall(_SELECT_ARMY, [_SELECT_ADD])) # make the minimap data the state. #state = state[0].observation["rgb_minimap"].astype(np.uint8) #state = state_processor.process(sess, state) #state = np.stack([state] * 4, axis=2) else: # if not found goal, update state to next state state = next_state # record videos # ad env monitor wrapper # Does this work for PySC2? 
    # env = Monitor(env, directory=monitor_path, video_callable=lambda count: count % record_video_every == 0,
    #               resume=True)

    for i_episode in range(num_episodes):

        # Save the current checkpoint
        if i_episode % 100 == 0:
            print("episode: " + str(i_episode))
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset()
        state, _, _ = env.step(actions.FunctionCall(_SELECT_ARMY, [_SELECT_ADD]))
        #state = state_processor.process(sess, state)
        #state = np.stack([state] * 4, axis=2)
        loss = None

        # Main loop after loading the initial state
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            # Add epsilon to Tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)

            # Maybe update the target estimator:
            # copy parameters from the q estimator to the target estimator
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)
                print("\nCopied model parameters to target network.")

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                t, total_t, i_episode + 1, num_episodes, loss), end="")
            sys.stdout.flush()

            # Take the next step in the environment,
            # as in the replay-memory population loop above
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(VALID_ACTIONS)), p=action_probs)
            action = action_mapping[action]
            # NOTE: the fixed mapping below overrides the sampled action,
            # matching the population loop above
            action = action_mapping[0]
            next_state, reward, done = env.step(action)
            #next_state = state_processor.process(sess, next_state)
            #next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2)

            # If replay memory is full, pop the oldest transition
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            # Save transition to replay memory
            replay_memory.append(Transition(state, action, reward, next_state, done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(
                np.array, zip(*samples))

            # Q-learning update: the TD target bootstraps from the
            # target network's maximum Q value for the next state
            q_values_next = target_estimator.predict(sess, next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(
                np.float32) * discount_factor * np.max(q_values_next, axis=1)

            # Gradient descent update
            states_batch = np.array(states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch, targets_batch)

            if done:
                break

            state = next_state
            total_t += 1

        # Add summaries to Tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=stats.episode_rewards[i_episode],
                                  node_name="episode_reward", tag="episode_reward")
        episode_summary.value.add(simple_value=stats.episode_lengths[i_episode],
                                  node_name="episode_length", tag="episode_length")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode + 1],
            episode_rewards=stats.episode_rewards[:i_episode + 1])

    return stats
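The PySC2 variant refreshes its target network through copy_model_parameters(sess, q_estimator, target_estimator), which is not shown here. A minimal sketch of what such a helper could look like, under the assumption that each estimator builds its variables inside a variable scope exposed as estimator.scope, is:

# Hedged sketch (assumption): copy trainable variables from the q-network's scope
# into the target network's scope. Assumes `estimator.scope` names that scope.
import tensorflow as tf

def copy_model_parameters(sess, estimator1, estimator2):
    # Collect and align variables of both estimators by name
    e1_params = sorted([v for v in tf.trainable_variables()
                        if v.name.startswith(estimator1.scope)], key=lambda v: v.name)
    e2_params = sorted([v for v in tf.trainable_variables()
                        if v.name.startswith(estimator2.scope)], key=lambda v: v.name)

    # Build one assign op per variable pair and run them in a single session call
    update_ops = [e2_v.assign(e1_v) for e1_v, e2_v in zip(e1_params, e2_params)]
    sess.run(update_ops)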
def reinforce(env, estimator_policy, estimator_value, num_episodes, discount_factor=1.0):
    """
    REINFORCE (Monte Carlo Policy Gradient) algorithm.
    Optimizes the policy function approximator using the policy gradient.

    Args:
        env: OpenAI environment.
        estimator_policy: Policy function to be optimized
        estimator_value: Value function approximator, intended as a baseline
            (not used here; the mean episode reward serves as the baseline instead)
        num_episodes: Number of episodes to run for
        discount_factor: Time-discount factor

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    Transition = collections.namedtuple(
        "Transition", ["state", "action", "reward", "next_state", "done"])

    for i_episode in range(num_episodes):
        # Reset the environment and pick the first action
        state = env.reset()

        episode = []

        # One step in the environment
        for t in itertools.count():

            # Take a step
            #action_means = np.ndarray.flatten(estimator_policy.predict(state))
            #action = np.random.multivariate_normal(mean=action_means, cov=full_var)
            action = estimator_policy.predict(state)
            '''
            max_idx = np.argmax(np.abs(action))
            a_max = action[max_idx]
            if a_max > high_threshold or a_max < low_threshold:
                action_clipped = action / (10 * np.abs(a_max))
            '''
            '''
            action_clipped = [np.max([np.min([action[0], high_threshold]), low_threshold]),
                              np.max([np.min([action[1], high_threshold]), low_threshold])]
            '''
            next_state, reward, done, _ = env.step(action)
            '''
            if t > 50:
                done = True
            '''

            # Keep track of the transition
            episode.append(Transition(state=state, action=action, reward=reward,
                                      next_state=next_state, done=done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            '''
            # Print out which step we're on, useful for debugging.
            print("\rStep {} @ Episode {}/{} ({})".format(
                t, i_episode + 1, num_episodes, stats.episode_rewards[i_episode - 1]), end="")
            # sys.stdout.flush()
            '''

            if done:
                break

            state = next_state

        monitor_epoch = 50
        if i_episode % monitor_epoch == 0 and i_episode > 0:
            print("avg reward : %f" % (np.mean(
                stats.episode_rewards[i_episode - monitor_epoch:i_episode])))

        # Mean episode reward serves as the baseline
        baseline_value = np.mean([cur_trans.reward for cur_trans in episode])

        # Go through the episode and make policy updates
        for t, transition in enumerate(episode):
            # The discounted return after this timestep
            total_return = sum(discount_factor**i * trans.reward
                               for i, trans in enumerate(episode[t:]))
            advantage = total_return - baseline_value
            #advantage += np.max([0, baseline_value - v_prev])

            # Update our policy estimator
            estimator_policy.update(transition.state, advantage, transition.action)
            #print(p_action)

            if i_episode % 200 == 0 and i_episode > 0:
                plt.figure(1)
                plt.plot(transition.state[0], transition.state[1], 'bo')
                if t == len(episode) - 1:
                    plt.show()

    return stats
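The update loop above recomputes the discounted return from scratch at every timestep, which is quadratic in episode length. An equivalent linear-time formulation computes all returns G_t = r_t + gamma * G_{t+1} in one backward pass and then applies the same baseline-subtracted updates; the sketch below assumes the same Transition fields and estimator_policy.update signature used above.

# Hedged sketch: linear-time return computation for the REINFORCE updates above.
import numpy as np

def reinforce_updates(episode, estimator_policy, discount_factor=1.0):
    # Mean episode reward as the baseline, as in the function above
    baseline_value = np.mean([trans.reward for trans in episode])

    # Backward pass: G_t = r_t + gamma * G_{t+1}
    returns = np.zeros(len(episode))
    running_return = 0.0
    for t in reversed(range(len(episode))):
        running_return = episode[t].reward + discount_factor * running_return
        returns[t] = running_return

    # Same baseline-subtracted policy updates as the quadratic version
    for t, transition in enumerate(episode):
        advantage = returns[t] - baseline_value
        estimator_policy.update(transition.state, advantage, transition.action)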