import numpy as np
# Assumed imports; the listings do not show their headers. SimpleImageViewer
# ships with classic gym; StatsRecorder, Model, discount and
# action_with_index are project helpers.
from gym.envs.classic_control.rendering import SimpleImageViewer


class Runner:
    def __init__(self, env, model, num_steps, discount_rate,
                 summary_frequency, performance_num_episodes,
                 summary_log_dir):
        self.env = env
        self.model = model
        self.discount_rate = discount_rate
        self.observation = env.reset()
        self.num_steps = num_steps
        self.stats_recorder = StatsRecorder(
            summary_frequency=summary_frequency,
            performance_num_episodes=performance_num_episodes,
            summary_log_dir=summary_log_dir,
            save=True)
        self.viewer = SimpleImageViewer()

    def render(self):
        # Blow the binary 80x80 observation up to an RGB image:
        # 1 -> white pixel, 0 -> black pixel.
        columns = []
        for i in range(80):
            rows = []
            for j in range(80):
                if self.observation[i][j] == 1:
                    rows.append([255, 255, 255])
                else:
                    rows.append([0, 0, 0])
            columns.append(rows)
        self.viewer.imshow(np.asarray(columns, dtype=np.uint8))

    def run(self):
        observations = []
        rewards = []
        actions = []
        terminals = []
        values = []
        for _ in range(self.num_steps):
            action_index, value = self.model.predict([self.observation])
            observations.append(self.observation)
            action = action_with_index(action_index)
            values.append(value)
            self.observation, reward, terminal = self.env.step(action)
            self.stats_recorder.after_step(reward=reward, terminal=terminal)
            rewards.append(reward)
            actions.append(action_index)
            terminals.append(terminal)
            if terminal:
                self.observation = self.env.reset()
        if not terminals[-1]:
            # The batch ended mid-episode: bootstrap the missing tail of
            # the return with the critic's estimate for the current state.
            next_value = self.model.predict_value([self.observation])[0]
            discounted_rewards = discount(rewards + [next_value],
                                          terminals + [False],
                                          self.discount_rate)[:-1]
        else:
            discounted_rewards = discount(rewards, terminals,
                                          self.discount_rate)
        self.model.train(observations, discounted_rewards, actions, values)
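None of the listings define the module-level discount helper they call. A later listing (the PySC2 runner) implements the same logic as a method; for reference, here it is as a standalone sketch: discounted returns computed backwards, resetting at episode boundaries.

import numpy as np

def discount(rewards, terminals, discount_rate):
    # Walk the batch backwards; a terminal flag cuts the return off so
    # rewards never leak across episode boundaries.
    discounted = []
    total_return = 0
    for reward, terminal in zip(rewards[::-1], terminals[::-1]):
        if terminal:
            total_return = reward
        else:
            total_return = reward + discount_rate * total_return
        discounted.append(total_return)
    return np.asarray(discounted[::-1])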
class Runner:
    def __init__(self, env, model, batch_size, timesteps, discount_rate,
                 summary_frequency, performance_num_episodes,
                 summary_log_dir):
        self.env = env
        self.model = model
        self.timesteps = timesteps
        self.discount_rate = discount_rate
        self.observation = env.reset()
        self.batch_size = batch_size
        self.stats_recorder = StatsRecorder(
            summary_frequency=summary_frequency,
            performance_num_episodes=performance_num_episodes,
            summary_log_dir=summary_log_dir,
            save=True)

    def run(self):
        batch_observations = []
        batch_rewards = []
        batch_actions = []
        batch_dones = []
        for t in range(self.timesteps + 1):
            action_index = self.model.predict_action([self.observation])[0]
            batch_observations.append(self.observation)
            action = action_with_index(action_index)
            self.observation, reward, done, info = self.env.step(action)
            if t % self.stats_recorder.summary_frequency == 0:
                print(info["starting_point"])
            self.stats_recorder.after_step(reward=reward, done=done, t=t)
            batch_rewards.append(reward)
            batch_actions.append(action_index)
            batch_dones.append(done)
            if len(batch_rewards) == self.batch_size:
                # A full batch is collected: train on the discounted
                # returns, then start an empty batch.
                discounted_rewards = discount(batch_rewards, batch_dones,
                                              self.discount_rate)
                self.model.train(batch_observations, discounted_rewards,
                                 batch_actions)
                batch_observations = []
                batch_rewards = []
                batch_actions = []
                batch_dones = []
            if done:
                self.observation = self.env.reset()
            if t % self.stats_recorder.summary_frequency == 0:
                self.model.save(0)
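A small worked example of how discount behaves at an episode boundary: the terminal flag resets the accumulated return, so rewards after the boundary do not discount back into the finished episode.

# rewards:   [1, 1, 1, 1]
# terminals: [False, True, False, False], gamma = 0.9
# backwards: G3 = 1.0, G2 = 1 + 0.9 * 1 = 1.9,
#            G1 = 1.0 (terminal -> reset), G0 = 1 + 0.9 * 1 = 1.9
discount([1, 1, 1, 1], [False, True, False, False], 0.9)
# -> array([1.9, 1. , 1.9, 1. ])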
class Runner:
    def __init__(self, env, model, batch_size, discount_rate,
                 summary_frequency, performance_num_episodes,
                 summary_log_dir):
        self.env = env
        self.model = model
        self.observation = env.reset()
        self.batch_size = batch_size
        self.states = model.initial_state
        self.terminal = False
        self.discount_rate = discount_rate
        self.stats_recorder = StatsRecorder(
            summary_frequency=summary_frequency,
            performance_num_episodes=performance_num_episodes,
            summary_log_dir=summary_log_dir,
            save=True)

    def run(self):
        observations, batch_rewards, actions, terminals = [], [], [], []
        states = self.states
        for n in range(self.batch_size):
            action_index, self.states = self.model.predict_action(
                [self.observation], self.states, [self.terminal])
            action = action_with_index(action_index)
            observations.append(self.observation)
            actions.append(action_index)
            terminals.append(self.terminal)
            self.observation, reward, self.terminal = self.env.step(action)
            self.stats_recorder.after_step(reward=reward,
                                           terminal=self.terminal)
            if self.terminal:
                self.observation = self.env.reset()
            batch_rewards.append(reward)
        terminals.append(self.terminal)
        discounted_rewards = discount(batch_rewards, terminals[1:],
                                      self.discount_rate)
        return (observations, states, discounted_rewards, terminals[:-1],
                actions)
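This recurrent variant keeps the LSTM state across batches and returns the state from the *start* of the batch, so a training step can replay the exact trajectory through the network. A hypothetical outer loop; env, model, num_updates and the train signature are assumptions, not shown in the listings:

runner = Runner(env, model, batch_size=16, discount_rate=0.99,
                summary_frequency=10_000, performance_num_episodes=100,
                summary_log_dir="logs/a2c_lstm")  # hyperparameters assumed
for update in range(num_updates):
    observations, states, discounted_rewards, terminals, actions = runner.run()
    # `terminals` lets training reset the recurrent state at episode
    # boundaries inside the batch.
    model.train(observations, states, discounted_rewards, terminals, actions)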
def main():
    global_seed(0)
    discount_rate = 0.99
    env = init_env()
    model = Model(observation_space=16, action_space=4, learning_rate=0.1,
                  discount_rate=discount_rate)
    stats_recorder = StatsRecorder(summary_frequency=10000,
                                   performance_num_episodes=100)
    observations = []
    rewards = []
    actions = []
    timesteps = 100000
    observation = env.reset()
    for t in range(timesteps):
        action = model.predict_action(observation)
        observations.append(observation)
        observation, reward, done, _ = env.step(action)
        rewards.append(reward)
        actions.append(action)
        stats_recorder.after_step(reward=reward, done=done, t=t)
        if done:
            # Monte-Carlo update at the end of each episode: every visited
            # state is trained on the return observed from that step on.
            # (Loop variables renamed so they no longer shadow the outer
            # observation/action.)
            for i, (obs, act) in enumerate(zip(observations, actions)):
                discounted_reward = discount(rewards, discount_rate, i)
                model.train(obs, discounted_reward, act, i)
            observations = []
            rewards = []
            actions = []
            observation = env.reset()
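The tabular listing calls discount with a different signature than the batch runners: here it returns the single discounted return from step i to the end of the episode. A minimal sketch of that variant, inferred from the call site:

def discount(rewards, discount_rate, i):
    # G_i = sum_k gamma^k * r_{i+k}, the return from step i onwards.
    total_return = 0.0
    for k, reward in enumerate(rewards[i:]):
        total_return += (discount_rate ** k) * reward
    return total_return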
def main():
    raise NotImplementedError  # Actor-Critic algorithm not working yet
    global_seed(0)
    discount_rate = 0.99
    env = init_env()
    model = Model(observation_space=16, action_space=4, learning_rate=0.01)
    stats_recorder = StatsRecorder(summary_frequency=10000,
                                   performance_num_episodes=1000)
    I = 1  # gamma^t, the per-episode discounting of the policy gradient
    observation = env.reset()
    last_value = model.predict_value(observation)
    timesteps = 100000
    for t in range(timesteps):
        action = model.predict_action(observation)  # A_t given S_t
        next_observation, reward, done, _ = env.step(action)  # S_t+1, R_t+1
        stats_recorder.after_step(reward=reward, done=done, t=t)
        value = model.predict_value(next_observation)  # v(S_t+1)
        if done:
            td_target = reward  # no bootstrapping past a terminal state
        else:
            td_target = reward + discount_rate * value
        td_error = td_target - last_value
        model.train(observation, td_error, action, I)
        observation = next_observation
        last_value = value
        I = discount_rate * I
        if done:
            I = 1
            observation = env.reset()
            last_value = model.predict_value(observation)
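For reference, the one-step actor-critic update this loop follows (Sutton and Barto's episodic formulation, which the S_t/R_t+1 comments already track); model.train(observation, td_error, action, I) is assumed to apply both gradient steps:

# delta  = R_{t+1} + gamma * v(S_{t+1}) - v(S_t)          TD error
#          (with v(S_{t+1}) := 0 if S_{t+1} is terminal)
# w     <- w + alpha_w * delta * grad_w v(S_t)            critic
# theta <- theta + alpha_theta * I * delta
#          * grad_theta ln pi(A_t | S_t, theta)           actor
# I     <- gamma * I                                      decays within an episode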
class Runner:
    def __init__(self, env, model, batch_size, timesteps, discount_rate,
                 summary_frequency, performance_num_episodes,
                 summary_log_dir):
        self.env = env
        self.model = model
        self.timesteps = timesteps
        self.discount_rate = discount_rate
        self.observation = env.reset()
        self.batch_size = batch_size
        self.stats_recorder = StatsRecorder(
            summary_frequency=summary_frequency,
            performance_num_episodes=performance_num_episodes,
            summary_log_dir=summary_log_dir,
            save=True)
        self.viewer = SimpleImageViewer()

    def render(self):
        # Same pixel-by-pixel upscaling of the 80x80 binary observation
        # as in the first Runner listing.
        columns = []
        for i in range(80):
            rows = []
            for j in range(80):
                if self.observation[i][j] == 1:
                    rows.append([255, 255, 255])
                else:
                    rows.append([0, 0, 0])
            columns.append(rows)
        self.viewer.imshow(np.asarray(columns, dtype=np.uint8))

    def run(self):
        observations = []
        rewards = []
        actions = []
        terminals = []
        for t in range(self.timesteps + 1):
            action_index = self.model.predict_action([self.observation])
            observations.append(self.observation)
            action = action_with_index(action_index)
            self.observation, reward, terminal = self.env.step(action)
            self.stats_recorder.after_step(reward=reward, terminal=terminal)
            rewards.append(reward)
            actions.append(action_index)
            terminals.append(terminal)
            if len(rewards) == self.batch_size:
                discounted_rewards = discount(rewards, terminals,
                                              self.discount_rate)
                self.model.train(observations, discounted_rewards, actions)
                observations = []
                rewards = []
                actions = []
                terminals = []
            if terminal:
                self.observation = self.env.reset()
            if t % self.stats_recorder.summary_frequency == 0:
                self.model.save(0)
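The nested loops in render can be replaced by a vectorized equivalent; a drop-in sketch that produces the same uint8 RGB frame:

def render(self):
    # 0/1 observation -> 0/255 grayscale, then repeat into three channels.
    frame = np.asarray(self.observation, dtype=np.uint8) * 255
    self.viewer.imshow(np.repeat(frame[:, :, None], 3, axis=2))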
# Assumed imports for the PySC2 listings (not shown in the original):
import numpy as np
from pysc2.lib import actions
from torch.utils.tensorboard import SummaryWriter  # assumption; any
# tensorboard-compatible SummaryWriter fits the usage below.


class Runner:
    def __init__(self, env, model, num_steps, advantage_estimator_gamma,
                 advantage_estimator_lambda, summary_frequency,
                 performance_num_episodes, summary_log_dir):
        self.env = env
        self.model = model
        self.file_writer = SummaryWriter(summary_log_dir)
        self.performance_num_episodes = performance_num_episodes
        self.observation, self.available_actions_mask = self.env.reset()
        self.stats_recorder = StatsRecorder(
            summary_frequency=summary_frequency,
            performance_num_episodes=performance_num_episodes,
            summary_log_dir=summary_log_dir,
            save=True)
        self.gae_gamma = advantage_estimator_gamma
        self.gae_lambda = advantage_estimator_lambda
        self.terminal = False
        self.num_steps = num_steps
        self.advantage_estimation = 0

    def estimate_advantage(self, t, terminal, next_value):
        # One step of the backward GAE recursion. `terminal` refers to
        # s_{t+1}: if the episode ended there, the value bootstrap and
        # the running advantage are dropped, so
        # delta = r_t - V(s_t) at an episode boundary.
        if terminal:
            delta = self.rewards[t] - self.values[t]
            return delta
        else:
            delta = (self.rewards[t] + self.gae_gamma * next_value
                     - self.values[t])
            return (delta + self.gae_gamma * self.gae_lambda
                    * self.advantage_estimation)

    def index_to_2d(self, action_spatial):
        # Map a flat index in the coarse spatial-action grid back to
        # screen coordinates.
        position = np.unravel_index(action_spatial,
                                    self.model.spatial_resolution)
        if position[0] == 0:
            x = 0
        else:
            x = (position[0] * (self.env.observation_space[0]
                                / (self.model.spatial_resolution[0] - 1))) - 1
        if position[1] == 0:
            y = 0
        else:
            y = (position[1] * (self.env.observation_space[0]
                                / (self.model.spatial_resolution[1] - 1))) - 1
        return x, y

    def make_action_function(self, action, args):
        return actions.FunctionCall(action.id, args)

    def make_action(self, action_id, spatial_index):
        action = self.env.actions[action_id]
        if action == actions.FUNCTIONS.select_army:
            return actions.FUNCTIONS.select_army("select"), False
        elif action == actions.FUNCTIONS.Move_screen:
            x, y = self.index_to_2d(spatial_index)
            return self.make_action_function(actions.FUNCTIONS.Move_screen,
                                             [[0], [x, y]]), True
        else:
            raise NotImplementedError

    def run(self):
        observations = []
        self.rewards = []
        actions = []
        actions_spatial = []
        actions_spatial_mask = []
        available_actions = []
        batch_dones = []
        self.values = []
        probs_spatial = []
        probs = []
        for _ in range(self.num_steps):
            observations.append(self.observation)
            (action_ids, spatial_indexes, value,
             prob, prob_spatial) = self.model.predict(
                np.asarray([self.observation]).swapaxes(0, 1),
                [self.available_actions_mask])
            self.values.append(value)
            probs.append(prob)
            probs_spatial.append(prob_spatial)
            batch_dones.append(self.terminal)
            action, spatial_mask = self.make_action(action_ids[0],
                                                    spatial_indexes[0])
            actions.append(action_ids[0])
            actions_spatial.append(spatial_indexes[0])
            actions_spatial_mask.append(spatial_mask)
            available_actions.append(self.available_actions_mask)
            (self.observation, reward, self.terminal,
             self.available_actions_mask) = self.env.step(action)
            self.stats_recorder.after_step(reward=reward,
                                           terminal=self.terminal)
            self.rewards.append(reward)
        advantage_estimations = np.zeros_like(self.rewards)
        last_value = self.model.predict_value(self.observation)[0]
        for t in reversed(range(self.num_steps)):
            if t == self.num_steps - 1:
                self.advantage_estimation = self.estimate_advantage(
                    t, self.terminal, last_value)
            else:
                self.advantage_estimation = self.estimate_advantage(
                    t, batch_dones[t + 1], self.values[t + 1])
            advantage_estimations[t] = self.advantage_estimation
        observations = np.asarray(observations).swapaxes(0, 1)
        return (observations, actions, available_actions, actions_spatial,
                actions_spatial_mask, advantage_estimations, self.values,
                probs, probs_spatial)
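The same generalized advantage estimation recursion (Schulman et al., 2015), restated as one self-contained helper; the function name and argument names are illustrative, not part of the listings:

import numpy as np

def generalized_advantage_estimate(rewards, values, last_value,
                                   next_is_terminal, gamma, lam):
    # next_is_terminal[t] says whether s_{t+1} ended the episode;
    # last_value = V(s_T) bootstraps the final step of the batch.
    advantages = np.zeros(len(rewards), dtype=np.float32)
    advantage = 0.0
    for t in reversed(range(len(rewards))):
        bootstrap = last_value if t == len(rewards) - 1 else values[t + 1]
        if next_is_terminal[t]:
            # Episode boundary: no bootstrap, running advantage resets.
            advantage = rewards[t] - values[t]
        else:
            delta = rewards[t] + gamma * bootstrap - values[t]
            advantage = delta + gamma * lam * advantage
        advantages[t] = advantage
    return advantages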
class Runner:
    def __init__(self, env, model, batch_size, discount_rate,
                 summary_log_dir, summary_frequency,
                 performance_num_episodes):
        self.env = env
        self.batch_size = batch_size
        self.discount_rate = discount_rate
        self.model = model
        self.file_writer = SummaryWriter(summary_log_dir)
        self.save_summary_steps = summary_frequency
        self.performance_num_episodes = performance_num_episodes
        self.observation, self.available_actions_mask = self.env.reset()
        self.stats_recorder = StatsRecorder(
            summary_frequency=summary_frequency,
            performance_num_episodes=performance_num_episodes,
            summary_log_dir=summary_log_dir,
            save=True)

    def discount(self, rewards, terminals, discount_rate):
        # Discounted returns that reset at episode boundaries; same logic
        # as the module-level helper near the top.
        discounted = []
        total_return = 0
        for reward, terminal in zip(rewards[::-1], terminals[::-1]):
            if terminal:
                total_return = reward
            else:
                total_return = reward + discount_rate * total_return
            discounted.append(total_return)
        return np.asarray(discounted[::-1])

    def index_to_2d(self, action_spatial):
        position = np.unravel_index(action_spatial,
                                    self.model.spatial_resolution)
        if position[0] == 0:
            x = 0
        else:
            x = (position[0] * (self.env.observation_space[0]
                                / (self.model.spatial_resolution[0] - 1))) - 1
        if position[1] == 0:
            y = 0
        else:
            y = (position[1] * (self.env.observation_space[0]
                                / (self.model.spatial_resolution[1] - 1))) - 1
        return x, y

    def make_action_function(self, action, args):
        return actions.FunctionCall(action.id, args)

    def make_action(self, action_id, spatial_index):
        action = self.env.actions[action_id]
        if action == actions.FUNCTIONS.select_army:
            return actions.FUNCTIONS.select_army("select"), False
        elif action == actions.FUNCTIONS.Move_screen:
            x, y = self.index_to_2d(spatial_index)
            return self.make_action_function(actions.FUNCTIONS.Move_screen,
                                             [[0], [x, y]]), True
        else:
            raise NotImplementedError

    def run(self):
        observations = []
        rewards = []
        actions = []
        actions_spatial = []
        actions_spatial_mask = []
        available_action_masks = []
        terminals = []
        for _ in range(self.batch_size):
            observations.append(self.observation)
            action_ids, spatial_indexes = self.model.predict(
                np.asarray([self.observation]).swapaxes(0, 1),
                [self.available_actions_mask])
            action, spatial_mask = self.make_action(action_ids[0],
                                                    spatial_indexes[0])
            actions.append(action_ids[0])
            actions_spatial.append(spatial_indexes[0])
            actions_spatial_mask.append(spatial_mask)
            available_action_masks.append(self.available_actions_mask)
            (self.observation, reward, terminal,
             self.available_actions_mask) = self.env.step(action)
            self.stats_recorder.after_step(reward=reward, terminal=terminal)
            rewards.append(reward)
            terminals.append(terminal)
        rewards = self.discount(rewards, terminals, self.discount_rate)
        observations = np.asarray(observations).swapaxes(0, 1)
        self.model.train(observations=observations,
                         actions=actions,
                         available_actions_masks=available_action_masks,
                         actions_spatial=actions_spatial,
                         actions_spatial_masks=actions_spatial_mask,
                         rewards=rewards)
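PySC2 expects FunctionCall objects like the ones make_action builds. For instance, moving the selected units to screen position (23, 42) with the queued flag unset:

from pysc2.lib import actions

move = actions.FunctionCall(actions.FUNCTIONS.Move_screen.id,
                            [[0], [23, 42]])  # [queued], [x, y]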
class Runner:
    def __init__(self, env, model, num_steps, advantage_estimator_gamma,
                 advantage_estimator_lambda, summary_frequency,
                 performance_num_episodes, summary_log_dir):
        self.gae_lambda = advantage_estimator_lambda
        self.gae_gamma = advantage_estimator_gamma
        self.stats_recorder = StatsRecorder(
            summary_frequency=summary_frequency,
            performance_num_episodes=performance_num_episodes,
            summary_log_dir=summary_log_dir,
            save=True)
        self.env = env
        self.model = model
        self.observation = env.reset()
        self.num_steps = num_steps
        self.terminal = False
        self.rewards = []
        self.values = []
        self.advantage_estimation = 0

    def estimate_advantage(self, t, terminal, next_value):
        # Same backward GAE step as in the PySC2 runner above:
        # delta = r_t - V(s_t) at an episode boundary.
        if terminal:
            delta = self.rewards[t] - self.values[t]
            return delta
        else:
            delta = (self.rewards[t] + self.gae_gamma * next_value
                     - self.values[t])
            return (delta + self.gae_gamma * self.gae_lambda
                    * self.advantage_estimation)

    def run(self):
        observations, actions, terminals, log_probs = [], [], [], []
        self.rewards = []
        self.values = []
        for _ in range(self.num_steps):
            action_index, value, log_prob = self.model.predict(
                self.observation)
            observations.append(self.observation)
            actions.append(action_index)
            self.values.append(value)
            log_probs.append(log_prob)
            terminals.append(self.terminal)
            action = action_with_index(action_index)
            self.observation, reward, self.terminal = self.env.step(action)
            if self.terminal:
                self.observation = self.env.reset()
            self.stats_recorder.after_step(reward, self.terminal)
            self.rewards.append(reward)
        actions = np.asarray(actions)
        self.values = np.asarray(self.values)
        log_probs = np.asarray(log_probs)
        last_value = self.model.predict_value(self.observation)
        advantage_estimations = np.zeros_like(self.rewards)
        self.advantage_estimation = 0
        for t in reversed(range(self.num_steps)):
            if t == self.num_steps - 1:
                self.advantage_estimation = self.estimate_advantage(
                    t, self.terminal, last_value)
            else:
                self.advantage_estimation = self.estimate_advantage(
                    t, terminals[t + 1], self.values[t + 1])
            advantage_estimations[t] = self.advantage_estimation
        return (np.asarray(observations), advantage_estimations, terminals,
                actions, self.values, log_probs)
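Since this runner also returns value estimates and log-probabilities, it fits a PPO-style update. A hypothetical outer loop; num_updates and the train signature are assumptions:

for update in range(num_updates):
    (observations, advantages, terminals,
     actions, values, log_probs) = runner.run()
    # Value targets for the critic: GAE advantage plus the old estimate.
    returns = advantages + values
    model.train(observations, returns, advantages, actions, log_probs)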