# Assumed imports for the code in this section (ActorCritic, MinSpreadHolder,
# shift_state and get_reward are defined elsewhere in this repo):
import json
import random
from time import sleep

import numpy as np
import tensorflow as tf


class GoalController(object):
    def __init__(self,
                 state_dim,
                 action_bound=1.0,
                 final_activation=tf.identity,
                 training_batch_size=32,
                 GAMMA=0.95,
                 lr=0.001,
                 replay_buffer_size=1024):
        # The "actions" of this controller are goal states, so the action
        # dimension equals the state dimension.
        self.AC = ActorCritic(
            state_dim,
            state_dim,
            final_activation=final_activation,
            action_bound=action_bound,
            training_batch_size=training_batch_size,
            GAMMA=GAMMA,
            lr=lr,
            replay_buffer_size=replay_buffer_size)

    def add_to_replay_buffer(self, state, goal_state, reward, resulting_state):
        # Here, reward means exactly what it sounds like it does...
        self.AC.add_to_replay_buffer(state, goal_state, reward, resulting_state)

    def add_batch_to_replay_buffer(self, states, goal_states, rewards,
                                   resulting_states):
        for s, gs, r, rs in zip(states, goal_states, rewards, resulting_states):
            self.AC.add_to_replay_buffer(s, gs, r, rs)

    def train_from_replay_buffer(self):
        self.AC.train_from_replay_buffer()

    def get_goal_state(self, current_states):
        return self.AC.get_actions(current_states)
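
# Hedged usage sketch (an assumption, not code from this repo): the
# GoalController's "actions" are goal states, so get_goal_state maps a batch
# of current states to a batch of proposed goal states of the same
# dimensionality. state_dim and the zero observation below are hypothetical,
# and the (batch, state_dim) return shape assumes ActorCritic.get_actions
# returns a 2-D array, as its usage elsewhere in this repo suggests.
def _example_goal_controller(state_dim=4):
    controller = GoalController(state_dim)
    current_states = np.zeros((1, state_dim), dtype=np.float32)
    proposed_goals = controller.get_goal_state(current_states)
    return proposed_goals  # expected shape: (1, state_dim)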
class StateController(object):
    def __init__(self,
                 state_dim,
                 action_dim,
                 action_bound=0.4,
                 training_batch_size=32,
                 GAMMA=0.95,
                 lr=0.001,
                 replay_buffer_size=1024):
        # The actor sees the current state concatenated with the goal state.
        new_state_dim = 2 * state_dim
        self.state_dim = state_dim
        self.AC = ActorCritic(
            new_state_dim,
            action_dim,
            action_bound=action_bound,
            training_batch_size=training_batch_size,
            GAMMA=GAMMA,
            lr=lr,
            replay_buffer_size=replay_buffer_size)

    def get_reward(self, resulting_state, goal_state):
        # Squared distance to the goal; axis=-1 works for both a single
        # state vector and a batch of states.
        return np.sum((resulting_state - goal_state)**2, axis=-1)

    def add_to_replay_buffer(self, state, goal_state, action, resulting_state):
        combined_state = np.concatenate(
            (state, goal_state))  # combined is state plus goal
        reward = self.get_reward(resulting_state,
                                 goal_state)  # But reward is result - goal
        real_resulting_state = np.concatenate((resulting_state, goal_state))
        self.AC.add_to_replay_buffer(combined_state, action, reward,
                                     real_resulting_state)

    def add_batch_to_replay_buffer(self, states, goal_states, actions,
                                   resulting_states):
        for s, gs, a, rs in zip(states, goal_states, actions, resulting_states):
            self.add_to_replay_buffer(s, gs, a, rs)

    def train_from_replay_buffer(self):
        self.AC.train_from_replay_buffer()

    def get_actions(self, states, goal_states):
        combined_states = np.concatenate((states, goal_states), 1)
        return self.AC.get_actions(combined_states)

    def get_random_visited_state(self):
        return self.AC.get_batch(1)[0][0][0:self.state_dim]
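
# Hedged sketch (an assumption, not code from this repo): one way the two
# controllers above could be wired together in a hierarchical loop. The
# GoalController proposes a goal state for the current observation, the
# StateController picks an action that should move toward that goal, and the
# two are trained on different signals: the StateController's reward comes
# from its own squared distance to the goal, while the GoalController is fed
# the environment's reward. 'env' is a hypothetical Gym-style environment.
def _example_hierarchical_step(env, goal_controller, state_controller, obs):
    obs = np.asarray(obs)
    # Propose a goal for the current state (batch of one).
    goal = goal_controller.get_goal_state(np.asarray([obs]))[0]
    # Pick an action conditioned on state + goal.
    action = state_controller.get_actions(
        np.asarray([obs]), np.asarray([goal]))[0]
    new_obs, env_reward, done, info = env.step(action)
    new_obs = np.asarray(new_obs)
    # The state controller computes its own reward from distance to the goal.
    state_controller.add_to_replay_buffer(obs, goal, action, new_obs)
    # The goal controller is judged by the environment's reward.
    goal_controller.add_to_replay_buffer(obs, goal, env_reward, new_obs)
    return new_obs, done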
class Runner(object):
    def __init__(self, env, GOAL_STATE, GAMMA=0.95, lr=0.001):
        self.env = env
        self.GOAL_STATE = GOAL_STATE
        self.states_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.actor_critic = ActorCritic(
            self.states_dim, self.action_dim, GAMMA=GAMMA, lr=lr)
        self.min_spread_holder = MinSpreadHolder(self.states_dim)

    def render_if_true(self, render):
        if render:
            self.env.render()

    def get_reward(self, state):
        shifted_goal_state = self.shift_observation(self.GOAL_STATE)
        diff = state - shifted_goal_state
        reward = -1 * np.mean(np.multiply(diff, diff))
        return reward

    def add_observed_batch(self, obs_batch):
        self.min_spread_holder.add_batch(obs_batch)

    def shift_observation(self, obs):
        return self.min_spread_holder.transform(obs)

    def play_random_game(self, render=True, add_to_all_observations=False):
        env = self.env
        observation = env.reset()
        games_observations = []
        for t in range(1000):
            games_observations.append(observation)
            self.render_if_true(render)
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                if add_to_all_observations:
                    self.add_observed_batch(np.asarray(games_observations))
                print('Episode finished after {} timesteps'.format(t + 1))
                break

    def play_game_from_actor_with_random(self,
                                         render=True,
                                         add_to_buffer=True,
                                         prob_random=0.0):
        env = self.env
        obs = env.reset()
        games_observations = []
        for t in range(1000):
            self.render_if_true(render)
            obs = np.asarray(obs)
            games_observations.append(obs)
            shifted_obs = self.shift_observation(obs)
            action = self.actor_critic.get_actions(
                np.asarray([shifted_obs]))[0]  # I think zero.
            if not render and (random.random() < prob_random):
                action = env.action_space.sample()
            # if not render:
            #     for i in range(len(action)):
            #         if random.random() < prob_random:
            #             action[i] = (random.random() * 0.8) - 0.4
            new_obs, reward, done, info = env.step(action)
            shifted_new_obs = self.shift_observation(new_obs)
            if add_to_buffer:
                # real_reward = 0.0 if not done else -1.0
                real_reward = self.get_reward(
                    shifted_new_obs) if not done else -2.0
                self.actor_critic.add_to_replay_buffer(
                    shifted_obs, action, real_reward, shifted_new_obs)
            if done:
                self.add_observed_batch(np.asarray(games_observations))
                print('Episode finished after {} timesteps'.format(t + 1))
                break
            obs = new_obs

    def train_from_replay_buffer(self, should_print):
        losses = self.actor_critic.train_from_replay_buffer(should_print)
        return np.mean(losses)
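
# Hedged sketch (an assumption; the real MinSpreadHolder lives elsewhere in
# this repo): a minimal implementation consistent with how the class is used
# above (add_batch of observed states, transform an observation) and with the
# min/spread statistics computed by the second Runner's get_min_spread below,
# i.e. it would normalize observations to roughly [0, 1] as (obs - min) / spread.
class _MinSpreadHolderSketch(object):
    def __init__(self, state_dim, min_spread=0.05):
        self.min_obs = np.zeros(state_dim)
        self.max_obs = np.zeros(state_dim)
        self.min_spread = min_spread
        self.initialized = False

    def add_batch(self, obs_batch):
        # Track the elementwise min and max seen so far.
        batch_min = obs_batch.min(axis=0)
        batch_max = obs_batch.max(axis=0)
        if not self.initialized:
            self.min_obs, self.max_obs = batch_min, batch_max
            self.initialized = True
        else:
            self.min_obs = np.minimum(self.min_obs, batch_min)
            self.max_obs = np.maximum(self.max_obs, batch_max)

    def transform(self, obs):
        spread = np.maximum(self.max_obs - self.min_obs, self.min_spread)
        return (obs - self.min_obs) / spread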
class Runner(object):
    def __init__(self, env, GAMMA=0.5):
        self.env = env
        self.states_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.actor_critic = ActorCritic(
            self.states_dim, self.action_dim, lr=0.0000000001)
        self.all_observations = np.asarray([])

    def get_means_stddevs(self, num_games=100, min_std_dev=0.01):
        observations = []
        env = self.env
        for i in range(num_games):
            obs = env.reset()
            while True:
                observations.append(obs)
                action = env.action_space.sample()
                obs, reward, done, info = env.step(action)
                if done:
                    print('game {} done'.format(i))
                    break
        observations = np.asarray(observations)
        mean = np.mean(observations, axis=0)
        stddev = np.maximum(np.std(observations, axis=0), min_std_dev)
        return mean, stddev

    def write_mean_stddev_to_file(self, num_games=100, min_std_dev=0.01):
        mean, stddev = self.get_means_stddevs(num_games, min_std_dev)
        with open('./mujoco_data/mean_state.json', 'w') as f:
            f.write(json.dumps(mean.tolist()))
        with open('./mujoco_data/stddev_state.json', 'w') as f:
            f.write(json.dumps(stddev.tolist()))
        print('written')

    def get_min_spread(self, num_games=100, min_spread=0.05):
        observations = []
        env = self.env
        for i in range(num_games):
            obs = env.reset()
            while True:
                observations.append(obs)
                action = env.action_space.sample()
                obs, reward, done, info = env.step(action)
                if done:
                    print('game {} done'.format(i))
                    break
        observations = np.asarray(observations)
        min_obs = observations.min(axis=0)
        max_obs = observations.max(axis=0)
        spread = np.maximum(max_obs - min_obs, min_spread)
        return min_obs, spread

    def write_min_spread_to_file(self, num_games=100, min_spread=0.05):
        min_obs, spread = self.get_min_spread(num_games, min_spread)
        print(min_obs)
        print(spread)
        print(min_obs.shape, spread.shape)
        with open('./mujoco_data/min_state.json', 'w') as f:
            f.write(json.dumps(min_obs.tolist()))
        with open('./mujoco_data/spread_state.json', 'w') as f:
            f.write(json.dumps(spread.tolist()))
        print('written')

    def play_random_game(self, render=True):
        env = self.env
        observation = env.reset()
        for t in range(1000):
            if render:
                env.render()
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                print('Episode finished after {} timesteps'.format(t + 1))
                break

    def play_game_from_actor(self, render=True, add_to_buffer=True):
        env = self.env
        obs = env.reset()
        for t in range(1000):
            if render:
                env.render()
                sleep(0.05)
            obs = np.asarray(obs)
            shifted_obs = shift_state(obs)
            action = self.actor_critic.get_actions(
                np.asarray([shifted_obs]))[0]  # I think zero.
            new_obs, reward, done, info = env.step(action)
            if done:
                print('Episode finished after {} timesteps'.format(t + 1))
                break
            if add_to_buffer:
                shifted_new_obs = shift_state(new_obs)
                # real_reward = get_reward(shifted_obs, shifted_new_obs)
                real_reward = get_reward(shifted_new_obs)
                self.actor_critic.add_to_replay_buffer(
                    shifted_obs, action, real_reward, shifted_new_obs)
            obs = new_obs

    def play_game_from_actor_with_random(self,
                                         render=True,
                                         add_to_buffer=True,
                                         prob_random=0.05):
        env = self.env
        obs = env.reset()
        for t in range(1000):
            if render:
                env.render()
                sleep(0.01)
            obs = np.asarray(obs)
            shifted_obs = shift_state(obs)
            action = self.actor_critic.get_actions(
                np.asarray([shifted_obs]))[0]  # I think zero.
            if not render:
                for i in range(len(action)):
                    if random.random() < prob_random:
                        action[i] = (random.random() * 0.8) - 0.4
            # random_move = random.random() < prob_random
            # if random_move and not render:
            #     print('Random move!')
            #     action = env.action_space.sample()
            # else:
            #     action = self.actor_critic.get_actions(
            #         np.asarray([shifted_obs]))[0]  # I think zero.
            new_obs, reward, done, info = env.step(action)
            if done:
                # Debugging dump of the raw and shifted observations at the
                # end of the episode; exit() stops the run here, so the code
                # below it in this branch does not currently run.
                print(obs, '\n')
                print(new_obs, '\n')
                print(shifted_obs, '\n')
                exit()
                if add_to_buffer:
                    real_reward = -0.10
                    self.actor_critic.add_to_replay_buffer(
                        shifted_obs, action, real_reward, shifted_obs)
                print('Episode finished after {} timesteps'.format(t + 1))
                break
            if add_to_buffer:
                shifted_new_obs = shift_state(new_obs)
                # real_reward = get_reward(shifted_obs, shifted_new_obs)
                real_reward = get_reward(shifted_new_obs)
                self.actor_critic.add_to_replay_buffer(
                    shifted_obs, action, real_reward, shifted_new_obs)
            obs = new_obs

    def train_from_replay_buffer(self, should_print):
        losses = self.actor_critic.train_from_replay_buffer(should_print)
        return np.mean(losses)
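
# Hedged sketch (an assumption, not code from this repo): how this Runner
# might be driven end to end. The environment name and iteration counts are
# hypothetical; write_min_spread_to_file persists the normalization
# statistics that shift_state presumably reads back from ./mujoco_data.
def _example_training_loop(env_name='InvertedPendulum-v2',
                           num_episodes=200,
                           train_steps_per_episode=10):
    import gym  # assumed dependency, given the Gym-style env API used above
    env = gym.make(env_name)
    runner = Runner(env)
    # First, gather normalization statistics from random play.
    runner.write_min_spread_to_file(num_games=20)
    # Then alternate between acting and training from the replay buffer.
    for episode in range(num_episodes):
        # play_game_from_actor_with_random adds per-component exploration,
        # but its done-branch currently exits for debugging, so the plain
        # actor rollout is used here.
        runner.play_game_from_actor(render=False, add_to_buffer=True)
        for step in range(train_steps_per_episode):
            runner.train_from_replay_buffer(should_print=(step == 0))
    return runner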