def solver_DFS(self):
    # If everything fits, take every item without searching.
    if self.capacity >= sum(self.weights):
        return StateGenerator(0, [1] * len(self.prices), self.prices, self.weights)
    max_index = len(self.prices)  # the number of items is constant; no need to recompute it in every loop
    root_state = StateGenerator(0, [0] * len(self.prices), self.prices, self.weights)
    best_state = root_state
    stack = [root_state]
    counter = 0
    while len(stack) > 0:
        current_state = stack.pop()
        index = current_state.index
        if current_state.value > best_state.value:
            best_state = current_state
        if index < max_index:
            if current_state.weight + self.items[index][1] <= self.capacity:
                # If the item still fits, take it
                next_taken = current_state.get_NextState()
                stack.append(next_taken)
            next_nonTaken = StateGenerator(index + 1, current_state.taken, self.prices, self.weights)
            stack.append(next_nonTaken)  # either way, try the next state without taking the item
        counter += 1
    print(counter)
    return best_state
def solver_BFS(self):
    # The only difference from DFS is that I always take the first state and remove it
    # from the front of the queue (a collections.deque with popleft() would avoid the
    # O(n) cost of list.pop(0), but the behavior is the same).
    if self.capacity >= sum(self.weights):
        return StateGenerator(0, [1] * len(self.prices), self.prices, self.weights)
    max_index = len(self.prices)  # the number of items is constant; no need to recompute it in every loop
    root_state = StateGenerator(0, [0] * len(self.prices), self.prices, self.weights)
    best_state = root_state
    queue = [root_state]
    while len(queue) > 0:
        current_state = queue.pop(0)
        if current_state.value > best_state.value:
            best_state = current_state
        index = current_state.index
        if index < max_index:
            if current_state.weight + self.items[index][1] <= self.capacity:
                # If the item still fits, take it
                next_taken = current_state.get_NextState()
                queue.append(next_taken)
            next_nonTaken = StateGenerator(index + 1, current_state.taken, self.prices, self.weights)
            queue.append(next_nonTaken)  # either way, try the next state without taking the item
    return best_state
def create_graph():
    init_logging()
    generator = StateGenerator()
    states, paths = generator.generate()
    pprint.pprint(states)
    pprint.pprint(paths)
    graph = Graph(states, paths)
    graph.render_graph()
class TestStateGenerator(unittest.TestCase):
    def setUp(self):
        prices = [5, 10, 25]
        weights = [1, 2, 3]
        wrong_prices = [1, 2]
        self.empty_state = StateGenerator(0, [0] * len(prices), prices, weights)  # empty beginning state
        self.end_state = StateGenerator(2, [0, 0, 1], prices, weights)  # state which cannot be developed further
        self.state1 = StateGenerator(1, [0, 1, 0], prices, weights)  # common state with an already explored index
        self.a = StateGenerator(0, [0] * len(prices), prices, weights)

    def test_get_NextState_true(self):
        self.assertEqual([1, 0, 0], self.empty_state.get_NextState().taken)
        self.assertEqual(1, self.empty_state.get_NextState().index)
        self.assertEqual([0, 1, 1], self.state1.get_NextState().taken)
        self.empty_state.get_NextState()  # check that the start state remains unchanged
        self.assertEqual(0, self.empty_state.index)

    def test_get_NextState_None(self):
        self.assertEqual(
            None,
            self.empty_state.get_NextState().get_NextState().get_NextState().get_NextState())
        self.assertEqual(None, self.state1.get_NextState().get_NextState())
        self.assertEqual(None, self.end_state.get_NextState())
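# The StateGenerator class itself does not appear in this section. Below is a
# hypothetical reconstruction written only so the tests above have something concrete
# to read against; every detail is inferred from the assertions, not taken from the source.
class StateGenerator:
    # One node of the knapsack search tree: `index` is the next position to decide on,
    # `taken` is a 0/1 list marking which items are currently in the knapsack.
    def __init__(self, index, taken, prices, weights):
        self.index = index
        self.taken = list(taken)  # copy, so child states never mutate their parent
        self.prices = prices
        self.weights = weights
        self.value = sum(p for p, t in zip(prices, taken) if t)
        self.weight = sum(w for w, t in zip(weights, taken) if t)

    def get_NextState(self):
        # Take the first not-yet-taken item at or after `index`; return None when
        # nothing remains to take (the "state which cannot be developed" case above).
        for i in range(self.index, len(self.taken)):
            if self.taken[i] == 0:
                new_taken = list(self.taken)
                new_taken[i] = 1
                return StateGenerator(i + 1, new_taken, self.prices, self.weights)
        return None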
def solver_optimized(self):
    # If everything fits, take every item without searching.
    if self.capacity >= sum(self.weights):
        return StateGenerator(0, [1] * len(self.items), self.prices, self.weights)
    max_index = len(self.prices)  # the number of items is constant; no need to recompute it in every loop
    root_state = StateGenerator(0, [0] * len(self.prices), self.prices, self.weights)
    best_state = root_state
    stack = [root_state]
    counter = 0  # number of states traversed; kept only to see whether the optimization has an effect on solving the problem
    while len(stack) > 0:
        current_state = stack.pop()
        index = current_state.index
        if current_state.value > best_state.value:
            best_state = current_state
        if index < max_index:
            if current_state.weight + self.items[index][1] <= self.capacity:
                # If the item still fits, take it
                next_taken = current_state.get_NextState()
                # Continue developing this state only if its upper-bound estimate
                # exceeds best_state's value
                if self.upper_bound(index + 1, next_taken.value, next_taken.weight) > best_state.value:
                    stack.append(next_taken)
                if next_taken.value > best_state.value:
                    best_state = next_taken
            # Either way, try the state where the item is not taken, pruned by its upper bound as well
            next_nonTaken = StateGenerator(index + 1, current_state.taken, self.prices, self.weights)
            if self.upper_bound(index + 1, next_nonTaken.value, next_nonTaken.weight) > best_state.value:
                stack.append(next_nonTaken)
        counter += 1
    print(counter)
    return best_state
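# solver_optimized prunes with a self.upper_bound(index, value, weight) helper that is
# not shown in this section. A common choice, sketched here as an assumption rather than
# the author's actual code, is the fractional-relaxation bound: drop the 0/1 constraint
# and greedily fill the remaining capacity by value density. Intended to live on the
# same solver class as the methods above.
def upper_bound(self, index, value, weight):
    bound = value
    remaining = self.capacity - weight
    # self.items is assumed to hold (price, weight) pairs, matching the
    # self.items[index][1] accesses in the solvers above.
    for price, w in sorted(self.items[index:], key=lambda it: it[0] / it[1], reverse=True):
        if w <= remaining:
            bound += price  # the whole item still fits
            remaining -= w
        else:
            bound += price * remaining / w  # take a fraction of the last item
            break
    return bound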
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv

from dqn_agent import DQNAgent  # assumed module name for the DQNAgent used below
from memory_db import MemoryDB
from state_generator import StateGenerator
from training_parameters import (esplison, esplison_decay, gamma, input_size,
                                 frame_size, stack_size, max_steps, render,
                                 max_episodes, sample_size, epoch,
                                 experiences_before_training,
                                 training_before_update_target, e, a, beta,
                                 beta_increment_per_sampling, capacity,
                                 max_priority)

if __name__ == "__main__":
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    print(SIMPLE_MOVEMENT, env.action_space.n)

    # get the state and action sizes from the environment
    state_size = env.observation_space.shape
    action_size = env.action_space.n

    memorydb_instance = MemoryDB(e, a, beta, beta_increment_per_sampling, capacity, max_priority)
    agent_instance = DQNAgent(input_size, action_size, esplison, esplison_decay, True)
    state_generator_instance = StateGenerator(frame_size, stack_size)

    scores, episodes = [], []
    for e in range(max_episodes):  # note: the loop variable shadows the imported PER constant e
        done = False
        score = 0
        raw_state = env.reset()
        state = state_generator_instance.get_stacked_frames(raw_state, True)
        steps = 0
        while not done and steps < max_steps:  # up to 500
            if render:
                env.render()
            steps += 1
            # get e greedy action
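# The snippet above breaks off at the epsilon-greedy step. A minimal sketch of what such
# a step usually looks like; the function name and the agent's Keras-style model attribute
# are assumptions, not the repository's actual API (the misspelled `esplison` identifier
# is kept from the source).
import numpy as np

def get_e_greedy_action(agent, state, esplison, action_size):
    if np.random.rand() <= esplison:
        return np.random.randrange(action_size)        # explore: random action
    q_values = agent.model.predict(np.array([state]))  # assumed Keras-style model
    return int(np.argmax(q_values[0]))                 # exploit: best Q-value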
class EnvWrapper():
    def __init__(self, frame_size, skip_frames, stack_size):
        self.env = gym_super_mario_bros.make('SuperMarioBrosRandomStages-v0')
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        self.agent = None
        self.frame_size = frame_size
        self.stack_size = stack_size
        self.action_size = self.env.action_space.n
        self.skip_frames = skip_frames
        self.render = False
        self.state_generator = StateGenerator(self.frame_size, self.stack_size)
        self.env.reset()
        raw_state, _, _, self.info = self.env.step(0)
        self.state = self.state_generator.get_stacked_frames(raw_state, True)
        # rollout buffers
        self.states = []
        self.policies = []
        self.actions = []
        self.rewards = []
        self.values = []
        self.dones = []
        self.episode = 0
        self.episodeReward = 0
        self.maxEpisodeReward = 0
        self.current_episode_reward = 0
        self.done = False

    def step(self, n):
        # Collect n agent steps, each spanning skip_frames emulator frames.
        for _ in range(n):
            policy, value = self.agent.get_actions_and_values(np.array([self.state]))
            action = np.random.choice(self.action_size, p=np.squeeze(policy))
            reward = 0
            for i in range(0, self.skip_frames):
                raw_state, frame_reward, done, info = self.env.step(action)
                if frame_reward == -15 or done:
                    # Death yields -15; otherwise the episode ended successfully, so
                    # reward it symmetrically with +15 (scaled by skip_frames).
                    self.episode += 1
                    done = True
                    if frame_reward == -15:
                        reward = -15 * self.skip_frames
                    else:
                        reward = 15 * self.skip_frames
                    raw_state = self.env.reset()
                    break
                else:
                    reward += frame_reward
            reward += (5 if (info["score"] - self.info["score"]) > 0 else 0)  # small bonus for gaining score
            reward /= (15 * self.skip_frames)  # normalize into roughly [-1, 1]
            self.current_episode_reward += reward
            next_state = self.state_generator.get_stacked_frames(
                raw_state, done,
                frame_reward == 15 or (done and self.episode % 100 == 0),
                self.current_episode_reward)
            self.states.append(self.state)
            self.policies.append(np.squeeze(policy))
            self.actions.append(action)
            self.rewards.append(reward)
            self.values.append(np.squeeze(value))
            self.dones.append(done)
            self.state = next_state
            self.done = done
            self.info = info
            if self.done:
                self.episodeReward = self.current_episode_reward
                if self.maxEpisodeReward < self.episodeReward:
                    self.maxEpisodeReward = self.episodeReward
                self.current_episode_reward = 0

    def get_experiences(self):
        # Bootstrap the value of the final state unless the episode ended there.
        if self.done:
            next_state_value = 0
        else:
            next_state_value = np.squeeze(self.agent.get_value(np.array([self.state])))
        states = self.states
        actions = self.actions
        policies = self.policies
        rewards = self.rewards
        values = self.values
        dones = [1 if done else 0 for done in self.dones]
        next_values = values[1:] + [next_state_value]
        # Reset the rollout buffers for the next batch.
        self.states = []
        self.policies = []
        self.actions = []
        self.rewards = []
        self.values = []
        self.dones = []
        return states, policies, actions, rewards, values, next_values, dones

    def get_action_size(self):
        return self.action_size

    def set_agent(self, agent):
        self.agent = agent

    def set_render(self, render):
        self.render = render

    def get_max_and_current_episode_reward(self):
        return self.maxEpisodeReward, self.episodeReward
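# Hypothetical usage of the wrapper for one rollout. The stand-in agent, the frame/stack
# sizes, and the 5-step rollout length are illustrative, not taken from the source; any
# object exposing get_actions_and_values(states) and get_value(states) would do.
import numpy as np

class RandomAgent:
    # Minimal stand-in agent: uniform policy, zero value estimate.
    def __init__(self, action_size):
        self.action_size = action_size

    def get_actions_and_values(self, states):
        policy = np.full((1, self.action_size), 1.0 / self.action_size)
        return policy, np.zeros(1)

    def get_value(self, states):
        return np.zeros(1)

env = EnvWrapper(frame_size=84, skip_frames=4, stack_size=4)
env.set_agent(RandomAgent(env.get_action_size()))
env.step(5)  # collect a 5-step rollout
states, policies, actions, rewards, values, next_values, dones = env.get_experiences()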
if __name__ == "__main__": tf.reset_default_graph() gpu_options = tf.GPUOptions(allow_growth=True) sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) env = gym_super_mario_bros.make('SuperMarioBrosRandomStages-v0') env = JoypadSpace(env, SIMPLE_MOVEMENT) action_size = env.action_space.n # envs[0].set_render(True) train_model = A2CAgent("train_model", False, sess, input_shape, action_size, lr, GAMMA, LAMBDA, max_grad_norm, ent_coef, vf_coef, clip_range, True) while True: state_generator = StateGenerator(frame_size, stack_size) state = state_generator.get_stacked_frames(env.reset(), True) episodes_reward = 0 while True: policy, value = train_model.get_actions_and_values(np.array([state])) action = np.random.choice(np.arange(action_size), p=np.squeeze(policy)) for i in range(0, skip_frames): env.render() raw_state, frame_reward, done, info = env.step(action) if frame_reward == -15 or done: raw_state = env.reset() break