import numpy as np
from ple import PLE
from ple.games.flappybird import FlappyBird


class Env:
    def __init__(self):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game, fps=30, display_screen=True)
        self.env.init()
        self.env.getGameState = self.game.getGameState  # maybe not necessary

        # by convention we want to use (0, 1)
        # but the game uses (None, 119)
        self.action_map = self.env.getActionSet()  # [None, 119]

    def step(self, action):
        action = self.action_map[action]
        reward = self.env.act(action)
        done = self.env.game_over()
        obs = self.get_observation()
        # don't bother returning an info dictionary like gym
        return obs, reward, done

    def reset(self):
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        # game state returns a dictionary which describes
        # the meaning of each value
        # we only want the values
        obs = self.env.getGameState()
        return np.array(list(obs.values()))

    def set_display(self, boolean_value):
        self.env.display_screen = boolean_value
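# A minimal sketch (not part of the original source) of how this Env wrapper
# might be driven with a random policy. It assumes the class above is in
# scope and that actions 0/1 index into action_map as the comments describe.
if __name__ == "__main__":
    env = Env()
    obs = env.reset()
    done = False
    while not done:
        action = np.random.randint(0, 2)  # 0 = no-op, 1 = flap
        obs, reward, done = env.step(action)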
def run_a_game(self, game):
    from ple import PLE
    p = PLE(game, display_screen=True)
    agent = NaiveAgent(p.getActionSet())
    p.init()
    reward = p.act(p.NOOP)
    for i in range(NUM_STEPS):
        obs = p.getScreenRGB()
        reward = p.act(agent.pickAction(reward, obs))
def main_naive():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True)
    my_agent = naive.NaiveAgent(allowed_actions=env.getActionSet())
    env.init()

    reward = 0.0
    nb_frames = 10000
    for i in range(nb_frames):
        if env.game_over():
            env.reset_game()
        observation = env.getScreenRGB()
        action = my_agent.pickAction(reward, observation)
        reward = env.act(action)
class PLECatcherEnv(gym.Env):
    metadata = {'render.modes': ['human', 'rgb_array']}

    def __init__(self, game_name='Catcher', display_screen=True,
                 ple_game=True, obs_type="Image", reward_type=1):
        '''
        For Catcher:
            getGameState() returns [player x position, player velocity,
                                    fruit x position, fruit y position]
        @Params:
            obs_type :
                "RAM"   : getGameState()
                "Image" : (64, 64, 3)
            reward_type :
                0 : means [reward1, reward2]
                1 : means raw reward
                2 : means change of x-axis distance from fruit
        '''
        # set headless mode
        os.environ['SDL_VIDEODRIVER'] = 'dummy'

        # open up a game state to communicate with emulator
        import importlib
        if ple_game:
            game_module_name = ('ple.games.%s' % game_name).lower()
        else:
            game_module_name = game_name.lower()
        game_module = importlib.import_module(game_module_name)
        game = getattr(game_module, game_name)()

        # use arg state_preprocessor to support self.game_state.getGameState()
        # (the old call was PLE(game, fps=30, display_screen=display_screen))
        self.game_state = PLE(game, fps=30, display_screen=display_screen,
                              state_preprocessor=self.process_state)
        self.game_state.init()
        self._action_set = self.game_state.getActionSet()
        self.action_space = spaces.Discrete(len(self._action_set))
        self.screen_height, self.screen_width = \
            self.game_state.getScreenDims()
        self.observation_space = spaces.Box(
            low=0, high=255,
            shape=(self.screen_width, self.screen_height, 3), dtype=np.uint8)
        self.viewer = None

        self.obs_type = obs_type
        self.reward_type = reward_type
        # every reward type's max-abs value
        self.rewards_ths = [1.0, 2.0]

        # change observation space:
        self.img_width = 84
        self.img_height = 84
        self.img_shape = (self.img_width, self.img_height, 3)
        if self.obs_type == "Image":
            self.observation_space = spaces.Box(low=0, high=255,
                                                shape=self.img_shape,
                                                dtype=np.uint8)
        elif self.obs_type == "RAM":
            self.observation_space = spaces.Box(low=-100.0, high=100.0,
                                                shape=(4,), dtype=np.float32)

    # state processor: wrap the game-state dict values in a 1-element array
    def process_state(self, state):
        return np.array([state.values()])

    def _step(self, a, gamma=0.99):
        if isinstance(a, np.ndarray):
            a = a[0]
        # old observation
        old_ram = self.game_state.getGameState()

        reward = self.game_state.act(self._action_set[a])

        if self.obs_type == "Image":
            state = self._get_image()
        terminal = self.game_state.game_over()

        # new observation
        ram = self.game_state.getGameState()

        if self.reward_type == 1:
            reward = reward / self.rewards_ths[0]
        # reward 2
        if self.reward_type == 2:
            reward = self.get_reward(reward, old_ram, ram, terminal, 2, gamma)
        # reward 0
        if self.reward_type == 0:
            reward1 = reward / self.rewards_ths[0]
            reward2 = self.get_reward(reward, old_ram, ram, terminal, 2, gamma)
            reward = np.array([reward1, reward2])

        # (an older reward-scaling variant, kept commented out)
        # if self.reward_type == 0:
        #     for rt in range(len(reward)):
        #         reward[rt] = reward[rt] / self.rewards_ths[rt]
        # else:
        #     reward = reward / self.rewards_ths[self.reward_type - 1]

        # obs
        if self.obs_type == "RAM":
            state = self.game_state.getGameState()
            state = np.array(list(state[0]))

        return state, reward, terminal, {}

    def get_reward(self, src_reward, old_ram, ram, done, reward_type, gamma):
        '''
        @Params:
            old_ram, ram : numpy.array, [dict_values([x, y, z, w])]
            reward_type  : 2, distance of x-axis change
        '''
        old_ram = list(old_ram[0])
        ram = list(ram[0])
        reward = src_reward
        if not done:
            if reward_type == 2:
                old_px, old_fx = old_ram[0], old_ram[2]
                px, fx = ram[0], ram[2]
                old_dis = abs(old_px - old_fx)
                dis = abs(px - fx)
                reward = old_dis - gamma * dis
                # a new epoch: the fruit respawned above the player
                old_fy, fy = old_ram[3], ram[3]
                if old_fy > fy:
                    reward = 0.0
                reward = min(reward, 2.0)
                reward = max(reward, -2.0)
                reward = (src_reward / self.rewards_ths[0]
                          + reward / self.rewards_ths[1])
        return reward

    def _get_image(self):
        # Hack to fix the rotated image returned by ple
        image_rotated = np.fliplr(np.rot90(self.game_state.getScreenRGB(), 3))
        # resize image
        img = Image.fromarray(image_rotated)
        img = img.resize((self.img_width, self.img_height), Image.ANTIALIAS)
        image_resized = np.array(img).astype(np.uint8)
        return image_resized

    @property
    def _n_actions(self):
        return len(self._action_set)

    # return: (states, observations)
    def _reset(self):
        self.observation_space = spaces.Box(
            low=0, high=255,
            shape=(self.screen_width, self.screen_height, 3), dtype=np.uint8)
        self.game_state.reset_game()
        if self.obs_type == "Image":
            state = self._get_image()
        elif self.obs_type == "RAM":
            state = self.game_state.getGameState()
            state = np.array(list(state[0]))
        return state

    def _render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return
        img = self._get_image()
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            from gym.envs.classic_control import rendering
            if self.viewer is None:
                self.viewer = rendering.SimpleImageViewer()
            self.viewer.imshow(img)

    def _seed(self, seed):
        rng = np.random.RandomState(seed)
        self.game_state.rng = rng
        self.game_state.game.rng = self.game_state.rng
        self.game_state.init()
        actionIndex = -1
        actionIndex0 = -1
    else:
        actionIndex0 = softMax(np.abs(output))  # np.argmax(output) + 1
        print(actionIndex0)
        if actionIndex0 == 0:
            if np.sign(actionIndex0) == 1:
                actionIndex = 0
            else:
                actionIndex = 3
        elif actionIndex0 == 1:
            if np.sign(actionIndex0) == 1:
                actionIndex = 1
            else:
                actionIndex = 2

    action = p.getActionSet()[actionIndex]  # ulrdn  # [115, 100, 119, 97, None]
    # myAgent.pickAction(reward, obs)
    reward = p.act(action)
    print('reward: %f' % reward)

    # points += (reward - rewardLast) - 0.0001 * dt * points
    points = ((10 + reward) / 10 + (reward - rewardLast)
              if reward > -10 else reward - rewardLast)
    print('points: %f' % points)
    ModulatorAmount = points
    if ModulatorAmount < 0:
        ModulatorAmount = 0
    ADSA.StepSynapseDynamics(dt, ModulatorAmount)
    rewardLast = reward
    NeuonNumber = 1
    newSlice = [slice(None)] * 3
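# The fragment above calls a softMax() helper that is not shown. Since the
# return value is compared against integer indices, one plausible definition,
# offered here only as a sketch (an assumption, not the original code), is a
# softmax over the activations followed by sampling an action index:
def softMax(x):
    # subtract the max for numerical stability before exponentiating
    e = np.exp(x - np.max(x))
    probs = e / e.sum()
    return np.random.choice(len(probs), p=probs)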
def train_agent(number_of_episodes):
    game = FlappyBird()
    rewards = {
        "positive": 1.0,
        "negative": 0.0,
        "tick": 0.0,
        "loss": -5.0,
        "win": 0.0
    }
    env = PLE(game=game, fps=30, display_screen=False, reward_values=rewards)

    # Reset environment at the beginning
    env.reset_game()

    training_score = 0
    max_training_score = 0
    episode_number = 1
    state_action_reward = ()
    every_100th = 1
    results = []

    while number_of_episodes > 0:
        if episode_number == 50000:
            f = open("monte_50000.txt", "w")
            f.write(str(monte_carlo_q_agent.Q_matrix))
            f.close()
            f = open("results_50000.txt", "w")
            f.write(str(results))
            f.close()

        # Get current state
        state = MonteCarloQLearningAgent.get_state(env.game.getGameState())

        # Select action in state "state"
        action = monte_carlo_q_agent.compute_action_from_q_values(state)
        if action is None:
            raise IllegalActionException("Illegal action occurred.")

        """
        After choosing an action, get the reward. The PLE environment method
        act() returns the reward the agent accumulated while performing the
        action.
        """
        reward = env.act(env.getActionSet()[action])
        training_score += reward
        max_training_score = max(training_score, max_training_score)
        game_over = env.game_over()

        # observe the result
        if state_action_reward:
            monte_carlo_q_agent.update(state_action_reward[0],
                                       state_action_reward[1], state,
                                       state_action_reward[2])
        state_action_reward = (state, action, reward)

        if game_over:
            print("===========================")
            print("Episode: " + str(episode_number))
            print("Training score: " + str(training_score))
            print("Max. training score: " + str(max_training_score))
            print("===========================\n")
            if every_100th == 100:
                results.append((episode_number, training_score))
                every_100th = 0
            episode_number += 1
            number_of_episodes -= 1
            training_score = 0
            env.reset_game()
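# MonteCarloQLearningAgent.get_state() is referenced above but not shown. A
# common approach for FlappyBird, sketched here as an assumption rather than
# the original implementation, is to coarsely discretise the game-state dict
# so the Q-matrix stays small. The dict keys are PLE's FlappyBird state keys.
def get_state(game_state, bin_size=10):
    # distance to the next pipe and height offset, bucketed into bins
    dist = int(game_state['next_pipe_dist_to_player']) // bin_size
    dy = int(game_state['player_y'] - game_state['next_pipe_top_y']) // bin_size
    vel = int(game_state['player_vel'])
    return (dist, dy, vel)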
import numpy as np
from ple import PLE
from ple.games.snake import Snake

agent = Snake(width=256, height=256)
env = PLE(agent, fps=15, force_fps=False, display_screen=True)
env.init()
actions = env.getActionSet()

q_states = {((1, 1), 0): 0}
count_q = {(0, 0): 0}
w = np.random.rand(4)
alpha = 0.5
gama = 1  # discount factor
epsilon = 0.7
steps = 1


# checked: correct :)
def compute_sprim(state, action):
    new_state = (0, 0)
    if action == 119:  # 'w' key
        new_state = (state[0], state[1] + 1)
    if action == 97:   # 'a' key
        new_state = (state[0] + 1, state[1])
    if action == 100:  # 'd' key
        new_state = (state[0] - 1, state[1])
    if action == 115:  # 's' key (completed by symmetry with the cases above)
        new_state = (state[0], state[1] - 1)
    return new_state
class CustomGameEnv(gym.Env):
    def __init__(self, task={}):
        self._task = task
        os.environ['SDL_VIDEODRIVER'] = 'dummy'
        import importlib
        game_module = importlib.import_module('ple.games.customgame')
        game = getattr(game_module, 'customgame')()
        self.game_state = PLE(game, fps=30, display_screen=False)
        self._action_set = self.game_state.getActionSet()
        self.action_space = spaces.Discrete(len(self._action_set))
        self.screen_width, self.screen_height = \
            self.game_state.getScreenDims()
        self.observation_space = spaces.Box(
            low=0, high=255,
            shape=(self.screen_width, self.screen_height, 3))
        self.num_actions = len(self._action_set)
        self.viewer = None
        self.reward_mult = 1.0

    def seed(self, seed=None):
        if not seed:
            seed = np.random.randint(2**31 - 1)
        rng = np.random.RandomState(seed)
        self.game_state.rng = rng
        self.game_state.game.rng = self.game_state.rng
        self.game_state.init()
        return [seed]

    def reset_task(self, task):
        pass

    def render(self, mode='human'):
        img = self._get_image()
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            from gym.envs.classic_control import rendering
            if self.viewer is None:
                self.viewer = rendering.SimpleImageViewer()
            self.viewer.imshow(img)

    def reset(self):
        self.observation_space = spaces.Box(
            low=0, high=255,
            shape=(self.screen_width, self.screen_height, 3))
        self.game_state.reset_game()
        state = self._get_image()
        return state

    def _get_image(self):
        # Hack to fix the rotated image returned by ple
        image_rotated = np.fliplr(np.rot90(self.game_state.getScreenRGB(), 3))
        return image_rotated

    def step(self, action):
        reward = self.reward_mult * self.game_state.act(
            self._action_set[action])
        state = self._get_image()
        terminal = self.game_state.game_over()
        return state, reward, terminal, {}
    r = p.act(self.action_set[action])
    # reward shaping: staying alive (tick reward 0) is worth 1, passing a
    # pipe (reward 1) is worth 10, and dying is heavily penalised
    if r == 0:
        r = 1
    elif r == 1:
        r = 10
    else:
        r = -1000
    return r


if __name__ == "__main__":
    episodes = 2_000_000_000
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=False)
    p.init()
    agent = Agent(p.getActionSet())
    max_score = 0

    for episode in range(episodes):
        p.reset_game()
        state = agent.get_state(game.getGameState())
        agent.update_greedy()
        while True:
            action = agent.get_best_action(state)
            reward = agent.act(p, action)
            next_state = agent.get_state(game.getGameState())
            agent.update_q_table(state, action, next_state, reward)
            current_score = p.score()
            state = next_state
            if p.game_over():
                max_score = max(current_score, max_score)
                break
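# Agent.update_q_table() is called above but not defined in this excerpt. A
# standard tabular Q-learning update, given as a sketch of what such a method
# typically looks like (the q_table dict and the alpha/gamma defaults are
# assumptions, not the original code):
def update_q_table(self, state, action, next_state, reward,
                   alpha=0.1, gamma=0.99):
    old_q = self.q_table.get((state, action), 0.0)
    # value of the best action available from the successor state
    next_q = max(self.q_table.get((next_state, a), 0.0)
                 for a in range(len(self.action_set)))
    self.q_table[(state, action)] = old_q + alpha * (
        reward + gamma * next_q - old_q)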
# create our game
force_fps = True  # run unthrottled (set False for real-time, slower speed)
display_screen = True
state_preprocessor = nv_state_preprocessor
reward = 0.0
game = WaterWorld()

# make a PLE instance.
p = PLE(game, force_fps=force_fps, display_screen=display_screen,
        state_preprocessor=state_preprocessor)

# our Naive agent!
agent = SmartAgent(actions=p.getActionSet())

# init agent and game.
p.init()

# start our loop
score = 0.0
for i in range(10):
    # if the game is over
    if p.game_over():
        p.reset_game()
    while p.game_over() == False:
        obs = p.getGameState()
        action = agent.pickAction(reward, obs)
        reward = p.act(action)  # reward after an action
    score = game.getScore()
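# nv_state_preprocessor is referenced above but never defined in this
# snippet. PLE's state_preprocessor hook receives the game-state dict and
# must return a numpy array; a minimal sketch (an assumption about the
# missing helper, not the original code) for WaterWorld could be:
import numpy as np


def nv_state_preprocessor(state):
    # keep only the scalar entries; WaterWorld's state also contains nested
    # dicts of creep positions/distances, which are skipped here
    return np.array([v for v in state.values() if np.isscalar(v)],
                    dtype=np.float32)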
class FlappyBirdGame():
    def __init__(self, reward_values={}, reward_discount=0.99, pipe_gap=100,
                 display_screen=True, fps=30, force_fps=True):
        self.game = PLE(FlappyBird(pipe_gap=pipe_gap),
                        reward_values=reward_values,
                        fps=fps,
                        force_fps=force_fps,
                        display_screen=display_screen)
        self.game.init()
        self.actions = self.game.getActionSet()
        self.reward_discount = reward_discount

    @staticmethod
    def random_agent(*args, **kwargs):
        return torch.rand(1)

    def calculate_trial_reward(self, rewards_tensor):
        # discounted present value of the rewards from each step onwards
        rewards_output = torch.empty(rewards_tensor.shape[0])
        for i in range(rewards_tensor.shape[0]):
            discount_vector = torch.Tensor(
                [self.reward_discount] * (rewards_tensor.shape[0] - i))
            pv_rewards = sum(
                rewards_tensor[i:] * discount_vector**torch.FloatTensor(
                    range(rewards_tensor.shape[0] - i)))
            rewards_output[i] = pv_rewards
        rewards_output = rewards_output.reshape((-1, 1))
        return rewards_output

    @staticmethod
    def observation_to_torch_tensor(observation):
        obs_tensor = torch.FloatTensor([
            observation['player_y'], observation['player_vel'],
            observation['next_pipe_dist_to_player'],
            observation['next_pipe_top_y'],
            observation['next_pipe_bottom_y'],
            observation['next_next_pipe_dist_to_player'],
            observation['next_next_pipe_top_y'],
            observation['next_next_pipe_bottom_y']
        ])
        obs_tensor = obs_tensor.reshape((1, 8))
        return obs_tensor

    def run_trial(self, agent=None, sample=True, verbose=False):
        if agent is None:
            agent = self.random_agent
        if self.game.game_over():
            self.game.reset_game()
        rewards = torch.empty(0)
        observations = torch.empty((0, 8))
        agent_decisions = torch.empty((0, 1))
        actual_decisions = torch.empty((0, 1))
        while not self.game.game_over():
            observation = self.observation_to_torch_tensor(
                self.game.getGameState())
            agent_decision = agent(observation)
            if sample:
                actual_decision = torch.bernoulli(agent_decision)
            else:
                actual_decision = (torch.FloatTensor([1])
                                   if agent_decision > 0.5
                                   else torch.FloatTensor([0]))
            actual_decision = actual_decision.reshape((1, 1))
            agent_decision = agent_decision.reshape((1, 1))
            if actual_decision == 1:
                action = self.actions[1]
            else:
                action = self.actions[0]
            reward = torch.FloatTensor([self.game.act(action)])
            # reward shaping
            # if (observation[0][0] < observation[0][4]) and \
            #         (observation[0][0] > observation[0][3]):
            #     reward = torch.add(reward, torch.tensor(0.2))
            # else:
            #     reward = torch.add(reward, torch.tensor(-0.2))
            rewards = torch.cat((rewards, reward))
            observations = torch.cat((observations, observation))
            agent_decisions = torch.cat((agent_decisions, agent_decision))
            actual_decisions = torch.cat((actual_decisions, actual_decision))
            if verbose:
                print(f'action: {action}')
                print(f'observation: {observation}')
                print(f'reward: {reward}')
        return {
            'observations': observations,
            'rewards': self.calculate_trial_reward(rewards),
            'agent_decisions': agent_decisions,
            'actual_decisions': actual_decisions
        }

    def run_n_trials(self, n_trials, agent=None, sample=True):
        out_results = {
            'observations': torch.empty(0),
            'rewards': torch.empty(0),
            'agent_decisions': torch.empty(0),
            'actual_decisions': torch.empty(0)
        }
        for i in range(n_trials):
            results = self.run_trial(agent, sample)
            for key in out_results:
                out_results[key] = torch.cat((out_results[key], results[key]))
        return out_results
        return self.qLearning.getAction(state)

    def incorporateFeedback(self, state, action, reward, newState):
        self.qLearning.incorporateFeedback(state, action, reward, newState)

    def printWeights(self):
        print(str(self.qLearning.getWeights()))
        print('num weights: %d' % len(self.qLearning.getWeights()))


############################################################

if __name__ == '__main__':
    game = Pixelcopter(width=200, height=200)
    env = PLE(game, fps=30, display_screen=displayScreen)
    agent = Bot(actions=env.getActionSet())
    env.init()

    total_reward = 0.0
    min_reward = float('inf')
    max_reward = float('-inf')
    all_episode_scores = []
    plot_episode_scores = []
    plotted_episodes = []

    for i in range(num_runs):  # should run until qvalues converge
        episode_reward = 0.0
        frames = []
        while not env.game_over():
            state = game.getGameState()
            obs = list(ple_env.getGameState().values())
            episode_reward += reward
            # if render:
            #     ple_env.getScreenRGB()
            if ple_env.game_over():
                break
        eval_reward.append(episode_reward)
    return np.mean(eval_reward)


# create the environment
game = Pong(cpu_speed_ratio=0.3)
# game = Pong()
pong = PLE(game, display_screen=True, force_fps=True)

# build the agent with the PARL framework
print(pong.getActionSet())
action_dim = len(pong.getActionSet())
obs_shape = len(pong.getGameState())
print(pong.getGameState())

# create the experience replay pool for DQN
rpm = ReplayMemory(MEMORY_SIZE)

model = Model(act_dim=action_dim)
algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
agent = Agent(
    algorithm,
    obs_dim=obs_shape,
    act_dim=action_dim,
    e_greed=0.1,  # pick a random action with some probability (exploration)
    e_greed_decrement=1e-6)  # reduce exploration gradually as training converges
class PLEEnv(gym.Env):
    metadata = {'render.modes': ['human', 'rgb_array']}

    def __init__(self, prespecified_game=True, game_name='MyCatcher',
                 display_screen=True, rgb_state=False):
        # open up a game state to communicate with emulator
        import importlib
        if prespecified_game:
            game_module_name = ('ple.games.%s' % game_name).lower()
        else:
            game_module_name = ('domains.ple.%s' % game_name).lower()
        game_module = importlib.import_module(game_module_name)
        self.game = getattr(game_module, game_name)()
        self.rgb_state = rgb_state
        if self.rgb_state:
            self.game_state = PLE(self.game, fps=30,
                                  display_screen=display_screen)
        else:
            if prespecified_game:
                self.game_state = PLE(
                    self.game, fps=30, display_screen=display_screen,
                    state_preprocessor=process_state_prespecified)
            else:
                self.game_state = PLE(self.game, fps=30,
                                      display_screen=display_screen,
                                      state_preprocessor=process_state)
        self.game_state.init()
        self._action_set = self.game_state.getActionSet()
        self.action_space = spaces.Discrete(len(self._action_set))
        if self.rgb_state:
            self.state_width, self.state_height = \
                self.game_state.getScreenDims()
            self.observation_space = spaces.Box(
                low=0, high=255,
                shape=(self.state_width, self.state_height, 3))
        else:
            self.state_dim = self.game_state.getGameStateDims()
            self.observation_space = spaces.Box(low=0, high=255,
                                                shape=self.state_dim)
        self.viewer = None
        self.feature_bins = []
        if hasattr(self.game, 'feature_bins'):
            self.feature_bins = self.game.feature_bins

    def get_source_state(self, state):
        if hasattr(self.game, 'get_source_state'):
            return self.game.get_source_state(state)
        return None

    def get_uniform_state_weights(self):
        if hasattr(self.game, 'get_uniform_state_weights'):
            return self.game.get_uniform_state_weights()
        else:
            states = self.get_states()
            weights = np.ones(len(states))
            weights = [float(i) / sum(weights) for i in weights]
            return states, weights

    def generate_training_subset(self, percent_sim_data):
        if hasattr(self.game, 'generate_training_subset'):
            return self.game.generate_training_subset(percent_sim_data)

    def set_to_training_set(self):
        if hasattr(self.game, 'set_to_training_set'):
            return self.game.set_to_training_set()

    def set_to_testing_set(self):
        if hasattr(self.game, 'set_to_testing_set'):
            return self.game.set_to_testing_set()

    def get_states(self):
        if hasattr(self.game, 'states'):
            return self.game.states

    def _step(self, a):
        reward = self.game_state.act(self._action_set[a])
        state = self._get_state()
        terminal = self.game_state.game_over()
        return state, reward, terminal, {}

    def _get_image(self, game_state):
        # Hack to fix the rotated image returned by ple
        image_rotated = np.fliplr(np.rot90(game_state.getScreenRGB(), 3))
        return image_rotated

    def _get_state(self):
        if self.rgb_state:
            return self._get_image(self.game_state)
        else:
            return self.game_state.getGameState()

    @property
    def _n_actions(self):
        return len(self._action_set)

    # return: (states, observations)
    def _reset(self):
        if self.rgb_state:
            self.observation_space = spaces.Box(
                low=0, high=255,
                shape=(self.state_width, self.state_height, 3))
        else:
            self.observation_space = spaces.Box(low=0, high=255,
                                                shape=self.state_dim)
        self.game_state.reset_game()
        state = self._get_state()
        return state

    def _render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return
        img = self._get_image(self.game_state)
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            from gym.envs.classic_control import rendering
            if self.viewer is None:
                self.viewer = rendering.SimpleImageViewer()
            self.viewer.imshow(img)

    def _seed(self, seed):
        rng = np.random.RandomState(seed)
        self.game_state.rng = rng
        self.game_state.game.rng = self.game_state.rng
        self.game_state.init()
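# process_state and process_state_prespecified are referenced in PLEEnv's
# constructor but not defined in this excerpt. PLE hands the preprocessor the
# game-state dict and expects an ndarray back; a minimal sketch (an
# assumption about the missing helpers, not the original code) would be:
def process_state(state):
    return np.array(list(state.values()), dtype=np.float32)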
import numpy as np
import pygame
from pygame.locals import *


class TestAgent():
    def __init__(self, actions):
        self.actions = actions

    def doAction(self, reward, obs):
        # print('hello')
        for event in pygame.event.get():
            if event.type == KEYDOWN:
                return self.actions[0]
        return None


game = RunningMinion()
# game = WaterWorld()
p = PLE(game, fps=30, display_screen=True)
agent = TestAgent(p.getActionSet())
p.init()

reward = 0.0
nb_frames = 2000
for i in range(nb_frames):
    if p.game_over():
        p.reset_game()
    if i % 1 == 0:
        obser = p.getScreenRGB()
        action = agent.doAction(reward, obser)
    reward = p.act(action)
rewards = {
    "tick": -0.1,  # each time the game steps forward in time the agent gets -0.1
    "positive": 1,  # each time the agent collects a green circle
    "negative": -5.0,  # each time the agent bumps into a red circle
}

# make a PLE instance.
# use lower fps so we can see whats happening a little easier
game = WaterWorld(width=100, height=100, num_creeps=15)
# p = PLE(game, reward_values=rewards)
p = PLE(game, fps=30, force_fps=False, display_screen=True,
        reward_values=rewards)
p.init()
actions = p.getActionSet()[:-1]
agent = Agent(len(actions))

epochs = 10000000
game_duration = 1000

for epoch in range(epochs):
    p.reset_game()
    for it in range(game_duration):
        if p.game_over():
            p.reset_game()
            print("Finished with score:" + str(p.score()))
        current_state = np.array(p.getScreenGrayscale()).reshape((10000,))
        action = agent.act(np.array([current_state]))
            return self.actions[1]
        elif fwd[1] < 0 and abs(fwd[1]) > abs(fwd[0]):
            return self.actions[2]
        elif fwd[0] < 0 and abs(fwd[0]) > abs(fwd[1]):
            return self.actions[3]
        else:
            return self.actions[4]


os.putenv('SDL_VIDEODRIVER', 'fbcon')
os.environ["SDL_VIDEODRIVER"] = "dummy"

# create our game
force_fps = True  # run unthrottled (set False for real-time, slower speed)
display_screen = False
game = WaterWorld()

# make a PLE instance.
p = PLE(game, force_fps=force_fps)

# init agent and game.
p.init()
p.display_screen = True
reward = 0
agent = MyAgent(p.getActionSet())

while p.game_over() == False:
    state = p.getGameState()
    action = agent.pickAction(reward, state)
    reward = p.act(action)

print(p.score())
catcher_dict['state_stds'] = [
    13.89457683, 2.04087944, 17.41686248, 23.38546788
]

game_params = {'cartpole': cartpole_dict, 'catcher': catcher_dict}

if __name__ == "__main__":
    # Initiate cartpole envs
    cartpole_env = gym.make('CartPole-v1')

    # Initiate catcher envs
    catcher_env = PLE(Catcher(init_lives=1),
                      state_preprocessor=process_state,
                      display_screen=False)
    catcher_env.init()
    game_params['catcher']['actions'] = catcher_env.getActionSet()

    envs = {'cartpole': cartpole_env, 'catcher': catcher_env}

    # Initialise the first task: cartpole
    curr_task = sim_params['first_task']
    env = envs[curr_task]

    # Multiple replay databases maintained if multitasking
    if train_params['multitask']:
        mem_length = train_params['replay_sizes']
    else:
        mem_length = game_params[curr_task]['memory_size']

    # Create agent
            reward = ple_env.act(action)
            obs = list(ple_env.getGameState().values())
            episode_reward += reward
            if render:
                ple_env.getScreenRGB()
            if ple_env.game_over():
                break
        eval_reward.append(episode_reward)
    return np.mean(eval_reward)


# create the environment
game = FlappyBird()
p = PLE(game, fps=30, display_screen=True, force_fps=True)
act_dim = len(p.getActionSet())
states = len(p.getGameState())

rpm = ReplayMemory(MEMORY_SIZE)
model = Model(act_dim=act_dim)
alg = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
agent = Agent(alg, obs_dim=states, act_dim=act_dim,
              e_greed_decrement=0,
              e_greed=0)  # e_greed: probability of random exploratory actions

# load the model
# if os.path.exists('./model.ckpt'):
#     agent.restore('./model.ckpt')
game = FlappyBird()
p = PLE(game, fps=30, display_screen=True, force_fps=False,
        state_preprocessor=process_state)
p.init()
game.ple = p
p.init()
# print(p.getActionSet())

# agent
action_set = p.getActionSet()
agent = RandomSearch(len(action_set), p.getGameStateDims()[1])
# agent.load("flappy1_100.h5")

nb_games = 1
nb_frames = 0
flag_game_10 = False
flag_game_100 = False
flag_game_50 = False
score_game = 0
last_50_games_score = deque(maxlen=50)

EXPLORE = 5000000  # small is 300000, big is 5000000
FINAL_EPSILON = 0.0001
class PygameLearningEnvironment(Environment):

    def __init__(self, game_name, rewards, state_as_image=True, fps=30,
                 force_fps=True, frame_skip=2, hold_action=2, visualize=False,
                 width=84, height=84, lives=1):
        """
        Initialize Pygame Learning Environment
        https://github.com/ntasfi/PyGame-Learning-Environment

        Args:
            env_name: PLE environment
            fps: frames per second
            force_fps: False for slower speeds
            frame_skip: number of env frames to skip
            hold_action: number of env frames to hold each action for
            isRGB: get color or greyscale version of statespace
                #isRGB = False,
            game_height, game_width: height and width of environment
            visualize: if set True, the program will visualize the training,
                which will slow it down
            lives: number of lives in game. Game resets on game over
                (i.e. lives = 0). Only in Catcher and Pong (score).
        """

        self.env_name = game_name
        self.rewards = rewards
        self.lives = lives
        self.state_as_image = state_as_image
        self.fps = fps  # frames per second
        self.force_fps = force_fps  # False for slower speeds
        self.frame_skip = frame_skip  # frames to skip
        self.ple_num_steps = hold_action  # frames to continue action for
        # self.isRGB = isRGB  # always returns color; let tensorforce do the processing
        self.visualize = visualize
        self.width = width
        self.height = height

        # testing
        self.reached_terminal = 0
        self.episode_time_steps = 0
        self.episode_reward = 0
        self.total_time_steps = 0

        if self.env_name == 'catcher':
            self.game = Catcher(width=self.width, height=self.height,
                                init_lives=self.lives)
        elif self.env_name == 'pixelcopter':
            self.game = Pixelcopter(width=self.width, height=self.height)
        elif self.env_name == 'pong':
            self.game = Pong(width=self.width, height=self.height,
                             MAX_SCORE=self.lives)
        elif self.env_name == 'puckworld':
            self.game = PuckWorld(width=self.width, height=self.height)
        elif self.env_name == 'raycastmaze':
            self.game = RaycastMaze(width=self.width, height=self.height)
        elif self.env_name == 'snake':
            self.game = Snake(width=self.width, height=self.height)
        elif self.env_name == 'waterworld':
            self.game = WaterWorld(width=self.width, height=self.height)
        elif self.env_name == 'monsterkong':
            self.game = MonsterKong()
        elif self.env_name == 'flappybird':
            # limitations on height and width for flappy bird
            self.game = FlappyBird(width=144, height=256)
        else:
            raise TensorForceError('Unknown Game Environement.')

        if self.state_as_image:
            process_state = None
        else:
            # create a preprocessor to read the state dictionary as a numpy array
            def process_state(state):
                # ret_value = np.fromiter(state.values(), dtype=float, count=len(state))
                ret_value = np.array(list(state.values()), dtype=np.float32)
                return ret_value

        # make a PLE instance
        self.env = PLE(self.game, reward_values=self.rewards, fps=self.fps,
                       frame_skip=self.frame_skip,
                       num_steps=self.ple_num_steps,
                       force_fps=self.force_fps,
                       display_screen=self.visualize,
                       state_preprocessor=process_state)
        # self.env.init()
        # self.env.act(self.env.NOOP)  # game starts on black screen
        # self.env.reset_game()
        # self.env.act(self.env.NOOP)
        # self.env.reset_game()

        # setup gamescreen object
        if state_as_image:
            w, h = self.env.getScreenDims()
            self.gamescreen = np.empty((h, w, 3), dtype=np.uint8)
        else:
            self.gamescreen = np.empty(self.env.getGameStateDims(),
                                       dtype=np.float32)
        # if isRGB:
        #     self.gamescreen = np.empty((h, w, 3), dtype=np.uint8)
        # else:
        #     self.gamescreen = np.empty((h, w), dtype=np.uint8)

        # setup action converter
        # PLE returns legal action indexes, convert these to just numbers
        self.action_list = self.env.getActionSet()
        self.action_list = sorted(self.action_list,
                                  key=lambda x: (x is None, x))

    def __str__(self):
        return 'PygameLearningEnvironment({})'.format(self.env_name)

    def close(self):
        pygame.quit()
        self.env = None

    def reset(self):
        # if isinstance(self.gym, gym.wrappers.Monitor):
        #     self.gym.stats_recorder.done = True
        # env.act(env.NOOP)  # need to take an action or screen is black
        # clear gamescreen
        if self.state_as_image:
            self.gamescreen = np.empty(self.gamescreen.shape, dtype=np.uint8)
        else:
            self.gamescreen = np.empty(self.gamescreen.shape,
                                       dtype=np.float32)
        self.env.reset_game()
        return self.current_state

    def execute(self, actions):
        # print("lives check in ple {}".format(self.env.lives()))
        # self.env.saveScreen("test_screen_capture_before_{}.png".format(self.total_time_steps))
        # lives_check = self.env.lives()  # testing code
        ple_actions = self.action_list[actions]
        reward = self.env.act(ple_actions)
        state = self.current_state

        # testing code
        # self.env.saveScreen("test_screen_capture_after_{}.png".format(self.total_time_steps))
        # self.episode_time_steps += 1
        # self.episode_reward += reward
        # self.total_time_steps += 1
        # print("reward is {}".format(reward))
        # if self.env.lives() != lives_check:
        #     print('lives are different is game over? {}'.format(self.env.game_over()))
        # print('lives {}, game over {}, old lives {}'.format(
        #     self.env.lives(), self.env.game_over(), lives_check))

        if self.env.game_over():
            terminal = True
            # testing code
            self.reached_terminal += 1
            # print("GAME OVER reached terminal {}".format(self.reached_terminal))
            # print("episode time steps {}, episode reward {}".format(
            #     self.episode_time_steps, self.episode_reward))
            # self.episode_reward = 0
            # self.episode_time_steps = 0
            # print("total timesteps {}".format(self.total_time_steps))
        else:
            terminal = False

        return state, terminal, reward

    @property
    def actions(self):
        return dict(type='int', num_actions=len(self.action_list),
                    names=self.action_list)

    # ALE-style alternatives kept for reference:
    # @property
    # def actions(self):
    #     return OpenAIGym.action_from_space(space=self.gym.action_space)
    # @property
    # def actions(self):
    #     return dict(type='int', num_actions=len(self.action_inds),
    #                 names=self.action_names)

    @property
    def states(self):
        return dict(shape=self.gamescreen.shape, type=float)

    @property
    def current_state(self):
        # returned state can either be an image or an np array of key components
        if self.state_as_image:
            self.gamescreen = self.env.getScreenRGB()
            # if isRGB:
            #     self.gamescreen = self.env.getScreenRGB()
            # else:
            #     self.gamescreen = self.env.getScreenGrayscale()
        else:
            self.gamescreen = self.env.getGameState()
        return np.copy(self.gamescreen)

    # ALE implementation kept for reference:
    # @property
    # def current_state(self):
    #     self.gamescreen = self.ale.getScreenRGB(self.gamescreen)
    #     return np.copy(self.gamescreen)
    # @property
    # def is_terminal(self):
    #     if self.loss_of_life_termination and self.life_lost:
    #         return True
    #     else:
    #         return self.ale.game_over()
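# A minimal sketch of how this TensorForce-style environment might be driven
# (the reward-dict keys follow PLE's convention; the explicit init() call and
# the random action choice are assumptions, not part of the original class,
# since the class leaves self.env.init() commented out):
if __name__ == "__main__":
    import numpy as np
    rewards = {"tick": 0.0, "positive": 1.0, "negative": -1.0}
    env = PygameLearningEnvironment('catcher', rewards, visualize=False)
    env.env.init()  # PLE setup
    state = env.reset()
    terminal = False
    while not terminal:
        action = np.random.randint(len(env.action_list))
        state, terminal, reward = env.execute(action)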
class PLEWaterWorldEnv(gym.Env):
    metadata = {'render.modes': ['human', 'rgb_array']}

    def __init__(self, game_name='WaterWorld', display_screen=True,
                 ple_game=True, obs_type="Image", reward_type=1):
        '''
        For WaterWorld:
            getGameState() returns [player x position, player y position,
                                    player x velocity, player y velocity,
                                    player distance to each creep]
            player distance to each creep is a dict with
            "GOOD" : [], "BAD" : []
        @Params:
            obs_type :
                "RAM"   : getGameState()
                "Image" : (48, 48, 3)
            reward_type :
                0 : means [reward1, reward2]
                1 : means raw reward
                2 : means change of
                    dis = sum(distance_from_good) - sum(distance_from_bad)
        '''
        # set headless mode
        os.environ['SDL_VIDEODRIVER'] = 'dummy'

        # open up a game state to communicate with emulator
        import importlib
        if ple_game:
            game_module_name = ('ple.games.%s' % game_name).lower()
        else:
            game_module_name = game_name.lower()
        game_module = importlib.import_module(game_module_name)
        game = getattr(game_module, game_name)()

        # use arg state_preprocessor to support self.game_state.getGameState()
        # (the old call was PLE(game, fps=30, display_screen=display_screen))
        self.game_state = PLE(game, fps=30, display_screen=display_screen,
                              state_preprocessor=self.process_state)
        self.game_state.init()
        self._action_set = self.game_state.getActionSet()
        self.action_space = spaces.Discrete(len(self._action_set))
        self.screen_height, self.screen_width = \
            self.game_state.getScreenDims()
        self.observation_space = spaces.Box(
            low=0, high=255,
            shape=(self.screen_width, self.screen_height, 3), dtype=np.uint8)
        self.viewer = None

        self.obs_type = obs_type
        self.reward_type = reward_type
        # every reward type's max-abs value
        self.rewards_ths = [10.0, 5.0]

        # change observation space:
        self.img_width = 84
        self.img_height = 84
        self.img_shape = (self.img_width, self.img_height, 3)
        if self.obs_type == "Image":
            self.observation_space = spaces.Box(low=0, high=255,
                                                shape=self.img_shape,
                                                dtype=np.uint8)
        else:
            print("Water world only supports image observation!")
            sys.exit(0)

    # state processor: wrap the game-state dict values in a 1-element array
    def process_state(self, state):
        return np.array([state.values()])

    def _step(self, a, gamma=0.99):
        # old observation
        old_ram = self.game_state.getGameState()

        reward = self.game_state.act(self._action_set[a])
        state = self._get_image()
        terminal = self.game_state.game_over()

        # new observation
        ram = self.game_state.getGameState()

        # reward 2
        if self.reward_type == 2:
            reward = self.get_reward(old_ram, ram, terminal, 2, gamma)
        # reward 0
        if self.reward_type == 0:
            reward1 = reward
            reward2 = self.get_reward(old_ram, ram, terminal, 2, gamma)
            reward = np.array([reward1, reward2])

        # reward scaling
        if self.reward_type == 0:
            for rt in range(len(reward)):
                reward[rt] = reward[rt] / self.rewards_ths[rt]
        else:
            reward = reward / self.rewards_ths[self.reward_type - 1]

        return state, reward, terminal, {}

    def get_reward(self, old_ram, ram, done, reward_type, gamma=0.99):
        '''
        @Params:
            old_ram, ram : numpy.array,
                [dict_values([x, y, z, w, {"GOOD" : [], "BAD" : []}])]
            reward_type  : 2, change of distance from good - bad
        '''
        old_ram = list(old_ram[0])
        ram = list(ram[0])
        reward = 0.0
        if not done:
            if reward_type == 2:
                old_goods = np.array(old_ram[4]["GOOD"])
                old_bads = np.array(old_ram[4]["BAD"])
                goods = np.array(ram[4]["GOOD"])
                bads = np.array(ram[4]["BAD"])
                mean_old_goods = (np.mean(old_goods)
                                  if len(old_goods) > 0 else 0.0)
                mean_old_bads = np.mean(old_bads) if len(old_bads) > 0 else 0.0
                mean_goods = np.mean(goods) if len(goods) > 0 else 0.0
                mean_bads = np.mean(bads) if len(bads) > 0 else 0.0
                old_sum_dis = mean_old_goods - mean_old_bads
                sum_dis = mean_goods - mean_bads
                reward = old_sum_dis - gamma * sum_dis
                if reward > 5.0:
                    reward = 5.0
                elif reward < -5.0:
                    reward = -5.0
        return reward

    def _get_image(self):
        # Hack to fix the rotated image returned by ple
        image_rotated = np.fliplr(np.rot90(self.game_state.getScreenRGB(), 3))
        # resize image
        img = Image.fromarray(image_rotated)
        img = img.resize((self.img_width, self.img_height), Image.ANTIALIAS)
        image_resized = np.array(img).astype(np.uint8)
        return image_resized

    @property
    def _n_actions(self):
        return len(self._action_set)

    # return: (states, observations)
    def _reset(self):
        self.observation_space = spaces.Box(
            low=0, high=255,
            shape=(self.screen_width, self.screen_height, 3), dtype=np.uint8)
        self.game_state.reset_game()
        state = self._get_image()
        return state

    def _render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return
        img = self._get_image()
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            from gym.envs.classic_control import rendering
            if self.viewer is None:
                self.viewer = rendering.SimpleImageViewer()
            self.viewer.imshow(img)

    def _seed(self, seed):
        rng = np.random.RandomState(seed)
        self.game_state.rng = rng
        self.game_state.game.rng = self.game_state.rng
        self.game_state.init()
        if jump > not_jump:
            return 0
        else:
            return 1

    def update_Q(self, s, s_prime, reward, action):
        self.Q[s[0], s[1], s[2], action] = (
            (1 - self._alpha) * self.Q[s[0], s[1], s[2], action]
            + self._alpha * (reward + self._lambda * np.max(
                self.Q[s_prime[0], s_prime[1], s_prime[2]])))


if __name__ == "__main__":
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    agent = Agent(action_space=p.getActionSet(), grid_size=10)
    p.init()

    s = agent.get_current_state(game.getGameState())
    episodes = 0
    max_score = 0

    while True:
        # Find the optimal action based on the current state
        max_action = agent.optimal_action(s)
        current_score = p.score()
        max_score = max(current_score, max_score)
        # Perform the optimal action and return the reward
class MyEnv(Environment):
    VALIDATION_MODE = 0

    def __init__(self, rng, game=None, frame_skip=4,
                 ple_options={"display_screen": True,
                              "force_fps": True, "fps": 30}):
        self._mode = -1
        self._mode_score = 0.0
        self._mode_episode_count = 0
        self._frame_skip = frame_skip if frame_skip >= 1 else 1
        self._random_state = rng
        self._hist_size = 1

        if game is None:
            raise ValueError("Game must be provided")

        self._ple = PLE(game, **ple_options)
        self._ple.init()

        self._actions = self._ple.getActionSet()
        self._state_size = self._ple.getGameStateDims()[0]
        self._state_saved = np.zeros((self._state_size), dtype=np.float32)
        self.previous_score = 0.
        self.episode_scores = []

    def reset(self, mode):
        if mode == MyEnv.VALIDATION_MODE:
            if self._mode != MyEnv.VALIDATION_MODE:
                self._mode = MyEnv.VALIDATION_MODE
                self._mode_score = 0.0
                self.episode_scores = []
                self.previous_score = .0
                self._mode_episode_count = 0
            else:
                self._mode_episode_count += 1
                self.episode_scores.append(self._mode_score -
                                           self.previous_score)
                self.previous_score = self._mode_score
        elif self._mode != -1:  # and thus mode == -1
            self._mode = -1

        # print("Dead at score {}".format(self._ple.game.getScore()))
        self._ple.reset_game()
        for _ in range(self._random_state.randint(self._hist_size)):
            self._ple.act(self._ple.NOOP)
        return [[[0] * self._state_size] * self._hist_size]

    def act(self, action):
        action = self._actions[action]
        reward = 0
        for _ in range(self._frame_skip):
            reward += self._ple.act(action)
            if self.inTerminalState():
                break
        self._state_saved = self._ple.getGameState()
        self._mode_score += reward
        return reward  # np.sign(reward)

    def summarizePerformance(self, test_data_set):
        if self.inTerminalState() == False:
            self._mode_episode_count += 1
        maxscore = (max(self.episode_scores)
                    if len(self.episode_scores) else "N/A")
        print("== Max score of episode is {} over {} episodes ==".format(
            maxscore, self._mode_episode_count))

    def inputDimensions(self):
        return [(self._hist_size, self._state_size)]

    def observationType(self, subject):
        return np.float32

    def nActions(self):
        return len(self._actions)

    def observe(self):
        return [np.array(self._state_saved)]

    def inTerminalState(self):
        return self._ple.game_over()
    def score(self, training=True, nb_episodes=10):
        reward_values = {
            "positive": 1.0,
            "negative": 0.0,
            "tick": 0.0,
            "loss": 0.0,
            "win": 0.0
        }
        env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True,
                  rng=None, reward_values=reward_values)
        env.init()

        total_episodes = nb_episodes
        score = 0
        scores = []
        while nb_episodes > 0:
            # pick an action
            state = env.game.getGameState()
            action = self.policy(state)

            # step the environment
            reward = env.act(env.getActionSet()[action])
            score += reward

            # reset the environment if the game is over
            if env.game_over() or score >= 100:
                scores.append(score)
                env.reset_game()
                nb_episodes -= 1
                score = 0
                # print(nb_episodes)

        avg_score = sum(scores) / float(len(scores))
        confidence_interval = st.t.interval(0.95, len(scores) - 1,
                                            loc=np.mean(scores),
                                            scale=st.sem(scores))
        if np.isnan(confidence_interval[0]):
            confidence_interval = (avg_score, avg_score)
        print("Games played: {}".format(total_episodes))
        print("Average score: {}".format(avg_score))
        print("95 confidence interval: {}".format(confidence_interval))

        if training:
            score_file = "{}/scores.csv".format(self.name)
            # If the file doesn't exist, add the header
            # (text append mode, since we write strings)
            if not os.path.isfile(score_file):
                with open(score_file, "a") as f:
                    f.write("avg_score,episode_count,frame_count,"
                            "interval_lower,interval_upper,min,max\n")
            # Append scores to the file
            with open(score_file, "a") as f:
                f.write("{},{},{},{},{},{},{}\n".format(
                    avg_score, self.episode_count, self.frame_count,
                    confidence_interval[0], confidence_interval[1],
                    min(scores), max(scores)))
            count = 0
            for score in scores:
                if score >= 50:
                    count += 1
            if count >= len(scores) * 0.9:
                print("*** over 50 score in {} frames ***".format(
                    self.frame_count))
                with open("pass_50.csv", "a") as f:
                    f.write("{},{}\n".format(self.name, self.frame_count))
        else:
            with open("scores.txt", "a") as f:
                for score in scores:
                    f.write("{},{}\n".format(self.name, score))
class MyEnv(Environment):
    VALIDATION_MODE = 0
    memSize = 4
    # original size is 288x512, so divide it down
    dividing_factor = 8
    width = 288 // dividing_factor
    height = 512 // dividing_factor

    def __init__(self, rng, game=None, frame_skip=4,
                 ple_options={
                     "display_screen": True,
                     "force_fps": True,
                     "fps": 30
                 }):
        self._mode = -1
        self._mode_score = 0.0
        self._mode_episode_count = 0
        self._frame_skip = frame_skip if frame_skip >= 1 else 1
        self._random_state = rng

        if game is None:
            raise ValueError("Game must be provided")

        self._ple = PLE(game, **ple_options)
        self._ple.init()

        w, h = self._ple.getScreenDims()
        self._screen = np.empty((w, h), dtype=np.uint8)
        self._reduced_screen = np.empty((self.width, self.height),
                                        dtype=np.uint8)
        self._actions = self._ple.getActionSet()

    def reset(self, mode):
        if mode == MyEnv.VALIDATION_MODE:
            if self._mode != MyEnv.VALIDATION_MODE:
                self._mode = MyEnv.VALIDATION_MODE
                self._mode_score = 0.0
                self._mode_episode_count = 0
            else:
                self._mode_episode_count += 1
        elif self._mode != -1:  # and thus mode == -1
            self._mode = -1

        print("Dead at score {}".format(self._ple.game.getScore()))
        self._ple.reset_game()
        # for _ in range(self._random_state.randint(15)):
        #     self._ple.act(self._ple.NOOP)
        # self._screen = self._ple.getScreenGrayscale()
        # cv2.resize(self._screen, (48, 48), self._reduced_screen,
        #            interpolation=cv2.INTER_NEAREST)
        return [self.memSize * [self.width * [self.height * [0]]]]

    def act(self, action):
        action = self._actions[action]
        reward = 0
        for _ in range(self._frame_skip):
            reward += self._ple.act(action)
            if self.inTerminalState():
                break
        self._screen = self._ple.getScreenGrayscale()
        self._reduced_screen = cv2.resize(self._screen,
                                          (self.height, self.width),
                                          interpolation=cv2.INTER_NEAREST)
        cv2.imshow("debug", self._reduced_screen.T)
        cv2.waitKey(1)
        self._mode_score += reward
        return np.sign(reward)

    def summarizePerformance(self, test_data_set):
        if self.inTerminalState() == False:
            self._mode_episode_count += 1
        mean = (self._mode_score / self._mode_episode_count
                if self._mode_episode_count else "N/A")
        print("== Mean score per episode is {} over {} episodes ==".format(
            mean, self._mode_episode_count))

    def inputDimensions(self):
        return [(self.memSize, self.width, self.height)]

    def observationType(self, subject):
        return np.float32

    def nActions(self):
        return len(self._actions)

    def observe(self):
        return [np.array(self._reduced_screen) / 256.]

    def inTerminalState(self):
        return self._ple.game_over()
fps = 30  # fps we want to run at
frame_skip = 2
num_steps = 1
force_fps = False  # slower speed
display_screen = True

reward = 0.0
max_noops = 20
nb_frames = 15000

# make a PLE instance.
p = PLE(game, fps=fps, frame_skip=frame_skip, num_steps=num_steps,
        force_fps=force_fps, display_screen=display_screen)

# our Naive agent!
agent = NaiveAgent(p.getActionSet())

# init agent and game.
p.init()

# lets do a random number of NOOP's
for i in range(np.random.randint(0, max_noops)):
    reward = p.act(p.NOOP)

# start our training loop
for f in range(nb_frames):
    # if the game is over
    if p.game_over():
        p.reset_game()
    obs = p.getScreenRGB()
EPSILON_DECAY = EPOCHS * STEPS_PER_EPOCHS
EPSILON_MIN = 0.01
EPSILON_DECAY_V = (EPSILON_MIN - EPSILON_START) / EPSILON_DECAY
SEED = 123456

epsilon = EPSILON_START
rng = np.random.RandomState(SEED)
game = flappy.FlappyClone()
env = PLE(game, display_screen=True, force_fps=True, fps=30,
          state_preprocessor=preprocessor, rng=rng)
env.game.rewards["positive"] = 1
# env.game.rewards["tick"] = .01

qAgent = QAgent(env.getActionSet(), [s.size for s in scalers],
                discount=.99, learningRate=.2, gridSize=GRID_SIZE,
                epsilon=epsilon)
qAgent.jFilePath = os.path.join(folder, qAgent.jFilePath)

reward = 0.
clock = pygame.time.Clock()
laststate = None
lastticks = 0
periodJump = 0
action = None
nextTest = False

for e in range(EPOCHS):
    avgloss = 0.
from ple.games.flappybird import FlappyBird
from ple import PLE
from humanagent import HumanAgent

game = FlappyBird()
p = PLE(game, fps=30, display_screen=True)
agent = HumanAgent(allowed_actions=p.getActionSet())

p.init()
reward = 0.0

nb_frames = 100
for i in range(nb_frames):
    if p.game_over():
        p.reset_game()
    observation = p.getScreenRGB()
    action = agent.pickAction(reward, observation)
    reward = p.act(action)
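# The humanagent module is not shown in this collection. A minimal sketch of
# what a keyboard-driven HumanAgent could look like (an assumption: it flaps
# while the space bar is held, otherwise returns the no-op action):
import pygame


class HumanAgent:
    def __init__(self, allowed_actions):
        self.actions = allowed_actions  # for FlappyBird: [119, None]

    def pickAction(self, reward, obs):
        pygame.event.pump()
        keys = pygame.key.get_pressed()
        if keys[pygame.K_SPACE]:
            return self.actions[0]  # flap
        return self.actions[-1]     # no-op (None)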
def launch(args, defaults, description):
    """
    Execute a complete training run.
    """
    logging.basicConfig(level=logging.INFO)
    parameters = process_args(args, defaults, description)
    rewards = {}

    try:
        module = importlib.import_module("ple.games.%s" %
                                         parameters.game.lower())
        game = getattr(module, parameters.game)
        if parameters.game == "FlappyBird":
            game = game()
        elif parameters.game == "WaterWorld":
            game = game(width=84, height=84, num_creeps=6)
        else:
            game = game(width=84, height=84)
    except:
        raise ValueError("The game %s could not be found. Try using the "
                         "classname, it is case sensitive." % parameters.game)

    if parameters.deterministic:
        rng = np.random.RandomState(123456)
    else:
        rng = np.random.RandomState()

    if parameters.cudnn_deterministic:
        theano.config.dnn.conv.algo_bwd = 'deterministic'

    env = PLE(game,
              fps=60,
              force_fps=parameters.force_fps,
              display_screen=parameters.display_screen,
              reward_values=rewards,
              rng=rng)
    num_actions = len(env.getActionSet())

    if parameters.nn_file is None:
        network = q_network.DeepQLearner(defaults.RESIZED_WIDTH,
                                         defaults.RESIZED_HEIGHT,
                                         num_actions,
                                         parameters.phi_length,
                                         parameters.discount,
                                         parameters.learning_rate,
                                         parameters.rms_decay,
                                         parameters.rms_epsilon,
                                         parameters.momentum,
                                         parameters.clip_delta,
                                         parameters.freeze_interval,
                                         parameters.batch_size,
                                         parameters.network_type,
                                         parameters.update_rule,
                                         parameters.batch_accumulator,
                                         rng)
    else:
        handle = open(parameters.nn_file, 'r')
        network = cPickle.load(handle)

    agent = ple_agent.NeuralAgent(network,
                                  parameters.epsilon_start,
                                  parameters.epsilon_min,
                                  parameters.epsilon_decay,
                                  parameters.replay_memory_size,
                                  parameters.experiment_prefix,
                                  parameters.replay_start_size,
                                  parameters.update_frequency,
                                  rng)

    experiment = ple_experiment.PLEExperiment(env, agent,
                                              defaults.RESIZED_WIDTH,
                                              defaults.RESIZED_HEIGHT,
                                              parameters.resize_method,
                                              parameters.epochs,
                                              parameters.steps_per_epoch,
                                              parameters.steps_per_test,
                                              parameters.frame_skip,
                                              parameters.death_ends_episode,
                                              parameters.max_start_nullops,
                                              rng)

    env.init()
    experiment.run()
import numpy as np
from ple import PLE
from ple.games.waterworld import WaterWorld

# lets adjust the rewards our agent receives
rewards = {
    "tick": -0.01,  # each time the game steps forward in time the agent gets -0.01
    "positive": 1.0,  # each time the agent collects a green circle
    "negative": -5.0,  # each time the agent bumps into a red circle
}

# make a PLE instance.
# use lower fps so we can see whats happening a little easier
game = WaterWorld(width=256, height=256, num_creeps=8)
# we pass in the rewards and PLE will adjust the game for us
p = PLE(game, fps=15, force_fps=False, display_screen=True,
        reward_values=rewards)

p.init()
actions = p.getActionSet()
for i in range(1000):
    if p.game_over():
        p.reset_game()

    action = actions[np.random.randint(0, len(actions))]  # random actions
    reward = p.act(action)

    print("Score: {:0.3f} | Reward: {:0.3f}".format(p.score(), reward))
class Bot():
    """
    This is our Test agent. It's gonna pick some actions after training!
    """

    def __init__(self, lr):
        self.lr = lr
        self.game = Pixelcopter(width=480, height=480)
        self.p = PLE(self.game, fps=60, display_screen=True)
        self.actions = self.p.getActionSet()

    # def pickAction(self, reward, obs):
    #     return random.choice(self.actions)

    def frame_step(self, act_inp):
        terminal = False
        reward = self.p.act(act_inp)
        if self.p.game_over():
            self.p.reset_game()
            terminal = True
            reward = -1
        else:
            reward = 1
        self.score = self.p.getScore()
        img = self.p.getScreenGrayscale()
        img = transform.resize(img, (80, 80))
        img = np.ravel(exposure.rescale_intensity(img, out_range=(0, 255)))
        return img, reward, terminal

    def build_model(self):
        print("Building the model..")
        model = Sequential()
        model.add(Convolution2D(32, 8, 8, subsample=(4, 4),
                                border_mode='same',
                                input_shape=(img_rows, img_cols,
                                             img_channels)))  # 80*80*4
        model.add(Activation('relu'))
        model.add(Convolution2D(64, 4, 4, subsample=(2, 2),
                                border_mode='same'))
        model.add(Activation('relu'))
        model.add(Convolution2D(64, 3, 3, subsample=(1, 1),
                                border_mode='same'))
        model.add(Activation('relu'))
        model.add(Flatten())
        model.add(Dense(512))
        model.add(Activation('relu'))
        model.add(Dense(2))
        adam = Adam(lr=self.lr)
        model.compile(loss='mse', optimizer=adam)
        self.model = model
        print("Finished building the model..")

    def trainNetwork(self, mode):
        D = deque()
        x_t, r_0, terminal = self.frame_step(self.actions[0])
        x_t = x_t / 255.0
        s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
        # print(s_t.shape)
        # need to reshape for keras
        s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1],
                          s_t.shape[2])  # 1*80*80*4

        if mode == 'Run':
            OBSERVE = 999999999  # we keep observing, never train
            epsilon = FINAL_EPSILON
            print("Now we load weight")
            self.model.load_weights("model.h5")
            adam = Adam(lr=self.lr)
            self.model.compile(loss='mse', optimizer=adam)
            print("Weight load successfully")
        else:  # we go to training mode
            OBSERVE = OBSERVATION
            epsilon = INITIAL_EPSILON
import numpy as np
from keras.models import load_model
from challenge_utils import process_screen
from collections import deque
from ple.games.flappybird import FlappyBird
from ple import PLE

deepQnet = load_model('model.h5')

game = FlappyBird(graphics="fixed")
p = PLE(game, fps=30, frame_skip=1, num_steps=1)
list_actions = p.getActionSet()

size_img = (80, 80)
frameDeque = deque([
    np.zeros(size_img),
    np.zeros(size_img),
    np.zeros(size_img),
    np.zeros(size_img)
], maxlen=4)


def FlappyPolicy(state, screen):
    global deepQnet
    global frameDeque
    global list_actions

    x = process_screen(screen)

    # Reinitialize the deque if we start a new game
    if not np.any(x[10:, :]):  # if everything in front of Flappy is black
        # (completed to mirror the initialisation above)
        frameDeque = deque([
            np.zeros(size_img),
            np.zeros(size_img),
            np.zeros(size_img),
            np.zeros(size_img)
        ], maxlen=4)
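# process_screen comes from challenge_utils, which is not included here. A
# plausible sketch (an assumption, not the original helper) that converts the
# frame to grayscale, crops the borders, and resizes to the 80x80 input the
# network expects; the crop indices are illustrative guesses:
from skimage.color import rgb2gray
from skimage.transform import resize


def process_screen(screen):
    gray = rgb2gray(screen)       # RGB frame -> grayscale in [0, 1]
    cropped = gray[60:, 25:310]   # drop borders / score area (assumed crop)
    return 255 * resize(cropped, (80, 80))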
class NaiveAgent():
    """
    This is our naive agent. It picks actions at random!
    """

    def __init__(self, actions):
        self.actions = actions

    def pickAction(self, reward, obs):
        return self.actions[np.random.randint(0, len(self.actions))]


###################################
game = Doom(scenario="take_cover")

env = PLE(game)
agent = NaiveAgent(env.getActionSet())
env.init()

reward = 0.0
for f in range(15000):
    # if the game is over
    if env.game_over():
        env.reset_game()

    action = agent.pickAction(reward, env.getScreenRGB())
    reward = env.act(action)

    if f > 2000:
        env.display_screen = True
        env.force_fps = False
    def train(self):
        """Train."""
        logs_path = self.args.logs_path
        video_path = self.args.video_path
        restore = self.args.restore
        train = self.args.train

        # Initial PLE environment
        os.putenv('SDL_VIDEODRIVER', 'fbcon')
        os.environ["SDL_VIDEODRIVER"] = "dummy"

        # Design reward
        reward_values = {
            "positive": 1,
            "tick": 0.1,
            "loss": -1,
        }
        # Create FlappyBird game env
        env = PLE(FlappyBird(), display_screen=False,
                  reward_values=reward_values)

        # Gets the actions FlappyBird supports
        action_set = env.getActionSet()

        replay_buffer = ReplayBuffer(self.hparams.replay_buffer_size)

        agent = Agent(action_set, self.hparams)

        # restore model
        if restore:
            agent.restore(restore)

        reward_logs = []
        loss_logs = []

        for episode in range(1, self.hparams.total_episode + 1):
            # reset env
            env.reset_game()
            env.act(0)
            obs = convert(env.getScreenGrayscale())
            state = np.stack([[obs for _ in range(4)]], axis=0)
            t_alive = 0
            total_reward = 0

            if (episode % self.hparams.save_video_frequency == 0
                    and episode > self.hparams.initial_observe_episode):
                agent.stop_epsilon()
                frames = [env.getScreenRGB()]

            while not env.game_over():
                action = agent.take_action(state)
                reward = env.act(action_set[action])
                if (episode % self.hparams.save_video_frequency == 0
                        and episode > self.hparams.initial_observe_episode):
                    frames.append(env.getScreenRGB())
                obs = convert(env.getScreenGrayscale())
                obs = np.reshape(obs, [1, 1, obs.shape[0], obs.shape[1]])
                state_new = np.append(state[:, 1:, ...], obs, axis=1)
                action_onehot = np.zeros(len(action_set))
                action_onehot[action] = 1
                t_alive += 1
                total_reward += reward
                replay_buffer.append((state, action_onehot, reward,
                                      state_new, env.game_over()))
                state = state_new

            # save video
            if (episode % self.hparams.save_video_frequency == 0
                    and episode > self.hparams.initial_observe_episode):
                os.makedirs(video_path, exist_ok=True)
                clip = make_video(frames, fps=60).rotate(-90)
                clip.write_videofile(
                    os.path.join(video_path, 'env_{}.mp4'.format(episode)),
                    fps=60)
                agent.restore_epsilon()
                print('Episode: {} t: {} Reward: {:.3f}'.format(
                    episode, t_alive, total_reward))

                # danger: inline notebook display of the latest video
                mp4list = glob.glob('./video_XXX/*.mp4')
                if len(mp4list) > 0:
                    # pick the most recently modified clip
                    latest = mp4list[0]
                    latest_timestamp = os.path.getmtime(mp4list[0])
                    for mp4 in mp4list:
                        ts = os.path.getmtime(mp4)
                        if ts > latest_timestamp:
                            latest_timestamp = ts
                            latest = mp4
                    video = io.open(latest, 'r+b').read()
                    encoded = base64.b64encode(video)
                    ipythondisplay.display(HTML(data='''
                        <video alt="test" autoplay loop controls
                               style="height: 400px;">
                          <source src="data:video/mp4;base64,{0}"
                                  type="video/mp4" />
                        </video>'''.format(encoded.decode('ascii'))))
                # end danger
                else:
                    print("Could not find video")

            if episode > self.hparams.initial_observe_episode and train:
                # save model
                if episode % self.hparams.save_logs_frequency == 0:
                    agent.save(episode, logs_path)
                    np.save(os.path.join(logs_path, 'loss.npy'),
                            np.array(loss_logs))
                    np.save(os.path.join(logs_path, 'reward.npy'),
                            np.array(reward_logs))

                # update target network
                if episode % self.hparams.update_target_frequency == 0:
                    agent.update_target_network()

                # sample batch from replay buffer
                (batch_state, batch_action, batch_reward, batch_state_new,
                 batch_over) = replay_buffer.sample(self.hparams.batch_size)

                # update policy network
                loss = agent.update_Q_network(batch_state, batch_action,
                                              batch_reward, batch_state_new,
                                              batch_over)

                loss_logs.extend([[episode, loss]])
                reward_logs.extend([[episode, total_reward]])

                # print reward and loss
                if episode % self.hparams.show_loss_frequency == 0:
                    print('Episode: {} t: {} Reward: {:.3f} Loss: {:.3f}'
                          .format(episode, t_alive, total_reward, loss))

                agent.update_epsilon()
class MyEnv(Environment):
    VALIDATION_MODE = 0

    def __init__(self, rng, game=None, frame_skip=4,
                 ple_options={"display_screen": True,
                              "force_fps": True, "fps": 30}):
        self._mode = -1
        self._mode_score = 0.0
        self._mode_episode_count = 0
        self._frameSkip = frame_skip if frame_skip >= 1 else 1
        self._random_state = rng

        if game is None:
            raise ValueError("Game must be provided")

        self._ple = PLE(game, **ple_options)
        self._ple.init()

        w, h = self._ple.getScreenDims()
        self._screen = np.empty((h, w), dtype=np.uint8)
        self._reducedScreen = np.empty((48, 48), dtype=np.uint8)
        self._actions = self._ple.getActionSet()

    def reset(self, mode):
        if mode == MyEnv.VALIDATION_MODE:
            if self._mode != MyEnv.VALIDATION_MODE:
                self._mode = MyEnv.VALIDATION_MODE
                self._mode_score = 0.0
                self._mode_episode_count = 0
            else:
                self._mode_episode_count += 1
        elif self._mode != -1:  # and thus mode == -1
            self._mode = -1

        self._ple.reset_game()
        for _ in range(self._random_state.randint(15)):
            self._ple.act(self._ple.NOOP)
        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48), self._reducedScreen,
                   interpolation=cv2.INTER_NEAREST)
        return [4 * [48 * [48 * [0]]]]

    def act(self, action):
        action = self._actions[action]
        reward = 0
        for _ in range(self._frameSkip):
            reward += self._ple.act(action)
            if self.inTerminalState():
                break
        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48), self._reducedScreen,
                   interpolation=cv2.INTER_NEAREST)
        self._mode_score += reward
        return np.sign(reward)

    def summarizePerformance(self, test_data_set):
        if self.inTerminalState() == False:
            self._mode_episode_count += 1
        print("== Mean score per episode is {} over {} episodes ==".format(
            self._mode_score / self._mode_episode_count,
            self._mode_episode_count))

    def inputDimensions(self):
        return [(4, 48, 48)]

    def observationType(self, subject):
        return np.uint8

    def nActions(self):
        return len(self._actions)

    def observe(self):
        return [np.array(self._reducedScreen)]

    def inTerminalState(self):
        return self._ple.game_over()
# coding:utf-8
from ple.games.pong import Pong
from ple import PLE
import numpy as np


def get_obs(env):
    # game_state = env.getGameState()
    # obs = list(game_state.values())
    obs = env.getScreenGrayscale() / 255.0
    return obs.astype(float).ravel()


if __name__ == '__main__':
    game = Pong(width=128, height=96, MAX_SCORE=11)
    p = PLE(game, fps=30, display_screen=True, force_fps=True)

    # build the agent with the PARL framework
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    p.getScreenGrayscale()
    game_state = p.getGameState()
    print(game_state)
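    # The observation dimension for a flattened grayscale frame follows
    # directly from the screen size; a short sketch (not in the original) of
    # how it would be derived for the DQN agents used elsewhere in this
    # collection:
    obs_dim = get_obs(p).shape[0]  # 128 * 96 = 12288 for this Pong config
    print(obs_dim)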