def main(argv):
    try:
        opts, _ = getopt.getopt(argv, "hr")
    except getopt.GetoptError:
        print("birdML.py [-h | -r]")
        sys.exit(2)
    record = False
    for opt, arg in opts:
        if opt == '-h':
            print("-h to help")
            print("-r record")
        elif opt == '-r':
            record = True
    netb = netBrain()
    netb.summary()
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True, force_fps=True)
    p.init()
    actions = p.getActionSet()
    out = 1
    epochs = 50
    for i in range(epochs):
        lstates = []
        rewards = []
        if record:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter('Videos/test_' + str(i) + '.mov', fourcc, 30.0, (288, 512))
        for d in range(10):
            while not p.game_over():
                if record:
                    obs = p.getScreenRGB()
                    obs = cv2.transpose(obs)
                    obs = cv2.cvtColor(obs, cv2.COLOR_RGB2BGR)
                    out.write(obs)
                st = game.getGameState()
                gstate = list(st.values())
                gstate = np.array([np.array(gstate)])
                lstates.append(gstate[0])
                pred = netb.predict(gstate)[0]
                a = pred.argmax()
                p.act(actions[a])
                if st['next_pipe_bottom_y'] < st['player_y']:
                    pred[0] = 1.0
                    pred[1] = 0.0
                elif st['next_pipe_top_y'] > st['player_y']:
                    pred[0] = 0.0
                    pred[1] = 1.0
                rewards.append(pred)
            p.reset_game()
        netb.fit(np.array(lstates), np.array(rewards), batch_size=10, epochs=10)
        if record:
            out.release()

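# main() above builds its policy network with netBrain(), which is not defined in this
# snippet. A minimal sketch of what such a network could look like, assuming the 8-value
# FlappyBird state vector and 2 actions; the layer sizes, library choice (tf.keras), and
# the name net_brain_sketch are assumptions, not the original implementation.
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

def net_brain_sketch():
    """Small dense network mapping the 8-dim game state to 2 action scores."""
    model = Sequential([
        Dense(64, activation="relu", input_shape=(8,)),
        Dense(64, activation="relu"),
        Dense(2, activation="softmax"),
    ])
    model.compile(optimizer="adam", loss="mse")
    return model
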
def __init__(self):
    self.model = Model()
    self.game = FlappyBird(pipe_gap=125)
    self.env = PLE(self.game, fps=30, display_screen=False)
    self.env.init()
    self.env.getGameState = self.game.getGameState
    self.es = EvolutionStrategy(self.model.get_weights(), self.get_reward,
                                self.POPULATION_SIZE, self.SIGMA, self.LEARNING_RATE)
    self.exploration = self.INITIAL_EXPLORATION

def train(FRAME_TRAIN=1000005):
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    p.init()
    ob = game.getGameState()
    state = ob
    state = np.reshape(np.asarray(list(state.values())), [1, 8])
    total_reward = 0
    agent = DDQN_Agent.DeepQAgent()
    agent.load('model95000')
    batch_size = 32
    my_timer = time.time()
    prev_frame = 0
    data = []
    for i in range(FRAME_TRAIN):
        if p.game_over():
            data.append(total_reward)
            p.reset_game()
            print("Total reward = {}, Frame = {}, epsilon = {}, frame/second = {}".format(
                total_reward, i, agent.epsilon,
                (i - prev_frame) / (time.time() - my_timer)))
            total_reward = 0
            prev_frame = i
            my_timer = time.time()
        # get action from agent
        action = agent.act(state)
        # take action
        reward = p.act(p.getActionSet()[action])
        # making the reward space less sparse
        if reward < 0:
            reward = -1
        total_reward += reward
        next_state = np.asarray(list(game.getGameState().values()))
        next_state = np.reshape(next_state, [1, 8])
        # remember and replay
        agent.remember(state, action, reward, next_state, p.game_over())
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
        state = next_state
        # save model
        if i % 5000 == 0:
            print("Updating weights")
            agent.save('newmodel' + str(i))
            agent.target_model.set_weights(agent.model.get_weights())
        # plot score
        if i % 1000 == 0:
            plot(data)

def prepare_game():
    asset_dir = "../assets"
    game = FlappyBird()
    for c in game.images["player"]:
        image_assets = [
            os.path.join(asset_dir, "bird-upflap.png"),
            os.path.join(asset_dir, "bird-midflap.png"),
            os.path.join(asset_dir, "bird-downflap.png"),
        ]
        game.images["player"][c] = [pygame.image.load(im).convert_alpha() for im in image_assets]
    for b in game.images["background"]:
        game.images["background"][b] = pygame.image.load(os.path.join(asset_dir, "background.png")).convert()
    for c in ["red", "green"]:
        path = os.path.join(asset_dir, "pipe.png")
        game.images["pipes"][c] = {}
        game.images["pipes"][c]["lower"] = pygame.image.load(path).convert_alpha()
        game.images["pipes"][c]["upper"] = pygame.transform.rotate(game.images["pipes"][c]["lower"], 180)
    game.images["base"] = pygame.image.load(os.path.join(asset_dir, "base.png")).convert()
    return game

def run_game(nb_episodes, agent):
    """Runs nb_episodes episodes of the game with agent picking the moves.

    An episode of FlappyBird ends with the bird crashing into a pipe or going off screen.
    """
    reward_values = {
        "positive": 1.0,
        "negative": 0.0,
        "tick": 0.0,
        "loss": 0.0,
        "win": 0.0
    }
    # TODO: when training, use the following instead:
    # reward_values = agent.reward_values
    env = PLE(FlappyBird(), fps=30, display_screen=True, force_fps=True, rng=None,
              reward_values=reward_values)
    # TODO: to speed up training, change the PLE parameters as follows:
    # display_screen=False, force_fps=True
    env.init()
    score = 0
    tot_nb_episodes = nb_episodes
    average = 0
    highscore = 0
    over_50_count = 0
    while nb_episodes > 0:
        # pick an action
        # TODO: for training, use agent.training_policy instead
        state, ignore = agent.state_binner(env.game.getGameState())
        action = agent.policy(state)
        # step the environment
        reward = env.act(env.getActionSet()[action])
        # print("reward=%d" % reward)
        # TODO: for training, let the agent observe the current state transition
        score += reward
        # reset the environment if the game is over
        if env.game_over() or score >= 60:
            average += score
            if score > highscore:
                highscore = score
            if score >= 50:
                over_50_count += 1
            print("score for this episode: %d" % score)
            env.reset_game()
            nb_episodes -= 1
            score = 0
    print("Average for {} runs {:.2f}".format(tot_nb_episodes, average / tot_nb_episodes))
    over_50_p = (over_50_count / tot_nb_episodes) * 100
    print("The percentage of scores over 50 is: %d" % over_50_p)
    return over_50_p

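# run_game() above (and the train() variants below) call agent.state_binner() without
# showing it. A minimal, hypothetical sketch of such a binner: it coarsely discretizes the
# continuous PLE game state so a tabular agent can index it. The bucket sizes, the chosen
# features, and the name state_binner_sketch are assumptions, not the original agents' code.
def state_binner_sketch(game_state, y_bin=15, dist_bin=30, vel_bin=4):
    """Map the continuous FlappyBird state dict to a small discrete tuple."""
    dy = game_state["next_pipe_top_y"] - game_state["player_y"]  # vertical offset to the gap
    dist = game_state["next_pipe_dist_to_player"]                # horizontal distance to the pipe
    vel = game_state["player_vel"]                               # vertical velocity of the bird
    return (int(dy // y_bin), int(dist // dist_bin), int(vel // vel_bin))
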
def __init__(self, screen=False, forcefps=True):
    self.game = FlappyBird(pipe_gap=125)
    self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
    self.env.init()
    self.env.getGameState = self.game.getGameState

    def conv_layer(x, conv, stride=1):
        return tf.nn.conv2d(x, conv, [1, stride, stride, 1], padding='SAME')

    def pooling(x, k=2, stride=2):
        return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, stride, stride, 1], padding='SAME')

    self.X = tf.placeholder(tf.float32, [None, 80, 80, 4])
    self.Y = tf.placeholder(tf.float32, [None, self.OUTPUT_SIZE])
    w_conv1 = tf.Variable(tf.truncated_normal([8, 8, 4, 32], stddev=0.1))
    conv1 = tf.nn.relu(conv_layer(self.X, w_conv1, stride=4))
    pooling1 = pooling(conv1)
    w_conv2 = tf.Variable(tf.truncated_normal([4, 4, 32, 64], stddev=0.1))
    conv2 = tf.nn.relu(conv_layer(pooling1, w_conv2, stride=2))
    w_conv3 = tf.Variable(tf.truncated_normal([3, 3, 64, 64], stddev=0.1))
    conv3 = tf.nn.relu(conv_layer(conv2, w_conv3))
    pulling_size = int(conv3.shape[1]) * int(conv3.shape[2]) * int(conv3.shape[3])
    conv3 = tf.reshape(conv3, [-1, pulling_size])
    tensor_action, tensor_validation = tf.split(conv3, 2, 1)
    w_action = tf.Variable(tf.truncated_normal([pulling_size // 2, self.OUTPUT_SIZE], stddev=0.1))
    w_validation = tf.Variable(tf.truncated_normal([pulling_size // 2, 1], stddev=0.1))
    fc_action = tf.matmul(tensor_action, w_action)
    fc_validation = tf.matmul(tensor_validation, w_validation)
    self.logits = fc_validation + tf.subtract(fc_action, tf.reduce_mean(fc_action, axis=1, keep_dims=True))
    self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
    self.optimizer = tf.train.AdamOptimizer(learning_rate=self.LEARNING_RATE).minimize(self.cost)
    self.sess = tf.InteractiveSession()
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver(tf.global_variables())
    self.rewards = []

def __init__(self, env_name, args, atari_wrapper=False, test=False, seed=595):
    game = FlappyBird(width=144, height=256, pipe_gap=80)
    self.test = test
    # define reward
    reward_func = {
        "positive": 1,
        "negative": -1.0,
        "tick": 1,
        "loss": -5.0,
        "win": 1.0
    }
    self.p = PLE(game, fps=30, display_screen=False, force_fps=True,
                 reward_values=reward_func, rng=seed)
    self.observation = np.zeros((144, 256, 4, 3))
    # if atari_wrapper:
    #     clip_rewards = not test
    #     self.env = make_wrap_atari(env_name, clip_rewards)
    # else:
    #     self.env = gym.make(env_name)
    self.action_space = self.p.getActionSet()

def __init__(self, model, screen=False, forcefps=True):
    self.model = model
    self.game = FlappyBird(pipe_gap=125)
    self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
    self.env.init()
    self.env.getGameState = self.game.getGameState
    self.es = Deep_Evolution_Strategy(self.model.get_weights(), self.get_reward,
                                      self.POPULATION_SIZE, self.SIGMA, self.LEARNING_RATE)

def main():
    env = FlappyBird()
    penv = PLE(env, fps=30, display_screen=True, force_fps=True)
    # penv.init()
    np.random.seed(0)
    obs_shape = len(penv.getGameState())
    IMG_shape = penv.getScreenGrayscale().shape
    action_dim = len(penv.getActionSet())
    print(obs_shape, action_dim)
    rpm = ReplayMemory(MEMORY_SIZE)
    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.15,  # explore 0.1
        e_greed_decrement=1e-6  # probability of exploring decreases during training
    )
    # load the model if a checkpoint exists
    if os.path.exists('./dqn_model.ckpt'):
        save_path = './dqn_model.ckpt'
        agent.restore(save_path)
        print("Model loaded successfully")
    eval_reward = evaluate(agent, penv)

def train(nb_episodes, agent):
    reward_values = agent.reward_values()
    env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True, rng=None,
              reward_values=reward_values)
    env.init()
    score = 0
    biggest_score = -50000
    avg_score = 0
    episodes = 0
    to_break = False
    while nb_episodes > 0:
        # pick an action
        state = env.game.getGameState()
        state = agent.state_binner(state)
        action = agent.training_policy(state)
        # step the environment
        reward = env.act(env.getActionSet()[action])
        # print("reward=%d" % reward)
        # let the agent observe the current state transition
        newState = env.game.getGameState()
        newState = agent.state_binner(newState)
        agent.observe(state, action, reward, newState, env.game_over())
        agent.frames += 1
        score += reward
        if (agent.frames % 10000) == 0:
            to_break = True
        # reset the environment if the game is over
        if env.game_over():
            avg_score += score
            if score > biggest_score:
                biggest_score = score
            if biggest_score > 450:
                break
            print(biggest_score)
            print(nb_episodes)
            if nb_episodes % 100 == 0:
                print(avg_score / 100)
                if avg_score / 100 >= 5:
                    break
                avg_score = 0
            if to_break:
                break
            # print("score for this episode: %d" % score)
            env.reset_game()
            nb_episodes -= 1
            score = 0
    return biggest_score

def __init__(self, game="pixelcopter", fps=30): os.environ['SDL_VIDEODRIVER'] = 'dummy' self.game_name = game if game == "flappy": engine = FlappyBird() elif game == "pixelcopter": engine = Pixelcopter() else: assert False, "This game is not available" engine.rewards["loss"] = -5 # reward at terminal state self.reward_terminal = -5 self.game = PLE(engine, fps=fps, display_screen=False) self.game.init() self.game.act(0) # Start the game by providing arbitrary key as input self.key_input = self.game.getActionSet() self.reward = 0
def __init__(self, display=False):
    """Initializes a new environment for the FlappyBird game."""
    game = FlappyBird()
    self._game = PLE(game, fps=30, display_screen=display)
    # _display_game flag controls whether or not to render the state that is being
    # provided by the environment.
    self._display_game = display
    if self._display_game:
        self._display = self.show_img()  # sets up a cv2 window where the current state is displayed
        self._display.__next__()  # iterate over the display generator
    # Number of actions the agent can take in the environment.
    self.NUM_ACTIONS = len(self._game.getActionSet())
    self._ACTION_MAP = {}
    for i, action in enumerate(self._game.getActionSet()):
        self._ACTION_MAP[i] = action
    # Number of contiguous images the environment provides as state. At any time the
    # environment provides a stack of the last 4 (including the current) images.
    self._IMAGE_STACK_SIZE = 4
    # Dimension of the (greyscale) image provided as state.
    self._PROCESSED_IMAGE_SIZE = 84
    # Number of times the provided action is executed before returning the next state.
    self._SKIP_FRAMES = 4
    # Used by the RL agent to set up its CNN model.
    self.STATE_SPACE = (self._PROCESSED_IMAGE_SIZE, self._PROCESSED_IMAGE_SIZE, self._IMAGE_STACK_SIZE)
    self._init_states()

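# The wrapper above relies on a show_img() generator that is not included in this snippet.
# A plausible sketch, assuming OpenCV is used for display; the window title, the send()
# protocol, and the name show_img_sketch are assumptions, not the original implementation.
import cv2

def show_img_sketch(window_title="FlappyBird state"):
    """Generator that displays each image sent to it in a cv2 window."""
    cv2.namedWindow(window_title, cv2.WINDOW_NORMAL)
    while True:
        frame = yield  # caller sends the current (stacked) state image
        if frame is not None:
            cv2.imshow(window_title, frame)
            cv2.waitKey(1)
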
def play(self, n=1, file_path=None):
    # use "fancy" for full background, random bird color and random pipe color,
    # use "fixed" (default) for black background and constant bird and pipe colors.
    game = FlappyBird(graphics="fixed")
    # Note: if you want to see your agent act in real time, set force_fps to False.
    # But don't use this setting for learning, just for display purposes.
    env = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False, display_screen=True)
    # Init the environment (settings, display...)
    env.init()
    # Load the model
    model = load_model(file_path)
    # Let's play n games and see if the model is correctly trained
    for _ in range(n):
        env.reset_game()
        while not env.game_over():
            S = self.get_game_data(game)
            Q = model.predict(S, batch_size=1)
            A = np.argmax(Q[0])
            env.act(self.ACTIONS[A])

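# play() above depends on get_game_data(), which is not shown here. A minimal sketch,
# assuming the model was trained directly on the raw game-state vector; the dtype and the
# name get_game_data_sketch are assumptions, not the original code.
import numpy as np

def get_game_data_sketch(game):
    """Return the FlappyBird state dict as a (1, 8) float array for model.predict."""
    state = game.getGameState()
    return np.array([list(state.values())], dtype=np.float32)
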
def __init__(self, playback_mode, mod=None):
    self._playback_mode = playback_mode
    env = FlappyBird(pipe_gap=200)
    self._ple = PLE(env, fps=30, display_screen=DISPLAY)
    self._ple.init()
    self._sess = tf.Session()
    self._agent = DDQNAgent(self._sess, DIM_STATE, DIM_ACTION, LR, TAU, net_name='cnn_bird')
    self._sess.run(tf.global_variables_initializer())
    self._agent.update_target_paras()
    self._saver = tf.train.Saver()
    self._replay_buffer = ReplayBuffer(BUFFER_SIZE)
    self._explorer = Explorer(EPS_BEGIN, EPS_END, EPS_STEPS, playback_mode)
    self.summary = Summary(self._sess, DIR_SUM)
    self.summary.add_variable(tf.Variable(0.), 'reward')
    self.summary.add_variable(tf.Variable(0.), 'loss')
    self.summary.add_variable(tf.Variable(0.), 'maxq')
    self.summary.build()
    self.summary.write_variables(FLAGS)
    self._steps = 0
    if mod and os.path.exists(FLAGS.dir_mod.format(mod)):
        checkpoint = tf.train.get_checkpoint_state(FLAGS.dir_mod.format(mod))
        self._saver.restore(self._sess, save_path=checkpoint.model_checkpoint_path)
        print("Loaded checkpoints {0}".format(checkpoint.model_checkpoint_path))

def _test_ple():
    from ple.games.pong import Pong
    from ple.games.flappybird import FlappyBird
    from ple import PLE
    # os.environ['SDL_VIDEODRIVER'] = 'dummy'
    game = Pong()
    game = FlappyBird()
    ple_game = PLE(game, fps=30, display_screen=True)
    ple_game.init()
    ALLOWED_ACTIONS = ple_game.getActionSet()
    print(ALLOWED_ACTIONS)
    action = 0
    start = time()
    t = 0
    while True:
        ep_reward = 0
        ple_game.reset_game()
        while not ple_game.game_over():
            sleep(0.1)
            t += 1
            if t % 15 == 5:
                action = 0
            else:
                action = 1
            reward = ple_game.act(ALLOWED_ACTIONS[action])
            # print(reward)
            ep_reward += reward
        print(ep_reward, t, t / (time() - start))

def __init__(self, screen=False, forcefps=True):
    self.game = FlappyBird(pipe_gap=125)
    self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
    self.env.init()
    self.env.getGameState = self.game.getGameState
    self.X = tf.placeholder(tf.float32, (None, self.INPUT_SIZE))
    self.REWARDS = tf.placeholder(tf.float32, (None))
    self.ACTIONS = tf.placeholder(tf.int32, (None))
    input_layer = tf.Variable(tf.random_normal([self.INPUT_SIZE, self.LAYER_SIZE]))
    bias = tf.Variable(tf.random_normal([self.LAYER_SIZE]))
    output_layer = tf.Variable(tf.random_normal([self.LAYER_SIZE, self.OUTPUT_SIZE]))
    feed_forward = tf.nn.relu(tf.matmul(self.X, input_layer) + bias)
    self.logits = tf.nn.softmax(tf.matmul(feed_forward, output_layer))
    indexes = tf.range(0, tf.shape(self.logits)[0]) * tf.shape(self.logits)[1] + self.ACTIONS
    responsible_outputs = tf.gather(tf.reshape(self.logits, [-1]), indexes)
    self.cost = -tf.reduce_mean(tf.log(responsible_outputs) * self.REWARDS)
    self.optimizer = tf.train.AdamOptimizer(learning_rate=self.LEARNING_RATE).minimize(self.cost)
    self.sess = tf.InteractiveSession()
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver(tf.global_variables())
    self.rewards = []

def train(nb_frames, agent):
    reward_values = agent.reward_values()
    env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True, rng=None,
              reward_values=reward_values)
    env.init()
    score = 0
    biggest_score = -5
    avg_score = 0
    average = []
    count = []
    nb_episodes = 0
    number_of_frames = 0
    while number_of_frames < nb_frames:
        # pick an action
        state = env.game.getGameState()
        state = agent.state_binner(state)
        action = agent.training_policy(state)
        # step the environment
        reward = env.act(env.getActionSet()[action])
        # print("reward=%d" % reward)
        # let the agent observe the current state transition
        newState = env.game.getGameState()
        newState = agent.state_binner(newState)
        agent.observe(state, action, reward, newState, env.game_over())
        score += reward
        number_of_frames += 1
        # reset the environment if the game is over
        if env.game_over():
            nb_episodes += 1
            avg_score += score
            if score > biggest_score:
                biggest_score = score
            print(biggest_score)
            print(nb_episodes)
            print(number_of_frames)
            if nb_episodes % 100 == 0:
                print(avg_score / 100)
                average.append(avg_score / 100)
                count.append(number_of_frames)
                avg_score = 0
            # print("score for this episode: %d" % score)
            agent.calculate()
            env.reset_game()
            score = 0
    print(biggest_score)
    data = {"Count": count, "Average": average}
    df = pd.DataFrame(data)
    sns.relplot(x="Count", y="Average", ci=None, kind="line", data=df)

def __init__(self, screen=False, forcefps=True):
    self.game = FlappyBird(pipe_gap=125)
    self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
    self.env.init()
    self.env.getGameState = self.game.getGameState
    self.X = tf.placeholder(tf.float32, (None, None, self.INPUT_SIZE))
    self.Y = tf.placeholder(tf.float32, (None, self.OUTPUT_SIZE))
    cell = tf.nn.rnn_cell.LSTMCell(512, state_is_tuple=False)
    self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * 512))
    self.rnn, self.last_state = tf.nn.dynamic_rnn(inputs=self.X, cell=cell,
                                                  dtype=tf.float32,
                                                  initial_state=self.hidden_layer)
    action_layer = tf.Variable(tf.random_normal([layer_size // 2, output_size]))
    validation_layer = tf.Variable(tf.random_normal([layer_size // 2, 1]))
    tensor_action, tensor_validation = tf.split(self.rnn[:, -1, :], 2, 1)
    feed_action = tf.matmul(tensor_action, action_layer)
    feed_validation = tf.matmul(tensor_validation, validation_layer)
    self.logits = feed_validation + tf.subtract(
        feed_action, tf.reduce_mean(feed_action, axis=1, keep_dims=True))
    self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
    self.optimizer = tf.train.AdamOptimizer(learning_rate=self.LEARNING_RATE).minimize(self.cost)
    self.sess = tf.InteractiveSession()
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver(tf.global_variables())
    self.rewards = []

def __init__(self, screen=False, forcefps=True):
    self.game = FlappyBird(pipe_gap=125)
    self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
    self.env.init()
    self.env.getGameState = self.game.getGameState
    self.X = tf.placeholder(tf.float32, (None, None, input_size))
    self.Y = tf.placeholder(tf.float32, (None, output_size))
    cell = tf.nn.rnn_cell.LSTMCell(512, state_is_tuple=False)
    self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * 512))
    self.rnn, self.last_state = tf.nn.dynamic_rnn(inputs=self.X, cell=cell,
                                                  dtype=tf.float32,
                                                  initial_state=self.hidden_layer)
    w = tf.Variable(tf.random_normal([512, output_size]))
    self.logits = tf.matmul(self.rnn[:, -1], w)
    self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
    self.optimizer = tf.train.AdamOptimizer(learning_rate=self.LEARNING_RATE).minimize(self.cost)
    self.sess = tf.InteractiveSession()
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver(tf.global_variables())
    self.rewards = []

def test_agent(policy, file_writer=None, test_games=10, step=0):
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=False)
    env.init()
    test_rewards = []
    for _ in range(test_games):
        env.reset_game()
        no_op(env)
        game_rew = 0
        while not env.game_over():
            state = flappy_game_state(env)
            action = 119 if policy(state) == 1 else None
            for _ in range(2):
                game_rew += env.act(action)
        test_rewards.append(game_rew)
        if file_writer is not None:
            summary = tf.Summary()
            summary.value.add(tag='test_performance', simple_value=game_rew)
            file_writer.add_summary(summary, step)
            file_writer.flush()
    return test_rewards

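# test_agent() above uses the helpers no_op() and flappy_game_state(), which are defined
# elsewhere in that project. Hedged sketches of what they might look like; the step count,
# the flat-vector encoding, and the _sketch names are assumptions, not the original code.
import numpy as np

def no_op_sketch(env, steps=1):
    """Advance a freshly reset game a few frames without flapping (None = no key press)."""
    for _ in range(steps):
        env.act(None)

def flappy_game_state_sketch(env):
    """Return the current game state as a flat numpy vector for the policy."""
    return np.array(list(env.getGameState().values()), dtype=np.float32)
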
def __init__(self, screen=False, forcefps=True):
    self.game = FlappyBird(pipe_gap=125)
    self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
    self.env.init()
    self.env.getGameState = self.game.getGameState
    self.X = tf.placeholder(tf.float32, (None, self.INPUT_SIZE))
    self.Y = tf.placeholder(tf.float32, (None, self.OUTPUT_SIZE))
    input_layer = tf.Variable(tf.random_normal([self.INPUT_SIZE, self.LAYER_SIZE]))
    bias = tf.Variable(tf.random_normal([self.LAYER_SIZE]))
    action_layer = tf.Variable(tf.random_normal([self.LAYER_SIZE // 2, self.OUTPUT_SIZE]))
    validation_layer = tf.Variable(tf.random_normal([self.LAYER_SIZE // 2, 1]))
    feed_forward = tf.nn.relu(tf.matmul(self.X, input_layer) + bias)
    self.tensor_action, self.tensor_validation = tf.split(feed_forward, 2, 1)
    self.feed_action = tf.matmul(self.tensor_action, action_layer)
    self.feed_validation = tf.matmul(self.tensor_validation, validation_layer)
    self.logits = self.feed_validation + tf.subtract(
        self.feed_action, tf.reduce_mean(self.feed_action, axis=1, keep_dims=True))
    self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
    self.optimizer = tf.train.AdamOptimizer(learning_rate=self.LEARNING_RATE).minimize(self.cost)
    self.sess = tf.InteractiveSession()
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver(tf.global_variables())
    self.rewards = []

def main_test():
    final_score = 0
    previous_action = 1
    model = build_neural_network_model()
    game = FlappyBird(width=288, height=512, pipe_gap=100)
    env = PLE(game, fps=30, display_screen=True, state_preprocessor=process_state)
    model = load_model("model.h5")
    env.init()
    passed = 0
    old_y = 0
    for i in range(game_steps):
        if i == game_steps - 1:
            print("Score: {}".format(final_score))
        if env.game_over():
            print("Final Score: {}".format(final_score))
            time.sleep(1)
            final_score = 0
            env.reset_game()
        observation = env.getGameState()
        vector = model.predict(np.matrix(list(observation[0].values())))
        a_star = np.argmax(vector[0])
        print(vector[0][0], vector[0][1], a_star)
        time.sleep(0.05)
        env_reward = env.act(env.getActionSet()[a_star])
        if env_reward == 1:
            final_score += 1

def score(self, training=True, nb_episodes=10):
    reward_values = {
        'positive': 1.0,
        'negative': 0.0,
        'tick': 0.0,
        'loss': 0.0,
        'win': 0.0
    }
    env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True, rng=None,
              reward_values=reward_values)
    env.init()
    total_episodes = nb_episodes
    score = 0
    scores = []
    while nb_episodes > 0:
        # pick an action
        state = env.game.getGameState()
        action = self.policy(state)
        # step the environment
        reward = env.act(env.getActionSet()[action])
        score += reward
        # reset the environment if the game is over
        if env.game_over() or score >= 100:
            scores.append(score)
            env.reset_game()
            nb_episodes -= 1
            score = 0
    avg_score = sum(scores) / float(len(scores))
    print('Games played: {}'.format(total_episodes))
    print('Average score: {}'.format(avg_score))
    if training:
        score_file = '{}/scores.csv'.format(self.name)
        # If the file doesn't exist, add the header
        if not os.path.isfile(score_file):
            with open(score_file, 'a') as f:
                f.write('avg_score,episode_count,num_of_frames,min,max\n')
        # Append scores to the file
        with open(score_file, 'a') as f:
            f.write('{},{},{},{},{}\n'.format(avg_score, self.num_of_episodes,
                                              self.num_of_frames, min(scores), max(scores)))
    else:
        with open('scores.txt', 'a') as f:
            for score in scores:
                f.write('{},{}\n'.format(self.name, score))

def main():
    env = FlappyBird()
    penv = PLE(env, fps=30, display_screen=True, force_fps=True)
    # penv.init()
    np.random.seed(0)
    obs_shape = len(penv.getGameState())
    IMG_shape = penv.getScreenGrayscale().shape
    action_dim = len(penv.getActionSet())
    print(obs_shape, action_dim)
    rpm = ReplayMemory(MEMORY_SIZE)
    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.15,  # explore 0.1
        e_greed_decrement=1e-6  # probability of exploring decreases during training
    )
    # load the model if a checkpoint exists
    if os.path.exists('./dqn_model.ckpt'):
        save_path = './dqn_model.ckpt'
        agent.restore(save_path)
        print("Model loaded successfully")
    while len(rpm) < MEMORY_WARMUP_SIZE:  # warm up replay memory
        run_episode(agent, penv, rpm)
    max_episode = 1000
    # start training
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 50):
            total_reward = run_episode(agent, penv, rpm)
            episode += 1
        eval_reward = evaluate(agent, penv)
        logger.info('episode:{} test_reward:{}'.format(episode, eval_reward))
        # save a checkpoint for this evaluation
        save_path = './model/dqn_model_{}_{}.ckpt'.format(episode, eval_reward)
        agent.save(save_path)
    # training finished, save the final model
    save_path = './dqn_model.ckpt'
    agent.save(save_path)

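# The two PARL training scripts above call run_episode() and evaluate(), which are not
# included here. A hedged sketch of evaluate(), assuming the Agent exposes a predict(obs)
# method that returns a greedy action index; the method name, episode count, and the
# evaluate_sketch name are assumptions, not the original code.
import numpy as np

def evaluate_sketch(agent, penv, eval_episodes=5):
    """Run a few greedy episodes and return the mean episode reward."""
    episode_rewards = []
    for _ in range(eval_episodes):
        penv.reset_game()
        total = 0.0
        while not penv.game_over():
            obs = np.array(list(penv.getGameState().values()), dtype=np.float32)
            action = agent.predict(obs)  # greedy action index (assumed API)
            total += penv.act(penv.getActionSet()[action])
        episode_rewards.append(total)
    return float(np.mean(episode_rewards))
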
def train(nb_frames, agent, a, g, results):
    print("alpha %f" % a)
    print("gamma %f" % g)
    reward_values = agent.reward_values()
    env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True, rng=None,
              reward_values=reward_values)
    env.init()
    score = 0
    biggest_score = -5
    avg_score = 0
    number_of_frames = 0
    nb_episodes = 0
    while number_of_frames < nb_frames:
        # pick an action
        state = env.game.getGameState()
        state = agent.state_binner(state)
        action = agent.training_policy(state)
        # step the environment
        reward = env.act(env.getActionSet()[action])
        # print("reward=%d" % reward)
        # let the agent observe the current state transition
        newState = env.game.getGameState()
        newState = agent.state_binner(newState)
        agent.observe(state, action, reward, newState, env.game_over())
        score += reward
        number_of_frames += 1
        # reset the environment if the game is over
        if env.game_over():
            nb_episodes += 1
            avg_score += score
            if score > biggest_score:
                biggest_score = score
            print(biggest_score)
            print(nb_episodes)
            print(number_of_frames)
            if nb_episodes % 100 == 0:
                print(avg_score / 100)
                results[0].append(avg_score / 100)
                results[1].append(number_of_frames)
                results[2].append(a)
                results[3].append(g)
                avg_score = 0
            # print("score for this episode: %d" % score)
            env.reset_game()
            score = 0
    print(biggest_score)
    return results

def __init__(self):
    self.game = FlappyBird()
    self.p = PLE(self.game, fps=30, display_screen=True)
    # self.actions = self.p.getActionSet()
    # self._action_space = list(range(self.actions[0]))
    # self._action_space.append(self.actions[-1])
    self.action_space = self.p.getActionSet()

def __init__(self, render=False, seed=0, pipe_gap=100):
    self.seed = seed
    print('SEED: {}'.format(self.seed))
    game = FlappyBird(pipe_gap=pipe_gap)
    self.env = PLE(game, fps=30, display_screen=render, rng=seed)
    self.env.init()
    self.full_state = np.zeros((1, 4, 80, 80), dtype=np.uint8)
    self.frame_sleep = 0.02

def train(FRAME_TRAIN=1000005):
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    p.init()
    ob = game.getGameState()
    state = ob
    state = np.reshape(np.asarray(list(state.values())), [1, 8])
    total_reward = 0
    agent = DDQN_Agent.DeepQAgent()
    agent.load("model95000")
    batch_size = 32
    my_timer = time.time()
    prev_frame = 0
    data = []
    for i in range(FRAME_TRAIN):
        if p.game_over():
            data.append(total_reward)
            p.reset_game()
            print("Total reward = {}, Frame = {}, epsilon = {}, frame/second = {}".format(
                total_reward, i, agent.epsilon,
                (i - prev_frame) / (time.time() - my_timer)))
            total_reward = 0
            prev_frame = i
            my_timer = time.time()
        # get action from agent
        action = agent.act(state)
        # take action
        reward = p.act(p.getActionSet()[action])
        # making the reward space less sparse
        if reward < 0:
            reward = -1
        total_reward += reward
        next_state = np.asarray(list(game.getGameState().values()))
        next_state = np.reshape(next_state, [1, 8])
        state = next_state
        # time.sleep(0.3)
        # plot score
        if i % 1000 == 0:
            plot(data)

def __init__(self, display_screen):
    self.width = IMAGE_WIDTH
    self.height = IMAGE_HEIGHT
    self.count = 0
    self.p = PLE(FlappyBird(), fps=30, display_screen=display_screen)
    self.p.init()
    self._update_state()
    self.score = 0

def __init__(self):
    self.game = FlappyBird(pipe_gap=110)
    self.env = PLE(self.game, fps=30, display_screen=False)
    self.env.init()
    self.env.getGameState = self.game.getGameState  # maybe not necessary
    # by convention we want to use (0, 1)
    # but the game uses (None, 119)
    self.action_map = self.env.getActionSet()  # [None, 119]

# You're not allowed to change this file
from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy

# Use "fancy" for full background, random bird color and random pipe color;
# use "fixed" (default) for black background and constant bird and pipe colors.
game = FlappyBird(graphics="fixed")
# Note: if you want to see your agent act in real time, set force_fps to False.
# But don't use this setting for learning, just for display purposes.
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False, display_screen=True)
p.init()
reward = 0.0

nb_games = 100
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()
    while not p.game_over():
        state = game.getGameState()
        screen = p.getScreenRGB()
        action = FlappyPolicy(state, screen)  # Your job is to define this function.
        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)

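# The evaluation script above expects FlappyPolicy(state, screen) to be supplied by
# FlappyAgent.py. As a purely illustrative, hand-coded baseline (not the trained agent),
# one could flap whenever the bird sits below the centre of the upcoming gap; remember
# that y grows downward in FlappyBird, so player_y > gap_center means "too low".
def flappy_policy_baseline(state, screen):
    """Return 119 (flap) or None (do nothing) based on a simple height heuristic."""
    gap_center = (state["next_pipe_top_y"] + state["next_pipe_bottom_y"]) / 2.0
    return 119 if state["player_y"] > gap_center else None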