def train_agent(args):
    # Use the GPU if one is available
    device = torch.device(
        "cuda" if torch.cuda.is_available() and args.ngpu > 0 else "cpu")

    # Build env (first level, right only)
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)

    # Set up networks
    init_screen = get_screen(env, device)
    _, _, screen_height, screen_width = init_screen.shape

    # Get the number of actions from the gym action space
    args.n_actions = env.action_space.n

    policy_net = DQN(screen_height, screen_width, args.n_actions).to(device)
    target_net = DQN(screen_height, screen_width, args.n_actions).to(device)
    if args.targetNet:
        target_net.load_state_dict(
            torch.load(args.targetNet, map_location=device))
    if args.policyNet:
        policy_net.load_state_dict(
            torch.load(args.policyNet, map_location=device))
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(10000)

    args.steps_done = 0

    num_episodes = 1
    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
        last_screen = get_screen(env, device)
        current_screen = get_screen(env, device)
        state = current_screen - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(state, policy_net, args, device)
            _, reward, done, _ = env.step(action.item())
            reward = torch.tensor([reward], device=device)

            # Observe the new state
            last_screen = current_screen
            current_screen = get_screen(env, device)
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization on the policy network
            optimize_model(optimizer, memory, policy_net, target_net, args,
                           device)
            if done:
                episode_durations.append(t + 1)
                break

        # Update the target network, copying all weights and biases in DQN
        if i_episode % args.target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())
            torch.save(policy_net.state_dict(), args.output_policyNet)
            torch.save(target_net.state_dict(), args.output_targetNet)
        if i_episode % 10 == 0:
            print(f'{i_episode+1}/{num_episodes}: Completed Episode.')

    print('Complete')
    env.close()
    torch.save(policy_net.state_dict(), args.output_policyNet)
    torch.save(target_net.state_dict(), args.output_targetNet)
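# The snippet above relies on a `get_screen` helper that is not shown. Below
# is a minimal sketch consistent with the unpacking
# `_, _, screen_height, screen_width = init_screen.shape`; the 84-pixel
# resize and the /255 normalization are assumptions, not the original
# author's values.
import numpy as np
import torch
import torchvision.transforms as T

resize = T.Compose([T.ToPILImage(),
                    T.Resize(84),  # assumed target size
                    T.ToTensor()])


def get_screen(env, device):
    # nes-py exposes the current frame as an RGB array
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))  # HWC -> CHW
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255.0
    screen = torch.from_numpy(screen)
    # Resize and add a batch dimension: (1, C, H, W)
    return resize(screen).unsqueeze(0).to(device)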
def process_single_session(session_path, output_path=None, render=False,
                           length=None):
    with open(session_path) as json_file:
        data = json.load(json_file)

    if output_path is not None:
        output_path = Path(output_path)
        output_path.mkdir(exist_ok=True, parents=True)
        shutil.copyfile(session_path, output_path.joinpath("data.json"))
        output_path.joinpath("frames").mkdir(exist_ok=True)

    first_world = "SuperMarioBros-1-1-v0"
    env = gym_super_mario_bros.make(first_world)
    next_state = env.reset()

    world = 1
    stage = 1
    stage_num = 0
    frame_number = 0
    steps = 0
    for i, action in enumerate(data["obs"]):
        if length is not None and i >= length:
            break
        if render:
            env.render()
        next_state, _, done, info = env.step(action)
        steps += 1
        if output_path is not None:
            cvt_state = cv2.cvtColor(next_state, cv2.COLOR_BGR2RGB)
            impath = str(
                output_path.joinpath(f"frames/frame_{frame_number}.png"))
            cv2.imwrite(impath, cvt_state)
        finish = False
        frame_number += 1
        if info["flag_get"]:
            finish = True
        if done:
            done = False
        if finish or steps >= 16000:
            stage_num += 1
            world, stage, new_world = make_next_stage(world, stage, stage_num)
            env.close()
            env = gym_super_mario_bros.make(new_world)
            finish = False
            steps = 0
            next_state = env.reset()
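# `make_next_stage` is used by several replay scripts in this collection but
# never shown. A plausible sketch, assuming plain sequential progression
# (1-1 through 8-4, wrapping back to 1-1); the real helper may instead index
# a fixed stage list (a `_STAGE_ORDER` appears in a later snippet) via
# `stage_num`, which this sketch ignores.
def make_next_stage(world, stage, stage_num):
    stage += 1
    if stage > 4:
        stage = 1
        world += 1
    if world > 8:
        world = 1
    new_world = "SuperMarioBros-{}-{}-v0".format(world, stage)
    return world, stage, new_world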
def run(training_mode, pretrained):
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = make_env(env)  # Wraps the environment so that frames are grayscale
    observation_space = env.observation_space.shape
    action_space = env.action_space.n
    agent = DQNAgent(state_space=observation_space,
                     action_space=action_space,
                     max_memory_size=30000,
                     batch_size=64,
                     gamma=0.90,
                     lr=0.00025,
                     dropout=0.,
                     exploration_max=1.0,
                     exploration_min=0.02,
                     exploration_decay=0.99,
                     double_dq=True,
                     pretrained=pretrained)

    num_episodes = 10001
    env.reset()
    total_rewards = []

    with open('training_log.txt', 'w') as log:
        log.write('ep_num\tsteps\taction\treward\tterminal\ttotal_reward\n')

    for ep_num in tqdm(range(num_episodes)):
        state = env.reset()
        state = torch.Tensor([state])
        total_reward = 0
        steps = 0
        while True:
            if not training_mode:
                show_state(env, ep_num,
                           f"step: {steps} reward: {int(total_reward)}")
            action = agent.act(state)
            steps += 1

            state_next, reward, terminal, info = env.step(int(action[0]))
            total_reward += reward
            state_next = torch.Tensor([state_next])
            reward = torch.tensor([reward]).unsqueeze(0)
            terminal = torch.tensor([int(terminal)]).unsqueeze(0)

            if training_mode:
                agent.remember(state, action, reward, state_next, terminal)
                agent.experience_replay()

            state = state_next

            with open('training_log.txt', 'a') as log:
                log.write(f'{ep_num}\t{steps}\t{action.item()}\t'
                          f'{reward.item()}\t{terminal.item()}\t{total_reward}\n')

            if terminal:
                break

        total_rewards.append(total_reward)
        print("Total reward after episode {} is {}".format(
            ep_num + 1, total_rewards[-1]))

    if training_mode:
        with open("ending_position.pkl", "wb") as f:
            pickle.dump(agent.ending_position, f)
        with open("num_in_queue.pkl", "wb") as f:
            pickle.dump(agent.num_in_queue, f)
        with open("total_rewards.pkl", "wb") as f:
            pickle.dump(total_rewards, f)
        if agent.double_dq:
            torch.save(agent.local_net.state_dict(), "dq1.pt")
            torch.save(agent.target_net.state_dict(), "dq2.pt")
        else:
            torch.save(agent.dqn.state_dict(), "dq.pt")
        torch.save(agent.STATE_MEM, "STATE_MEM.pt")
        torch.save(agent.ACTION_MEM, "ACTION_MEM.pt")
        torch.save(agent.REWARD_MEM, "REWARD_MEM.pt")
        torch.save(agent.STATE2_MEM, "STATE2_MEM.pt")
        torch.save(agent.DONE_MEM, "DONE_MEM.pt")

    env.close()

    if num_episodes > 500:
        plt.title("Episodes trained vs. Average Rewards (per 500 eps)")
        plt.plot([0 for _ in range(500)] +
                 np.convolve(total_rewards, np.ones((500,)) / 500,
                             mode="valid").tolist())
        plt.show()
def run(run_name, existing_model):
    # Create log dir
    log_dir = "./monitor_logs/"
    os.makedirs(log_dir, exist_ok=True)

    print("Setting up environment...")
    env = gym_super_mario_bros.make('SuperMarioBrosRandomStages-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    env = EpisodicLifeEnv(env)

    # Preprocessing
    env = WarpFrame(env)
    env = FrameStack(env, n_frames=hp.FRAME_STACK)

    # Evaluate every kth frame and repeat action
    env = MaxAndSkipEnv(env, skip=hp.FRAME_SKIP)

    # Logs will be saved in log_dir/monitor.csv
    env = Monitor(env, log_dir)

    # Save a checkpoint every 25000 steps
    checkpoint_callback = CheckpointCallback(save_freq=25000,
                                             save_path='./models/',
                                             name_prefix=run_name)
    eval_callback = EvalCallback(env,
                                 best_model_save_path='./models/',
                                 log_path='./models/',
                                 eval_freq=250000,
                                 deterministic=True,
                                 render=False)

    print("Compiling model...")
    if existing_model:
        try:
            model = DQN.load(existing_model, env,
                             tensorboard_log="./mario_tensorboard/")
        except Exception:
            print(f"{existing_model} does not exist!")
            exit(0)
    else:
        model = DQN(
            LnCnnPolicy,
            env,
            # Optimizable; higher batch sizes are fine according to
            # https://arxiv.org/pdf/1803.02811.pdf
            batch_size=hp.BATCH_SIZE,
            verbose=1,
            learning_starts=10000,
            learning_rate=hp.LEARNING_RATE,
            exploration_fraction=hp.EXPLORATION_FRACT,
            exploration_initial_eps=1.0,
            exploration_final_eps=0.1,
            prioritized_replay=True,
            prioritized_replay_alpha=hp.P_REPLAY_ALPHA,
            train_freq=hp.TRAINING_FREQ,
            target_network_update_freq=hp.TARGET_UPDATE_FREQ,
            tensorboard_log="./mario_tensorboard/")

    print("Training starting...")
    with ProgressBarManager(hp.TIME_STEPS) as progress_callback:
        model.learn(
            total_timesteps=hp.TIME_STEPS,
            log_interval=1,
            callback=[progress_callback, checkpoint_callback, eval_callback],
            tb_log_name=run_name)

    print("Done! Saving model...")
    model.save("models/{}_final".format(run_name))
        self.g_opt.zero_grad()
        loss.backward()
        # Copy local gradients onto the shared global network, step the
        # global optimizer, then sync the local network back
        for lp, gp in zip(self.model.l_net.parameters(),
                          self.g_net.parameters()):
            gp._grad = lp.grad.clone().cpu()
        self.g_opt.step()
        self.model.l_net.load_state_dict(self.g_net.state_dict())


if __name__ == '__main__':
    writer = SummaryWriter('runs/Vanilla')

    ####### Env Settings ##########
    env_id = 'SuperMarioBros-v2'
    env = gym_super_mario_bros.make(env_id)
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    s_dim = 4  # transition
    a_dim = env.action_space.n
    env.close()
    ###############################

    ####### MultiProcessing Settings ##########
    num_worker = 1
    workers = []
    parent_conns = []
    queue = Queue()
    ###########################################

    ##### Etc Settings ########################
    max_episode = 1000000
        running_add = reward[t] + args.gamma * running_add * (1 - done[t])
        discounted_return[t] = running_add

    # For the actor: advantage = discounted return minus the critic's value
    adv = discounted_return - value
    return discounted_return, adv


if __name__ == '__main__':
    args = parser.parse_args()

    # get environment information
    env = BinarySpaceToDiscreteSpaceEnv(
        gym_super_mario_bros.make(args.env_id), SIMPLE_MOVEMENT)
    input_size = env.observation_space.shape
    output_size = env.action_space.n
    env.close()

    # setup
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    tag = ["test", "train"][int(args.training)]
    log_dir = os.path.join(args.logdir, '{}_{}_{}_{}'.format(
        args.env_id, args.name, current_time, tag))
    writer = SummaryWriter(log_dir)
    model_path = 'saved/{}_{}_{}.model'.format(args.env_id, args.name,
                                               current_time)
    load_model_path = 'saved/{}'.format(args.prev_model)
def create_mario_env(env_id):
    env = gym_super_mario_bros.make(env_id)
    env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)
    env = wrap_mario(env)
    return env
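# `wrap_mario` is a project-local preprocessing stack that is not included in
# these snippets. A minimal sketch of the usual Atari-style treatment
# (grayscale + downscale to 84x84) follows; the real wrapper likely also adds
# frame skip, frame stacking, and reward shaping.
import gym
import cv2
import numpy as np


class ProcessFrame84(gym.ObservationWrapper):
    """Grayscale the raw 240x256 RGB frame and resize it to 84x84."""

    def __init__(self, env):
        super().__init__(env)
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)

    def observation(self, obs):
        gray = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
        resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
        return resized[:, :, None]


def wrap_mario(env):
    return ProcessFrame84(env)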
def main():
    movement = SIMPLE_MOVEMENT
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    movement.append(['B'])
    movement.append(['down'])
    movement.append(['up'])
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    # channels acts as the number of frames in the history.
    # If resize_height and final_height differ, assert
    # final_height < resize_height; the image will be cropped.
    channels = 4
    width = 84
    resize_height = 110
    final_height = 84

    epsilon = 0.0
    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")
    model = simple_net(channels, len(movement), device).to(device)
    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))

    max_steps = 5000
    num_eps = 1

    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width], final_height)
        state = torch.cat((state, state, state, state))
        action = 0
        episode_reward = 0
        for step in range(max_steps):
            if step % 3 == 0:
                if random.random() < epsilon:
                    action = random.randint(0, len(movement) - 1)
                else:
                    q_val, action, q_vals = maxQ(state, model, device)
            next_state, reward, done, info = env.step(int(action))
            if reward > 0:
                reward = 1
            else:
                reward = -1
            episode_reward += reward
            print(reward)
            next_state = preprocess(next_state, [resize_height, width],
                                    final_height)
            next_state = torch.cat((state[1:, :, :], next_state))
            state = next_state
            env.render()
            time.sleep(0.03)
            if done:
                break
    env.close()
def main():
    movement = SIMPLE_MOVEMENT
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    # movement.append(['B'])
    # movement.append(['down'])
    # movement.append(['up'])
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    # channels acts as the number of color channels per frame; frames is the
    # history length. If resize_height and final_height differ, assert
    # final_height < resize_height; the image will be cropped.
    channels = 3
    frames = 4
    width = 128
    resize_height = 180
    final_height = 128
    bottom_chop = 15
    size = [channels * frames, final_height, width]

    batch_size = 16
    replay_capacity = 100000
    replay_dir = '/home-local/bayrakrg/mario_replay/'
    start_epsilon = 1.0
    stop_epsilon = 0.01
    epsilon_decay = 0.00005
    gamma = 0.75

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")
    model = simple_net(channels, len(movement), device).to(device)
    target_model = simple_net(channels, len(movement), device).to(device)
    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))
    target_model.load_state_dict(torch.load(model_file))

    lr = 0.0001
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_reward_file = 'total_reward.txt'
    with open(total_reward_file, 'w') as f:
        f.write('Reward\tSteps\n')

    max_steps = 500
    num_eps = 10000
    data = dataset(replay_capacity, batch_size, replay_dir, 1, size)
    tau = 0
    max_tau = 10000
    decay_step = 0

    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width, 3], final_height,
                           bottom_chop)
        state = torch.cat((state, state, state, state))
        action = 0
        episode_reward = 0
        for step in range(max_steps):
            tau += 1
            decay_step += 1
            epsilon = stop_epsilon + (start_epsilon - stop_epsilon) * np.exp(
                -epsilon_decay * decay_step)
            if random.random() < epsilon:
                action = random.randint(0, len(movement) - 1)
            else:
                q_val, action, q_vals = maxQ(state, model, device)
            next_state, reward, done, info = env.step(int(action))
            if step == max_steps - 1:
                reward -= 10
            if reward > 0:
                reward = 1
            else:
                reward = -1
            episode_reward += reward
            next_state = preprocess(next_state, [resize_height, width, 3],
                                    final_height, bottom_chop)
            next_state = torch.cat((state[3:, :, :], next_state))
            trans = transition(state, action, reward, next_state, done)
            data.add(trans)
            train(model, device, optimizer,
                  data.get_batch(model, target_model, device, gamma))
            state = next_state
            env.render()
            if tau > max_tau:
                target_model.load_state_dict(model.state_dict())
                tau = 0
            if done:
                break
        with open(total_reward_file, 'a') as f:
            f.write('{}\t{}\n'.format(episode_reward, step))
        if episode % 5 == 0:
            with open(model_file, 'wb') as f:
                torch.save(model.state_dict(), f)
    env.close()
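# The two `main` functions above call a `preprocess` helper that is not
# shown. This hypothetical version is consistent with both call sites: a
# 2-element resize_shape implies grayscale output (4 stacked frames -> 4
# channels), a 3-element one keeps color (4 frames x 3 channels -> 12);
# cropping keeps the bottom `final_height` rows after an optional
# `bottom_chop`. The crop direction and normalization are assumptions.
import cv2
import numpy as np
import torch


def preprocess(frame, resize_shape, final_height, bottom_chop=0):
    height, width = resize_shape[0], resize_shape[1]
    frame = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)
    if len(resize_shape) == 2:  # grayscale variant
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)[:, :, None]
    if bottom_chop:
        frame = frame[:-bottom_chop, :, :]
    frame = frame[-final_height:, :, :]  # crop, keeping the bottom rows
    frame = frame.astype(np.float32) / 255.0
    return torch.from_numpy(frame).permute(2, 0, 1)  # HWC -> CHW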
Environment
'''''''''''''''

Initialize Environment
------------------------

In Mario, the environment consists of tubes, mushrooms and other
components. When Mario makes an action, the environment responds with the
changed (next) state, reward and other info.
"""

# Initialize Super Mario environment
env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")

# Limit the action-space to
#   0. walk right
#   1. jump right
env = JoypadSpace(env, [["right"], ["right", "A"]])

env.reset()
next_state, reward, done, info = env.step(action=0)
print(f"{next_state.shape},\n {reward},\n {done},\n {info}")

"""Preprocess Environment
------------------------

Environment data is returned to the agent in ``next_state``. As you saw
above, each state is represented by a ``[3, 240, 256]`` size array. Often
that is more information than our agent needs; for instance,
def main():
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    agent = DQNAgent(action_size=7)

    scores, episodes, global_step = [], [], 0
    global_start = datetime.now()
    local_start = datetime.now()

    print()
    print("=" * 100)
    print("RL environment initialized")
    print("=" * 100)
    print()

    gc.collect()

    for e in range(1000):
        e = e + 1
        done = False
        dead = False

        step, score, start_life = 0, 0, 5
        observe = env.reset()

        # Do nothing for a random number of initial steps
        for _ in range(random.randint(1, agent.no_op_steps)):
            observe, _, _, _ = env.step(1)

        state = agent.pre_processing(observe)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (1, 240, 256, 4))

        count_epsilon = 0
        count_greedy = 0

        coinStatus = 0
        marioStatus = "small"
        flagStatus = False
        softReward = 0
        lifeStatus = 2

        while not done:
            # if agent.render:
            #     env.render()
            global_step += 1
            step += 1

            # Select an action based on the previous four states
            action, res = agent.get_action(history)
            if res:
                count_epsilon += 1
            else:
                count_greedy += 1

            # Advance the environment one timestep with the chosen action
            observe, reward, done, info = env.step(action)

            # Preprocess the state at each timestep
            next_state = agent.pre_processing(observe)
            next_state = np.reshape([next_state], (1, 240, 256, 1))
            next_history = np.append(next_state, history[:, :, :, :3], axis=3)

            agent.avg_q_max += np.amax(
                agent.model.predict(np.float32(history / 255.))[0])

            if start_life > info['life']:
                dead = True
                start_life = info['life']

            # reward = np.clip(reward, -1., 1.)
            real_reward = reward

            # Disabled reward-shaping experiment:
            # reward = reward
            # if coinStatus != info["coins"]:
            #     coinStatus = info["coins"]
            #     reward = reward + 10
            # if marioStatus != info["status"]:
            #     marioStatus = info["status"]
            #     reward = reward + 200
            # if flagStatus != info["flag_get"]:
            #     flagStatus = info["flag_get"]
            #     reward = reward + 200
            # if lifeStatus != info["life"]:
            #     lifeStatus = info["life"]
            #     reward = reward - 20
            #
            # if info["x_pos"] < 10:
            #     info["x_pos"] = 10
            # if info["time"] < 10:
            #     info["time"] = 10
            #
            # reward = reward + math.log(
            #     (info["x_pos"] / info["time"]) + info["x_pos"])

            # Store the sample <s, a, r, s'> in replay memory, then train
            agent.append_sample(history, action, reward, next_history, dead)

            if len(agent.memory) >= agent.train_start:
                agent.train_model()

            # Periodically update the target model with the model's weights
            if global_step % agent.update_target_rate == 0:
                agent.update_target_model()

            # score += reward
            score += real_reward

            if dead:
                dead = False
            else:
                history = next_history

            if global_step == 0:
                pass
            elif global_step % 1000 == 0:
                print("local step : {}, time : {} sec, epsilon : {}".format(
                    global_step, (datetime.now() - local_start).seconds,
                    agent.epsilon))
                local_start = datetime.now()

            if done:
                ep_result = "episode : {}, score : {}, memory : {}, step : {}".format(
                    e, score, len(agent.memory), global_step)
                print(ep_result)
                print("epsilon : {}, greedy : {}".format(
                    count_epsilon, count_greedy))
                print()
                print("time elapsed : {} sec".format(
                    (datetime.now() - global_start).seconds))
                global_start = datetime.now()

                agent.epsilon = agent.epsilon - agent.epsilon_decay_step
                print("epsilon decay to {}!".format(agent.epsilon))
                print()

                slack_msg(ep_result)

                # if score > 2000 and score <= 3000:
                #     agent.epsilon = 0.075
                # elif score > 3000 and score <= 5000:
                #     agent.epsilon = 0.05
                # elif score > 5000 and score <= 10000:
                #     agent.epsilon = 0.005

                agent.avg_q_max, agent.avg_loss, global_step = 0, 0, 0

        # Save the model every couple of episodes
        if e == 0:
            pass
        elif e % 2 == 0:
            agent.model.save_weights("./dqn.h5")
            # dump(agent.memory, "memory.joblib")
            print("model saved!")
            print()

        gc.collect()
    def run(self):
        global episode
        env = gym_super_mario_bros.make('SuperMarioBros-v0')
        env = JoypadSpace(env, SIMPLE_MOVEMENT)
        # env = gym.make(env_name)
        # env.render()
        step = 0
        gc.collect()

        while episode < EPISODES:
            done = False
            dead = False

            score, start_life = 0, 5
            observe = env.reset()
            next_observe = observe

            # Do nothing for a random number of initial frames (up to 30)
            for _ in range(random.randint(1, 30)):
                observe = next_observe
                next_observe, _, _, _ = env.step(1)

            state = pre_processing(next_observe, observe)
            history = np.stack((state, state, state, state), axis=2)
            history = np.reshape([history], (1, 240, 256, 4))

            coinStatus = 0
            marioStatus = "small"
            flagStatus = False
            softReward = 0
            lifeStatus = 2

            while not done:
                step += 1
                self.t += 1
                observe = next_observe
                action, policy = self.get_action(history)

                # # 1: stay, 2: left, 3: right
                # if action == 0:
                #     real_action = 1
                # elif action == 1:
                #     real_action = 2
                # else:
                #     real_action = 3
                #
                # # Press fire to restart after dying
                # if dead:
                #     action = 0
                #     real_action = 1
                #     dead = False

                # Take one step with the selected action
                next_observe, reward, done, info = env.step(action)

                # Preprocess the state at each timestep
                next_state = pre_processing(next_observe, observe)
                next_state = np.reshape([next_state], (1, 240, 256, 1))
                next_history = np.append(next_state, history[:, :, :, :3],
                                         axis=3)

                # Maximum of the policy output
                self.avg_p_max += np.amax(
                    self.actor.predict(np.float32(history / 255.)))

                real_reward = reward

                if start_life > info['life']:
                    dead = True
                    start_life = info['life']

                # Disabled reward-shaping experiment:
                # if coinStatus != info["coins"]:
                #     coinStatus = info["coins"]
                #     reward = reward + 10
                # if marioStatus != info["status"]:
                #     marioStatus = info["status"]
                #     reward = reward + 200
                # if flagStatus != info["flag_get"]:
                #     flagStatus = info["flag_get"]
                #     reward = reward + 200
                # if lifeStatus != info["life"]:
                #     lifeStatus = info["life"]
                #     reward = reward - 200
                #
                # if info["x_pos"] < 10:
                #     info["x_pos"] = 10
                # if info["time"] < 10:
                #     info["time"] = 10
                #
                # reward = reward + ((info["x_pos"] / info["time"])
                #                    + info["x_pos"]) / 100

                score += real_reward
                # reward = np.clip(reward, -1., 1.)

                # Store the sample
                self.append_sample(history, action, reward)
                gc.collect()

                if dead:
                    # Rebuild the history from the current frame after dying
                    history = np.stack(
                        (next_state, next_state, next_state, next_state),
                        axis=2)
                    history = np.reshape([history], (1, 240, 256, 4))
                    dead = False
                else:
                    history = next_history

                # Train when the episode ends or the maximum number of
                # timesteps is reached
                if self.t >= self.t_max or done:
                    self.train_model(done)
                    self.update_local_model()
                    self.t = 0

                if done:
                    # Record training information for each episode
                    episode += 1
                    ep_res = "episode: {}, score: {}, step: {}".format(
                        episode, score, step)
                    print(ep_res)
                    if episode % 20 == 0:
                        slack_msg(ep_res)

                    # stats = [score, self.avg_p_max / float(step), step]
                    # for i in range(len(stats)):
                    #     self.sess.run(self.update_ops[i], feed_dict={
                    #         self.summary_placeholders[i]: float(stats[i])
                    #     })
                    # summary_str = self.sess.run(self.summary_op)
                    # self.summary_writer.add_summary(summary_str, episode + 1)

                    self.avg_p_max = 0
                    self.avg_loss = 0
                    step = 0
def replay_game_from_actions(action_filepath, video_filepath,
                             video_info_filepath, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with open(video_info_filepath) as f:
        video_info = json.load(f)

    with open(action_filepath) as json_file:
        data = json.load(json_file)

    cap = None
    if os.path.exists(video_filepath):
        cap = cv2.VideoCapture(video_filepath)

    first_world = "SuperMarioBros-1-1-v0"
    env = gym_super_mario_bros.make(first_world)
    next_state = env.reset()

    world = 1
    stage = 1
    stage_num = 0

    video_frame_length = 1 / 30
    video_start = video_info["start_time"]
    video_stop = video_info["stop_time"]
    game_start = data["start_time"]
    game_stop = data["stop_time"]

    print("Frame: %s" % str(video_frame_length))
    print("VT: %s" % str(video_stop - video_start))
    print("GT: %s" % str(game_stop - game_start))
    print("VS: %s" % str(video_start))
    print("GS: %s" % str(game_start))

    # Skip video frames recorded before the game session started
    skipped_frames = 0
    while video_start < game_start:
        ret, frame = cap.read()
        video_start += video_frame_length
        skipped_frames += 1

    print("Skipped: %s" % str(skipped_frames))
    print("VS: %s" % str(video_start))
    print("GS: %s" % str(game_start))

    states = []
    is_first = True
    finish = False
    frame_number = 1
    steps = 0
    counter = 1
    for action in data["obs"]:
        next_state, reward, done, info = env.step(action)
        steps += 1
        # Save a paired face/game frame every 30th counter tick, stepping
        # the video at half the game's frame rate
        if is_first:
            is_first = False
        else:
            if cap is not None:
                ret, frame = cap.read()
                if counter % 30 == 0:
                    cv2.imwrite(
                        os.path.join(output_dir,
                                     "face_%s.png" % frame_number), frame)
            if counter % 30 == 0 or counter % 30 == 1:
                cvt_state = cv2.cvtColor(next_state, cv2.COLOR_BGR2RGB)
                cv2.imwrite(
                    os.path.join(output_dir, "game_%s.png" % frame_number),
                    cvt_state)
            is_first = True
            frame_number += 1
        counter += 1

        if info["flag_get"]:
            finish = True
        if done:
            done = False
            end = time.time()
        if finish or steps >= 16000:
            stage_num += 1
            world, stage, new_world = make_next_stage(world, stage, stage_num)
            env.close()
            env = gym_super_mario_bros.make(new_world)
            finish = False
            steps = 0
            next_state = env.reset()
def make_env(world, level, v="v1"): env_0 = gym_super_mario_bros.make("SuperMarioBros-" + str(world) + "-" + str(level) + "-" + v) #Same as gym.make return BinarySpaceToDiscreteSpaceEnv(env_0, Moves)
def _make():
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v1')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    env = wrap_mario(env)
    return env
    def run(self):
        global episode
        env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
        env = BinarySpaceToDiscreteSpaceEnv(env, REALLY_COMPLEX_MOVEMENT)

        step = 0

        while episode < EPISODES:
            done = False
            max_x = 40
            no_progress = 0
            score = 0
            state = env.reset()

            '''
            # Making initial history with random actions
            # Seems to be not needed with an LSTM
            for _ in range(5):
                next_state = state
                state, _, _, _ = env.step(random.randint(0, 12))
            '''

            state = crop_img(state)
            state = np.reshape([state], (1, 88, 128, 3))

            while not done:
                # Rendering code; seems to cause an error on macOS
                # if self.thread_count == 1:
                #     env.render()

                step += 1
                self.t += 1

                action, policy = self.get_action(state)

                # Take 6 steps with the selected action, mimicking frame skip
                for _ in range(6):
                    next_state, reward, done, info = env.step(action)
                    score += reward
                    if done:
                        break

                # Kill Mario if he makes no progress for 10 seconds
                x_now = info.get('x_pos')
                # Handle the x_pos = 65535 overflow glitch
                if x_now == 65535:
                    x_now = max_x
                if max_x < x_now:
                    max_x = x_now
                    no_progress = 0
                else:
                    no_progress += 1
                if no_progress == 200:
                    done = True
                    reward -= 1
                    print("#", self.thread_count, " STUCK")

                # Preprocess each state
                # next_state = crop_img(next_state)
                next_state = np.reshape([crop_img(next_state)],
                                        (1, 88, 128, 3))

                # Average policy max value
                self.avg_p_max += np.amax(self.actor.predict(
                    np.float32(state / 255.)))

                # Append the sample, then move to the next state
                self.append_sample(state, action, reward)
                state = next_state

                if self.t >= self.t_max or done:
                    # if done:
                    self.train_model(done)
                    self.update_local_model()
                    # self.reset_lstm_state()
                    self.t = 0

                if done:
                    # Record training information
                    episode += 1
                    print("#", self.thread_count, " episode:", episode,
                          " score:", format(score, '.2f'), " step:", step,
                          "max_x :", max_x)

                    stats = [score, self.avg_p_max / float(step), step]
                    for i in range(len(stats)):
                        self.sess.run(self.update_ops[i], feed_dict={
                            self.summary_placeholders[i]: float(stats[i])
                        })
                    summary_str = self.sess.run(self.summary_op)
                    self.summary_writer.add_summary(summary_str, episode + 1)

                    self.avg_p_max = 0
                    self.avg_loss = 0
                    step = 0
def replay_game_from_actions(action_filepath, video_filepath,
                             video_info_filepath, gap_path, output_dir):
    stage_order_len = len(_STAGE_ORDER)

    with open(video_info_filepath) as json_file:
        video_info = json.load(json_file)

    cap = cv2.VideoCapture(video_filepath)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with open(action_filepath) as json_file:
        data = json.load(json_file)

    first_world = 'SuperMarioBros-1-1-v0'
    env = gym_super_mario_bros.make(first_world)
    next_state = env.reset()

    start = time.time()
    world = 1
    stage = 1
    stage_num = 0

    video_frame_length = 1 / 30
    video_start = video_info['start_time']
    video_stop = video_info['stop_time']
    game_start = data['start_time']
    game_stop = data['stop_time']
    video_time = video_stop - video_start
    game_time = game_stop - game_start

    print('Frame: ' + str(video_frame_length))
    print('VT:' + str(video_time))
    print('GT:' + str(game_time))
    print('VS:' + str(video_start))
    print('GS:' + str(game_start))

    # Skip video frames recorded before the game session started
    skipped_frames = 0
    while video_start < game_start:
        ret, frame = cap.read()
        video_start += video_frame_length
        skipped_frames += 1

    print('Skipped: ' + str(skipped_frames))
    print('VS:' + str(video_start))
    print('GS:' + str(game_start))

    is_first = True
    no = 0
    finish = False
    steps = 0
    total_steps = 0
    gap_indices = []
    counter = 1
    for action in data['obs']:
        env.render()
        next_state, reward, done, info = env.step(action)
        steps += 1
        total_steps += 1

        # Capture 1 game-frame for each video-frame by skipping every 2nd frame
        cvt_state = cv2.cvtColor(next_state, cv2.COLOR_BGR2RGB)
        if is_first:
            is_first = False
        else:
            if counter % 30 == 0 or counter % 30 == 1:
                cv2.imwrite(
                    os.path.join(output_dir, "game_" + str(no) + ".png"),
                    cvt_state)
            is_first = True
            no += 1
        counter += 1

        if info['flag_get']:
            finish = True
        if done:
            done = False
            end = time.time()
        if finish or steps >= 16000:
            stage_num += 1
            world, stage, new_world = make_next_stage(world, stage, stage_num)
            env.close()
            env = gym_super_mario_bros.make(new_world)
            finish = False
            steps = 0
            gap_indices.append(total_steps)
            next_state = env.reset()

    # Extract video
    n_gaps = len(gap_indices)
    n_actions = len(data['obs'])
    missing = 126000 - n_actions
    video_frames_to_skip = missing / 2
    avg_gap_len = int(video_frames_to_skip / n_gaps)
    extra = video_frames_to_skip % n_gaps
    skips = 0
    counter = 1
    first = True
    print('Extracting video')
    for i in range(n_actions):
        if first:
            first = False
            i += 1
        else:
            first = True
            ret, frame = cap.read()
            if not ret:
                break
            if counter % 30 == 0:
                cv2.imwrite(
                    os.path.join(output_dir,
                                 "face_" + str(counter - 1) + ".png"), frame)
            i += 1
        counter += 1
        if i in gap_indices:
            skips += 1
            for j in range(int(avg_gap_len)):
                ret, frame = cap.read()
            if extra > 0:
                ret, frame = cap.read()
                extra -= 1
                i += 1

    print('Saving gap_info')
    gap_info = {}
    gap_info['indices'] = gap_indices
    gap_info['missing'] = missing
    print('Saving gaps to file')
    with open(gap_path, 'w') as outfile:
        json.dump(gap_info, outfile)
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 10 21:00:57 2019

@author: tawehbeysolow
"""

import numpy as np
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
from algorithms.actor_critic_utilities import train_model
from neural_networks.models import ActorCriticModel

# Parameters
environment = gym_super_mario_bros.make('SuperMarioBros-v0')
environment = BinarySpaceToDiscreteSpaceEnv(environment, SIMPLE_MOVEMENT)
observation = environment.reset()
learning_rate = 1e-4
gamma = 0.96
epsilon = 0.9
n_episodes = 10000
n_steps = 2048
max_steps = int(1e7)
_lambda = 0.95
value_coefficient = 0.5
entropy_coefficient = 0.01
max_grad_norm = 0.5
log_interval = 10
    return w


def renew_w(preferences, dim):
    w = np.random.randn(reward_size)
    w = np.abs(w) / np.linalg.norm(w, ord=1, axis=0)
    preferences[dim] = w
    return preferences


if __name__ == '__main__':
    args = parser.parse_args()

    # get environment information
    env = JoypadSpace(gym_super_mario_bros.make(args.env_id), SIMPLE_MOVEMENT)
    input_size = env.observation_space.shape
    output_size = env.action_space.n
    reward_size = 5
    env.close()

    # setup
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    tag = ["test", "train"][int(args.training)]
    log_dir = os.path.join(
        args.logdir,
        '{}_{}_{}_{}'.format(args.env_id, args.name, current_time, tag))
    writer = SummaryWriter(log_dir)
    model_path = 'saved/{}_{}_{}.model'.format(args.env_id, args.name,
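# `renew_w` re-randomizes one row of a preference matrix and L1-normalizes
# it, so each row is a non-negative weighting over the reward_size reward
# components that sums to 1. A small usage sketch; the 4-row matrix is
# illustrative, not from the original code.
import numpy as np

reward_size = 5
preferences = np.abs(np.random.randn(4, reward_size))
preferences /= np.linalg.norm(preferences, ord=1, axis=1, keepdims=True)
preferences = renew_w(preferences, dim=2)
assert np.isclose(preferences[2].sum(), 1.0)  # L1-normalized, non-negative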
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT, RIGHT_ONLY
import warnings
from helper_file import *
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

warnings.simplefilter("ignore", lineno=148)

current_user = getpass.getuser()
if current_user == "gryslik":
    model_path = '/Users/gryslik/gitRepos/qlearning/test_code/mario/models6-DDQN/'
else:
    model_path = '/home/ubuntu/data/code/mario/models6-DDQN/'

env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')  # 2-2, 1-1
env = JoypadSpace(env, RIGHT_ONLY)

all_files = os.listdir(model_path)
if "travel_distance.csv" in all_files:
    models_processed = pd.read_csv(
        model_path + "travel_distance.csv")['model_name'].values
    models_to_compute = [
        x for x in all_files
        if (x not in models_processed and x not in ".DS_Store"
            and x not in "travel_distance.csv")
    ]
else:
    models_to_compute = [
        item for item in all_files
    def __init__(self):
        super().__init__()
        self.env = gym_super_mario_bros.make("SuperMarioBros-v0")
        self.env = JoypadSpace(self.env, COMPLEX_MOVEMENT)
        self.history_size = 3
        self.action_repeats = 6
def createenvironment(enviro, movementset):
    environment = gym_super_mario_bros.make(enviro)
    environment = BinarySpaceToDiscreteSpaceEnv(environment, movementset)
    return environment
import tensorflow as tf        # deep learning library
import numpy as np             # matrix handling
import random                  # decide whether to explore or exploit
import warnings                # silence the skimage warnings printed during training
import os
from collections import deque  # ordered collection with fast appends at both ends

from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros    # Kautenja's gym environment
from gym_super_mario_bros.actions import RIGHT_ONLY

from skimage import transform       # frame preprocessing
from skimage.color import rgb2gray  # frame grayscaling

warnings.filterwarnings('ignore')  # ignore warning messages

# Create our environment
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = BinarySpaceToDiscreteSpaceEnv(env, RIGHT_ONLY)  # restrict the agent to right-only movements

# env.render()  # shows the game as it plays
# print("The size of our frame is: ", env.observation_space)
# print("The action size is : ", env.action_space.n)

# Create a one-hot encoded version of our actions, e.g. for 5 actions:
# possible_actions = [[1, 0, 0, 0, 0], [0, 1, 0, 0, 0], ...]
possible_actions = np.array(
    np.identity(env.action_space.n, dtype=int).tolist())
# print("Possible Actions:", possible_actions)
print("This is the newest version")
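# A quick check of the one-hot lookup defined above: an integer action index
# from the environment maps to the matching row of `possible_actions`.
action_index = env.action_space.sample()         # e.g. 3
one_hot_action = possible_actions[action_index]  # e.g. [0, 0, 0, 1, 0]
print(action_index, one_hot_action)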
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

env = gym_super_mario_bros.make('SuperMarioBrosNoFrameskip-1-1-v0')
env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)

for _ in range(1000):
    observation = env.reset()
    done = False
    t = 0
    while not done:
        observation, reward, done, info = env.step(env.action_space.sample())
        env.render()
        t += 1
        if not t % 100:
            print(t, info)

env.close()
    return w


def renew_w(preferences, dim):
    w = np.random.randn(reward_size)
    w = np.abs(w) / np.linalg.norm(w, ord=1, axis=0)
    preferences[dim] = w
    return preferences


if __name__ == '__main__':
    args = parser.parse_args()

    # get environment information
    env = BinarySpaceToDiscreteSpaceEnv(
        gym_super_mario_bros.make(args.env_id), SIMPLE_MOVEMENT)
    input_size = env.observation_space.shape
    output_size = env.action_space.n
    reward_size = 5
    env.close()

    # setup
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    tag = ["test", "train"][int(args.training)]
    log_dir = os.path.join(
        args.logdir,
        '{}_{}_{}_{}'.format(args.env_id, args.name, current_time, tag))
    writer = SummaryWriter(log_dir)
# C:\Users\Micha\Anaconda3\envs\tensorflow\Lib\site-packages\retro
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import numpy as np
import cv2
import neat
import pickle

env = gym_super_mario_bros.make('SuperMarioBros-8-3-v1')
env = JoypadSpace(env, SIMPLE_MOVEMENT)

imgarray = []
xpos_end = 0

resume = True
restore_file = "neat-checkpoint-692"


def eval_genomes(genome, config):
    ob = env.reset()
    ac = env.action_space.sample()

    inx, iny, inc = env.observation_space.shape
    # Downsample the observation to 1/8 resolution for the network input
    inx = int(inx / 8)
    iny = int(iny / 8)

    net = neat.nn.recurrent.RecurrentNetwork.create(genome, config)

    current_max_fitness = 0
    fitness_current = 0
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
from utils import process_frame

env = gym_super_mario_bros.make("SuperMarioBros-v1")
env = JoypadSpace(env, SIMPLE_MOVEMENT)

from random import randint
import numpy as np
import os
import tensorflow as tf
from model import generate_model

model_file_path = "./nn_model"
if os.path.exists(model_file_path):
    model = tf.keras.models.load_model(model_file_path)
else:
    img_rows, img_cols = 240, 256
    model = generate_model((img_rows, img_cols, 3), env.action_space.n)

# env.action_space.sample() returns an action index: 0, 1, 2, 3, ...
# state = the raw RGB frame; a numpy array with shape (240, 256, 3)
# reward = int; for example 0, 1, 2, ...
# done = False or True
# info = {'coins': 0, 'flag_get': False, 'life': 3, 'score': 0, 'stage': 1,
#         'status': 'small', 'time': 400, 'world': 1, 'x_pos': 40}

done = True
last_state = None
identity = np.identity(
def create_mario_env(env_id, reward_type):
    env = gym_super_mario_bros.make(env_id)
    env = BinarySpaceToDiscreteSpaceEnv(env, PALETTE_ACTIONS)
    env = wrap_mario(env, reward_type)
    return env
        state, reward, done, info = env.step(
            env.action_space.sample())  # action is an integer between 0 and 11

        # experience replay
        # loop through epochs
        # perform action
        env.render()

    env.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', default='SuperMarioBros-v3', type=str,
                        help='environment id')
    parser.add_argument('-m', default='human', type=str,
                        help='render mode')
    ARGS = parser.parse_args()

    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    # env = gym_super_mario_bros.make('SuperMarioBrosNoFrameskip-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)
    train(env)
import time

from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

if __name__ == "__main__":
    # Smoke-test every stage of every world in ROM versions v1-v3
    for ver in range(1, 4):
        for world in range(1, 9):
            for stage in range(1, 5):
                env = gym_super_mario_bros.make(
                    f'SuperMarioBros-{world}-{stage}-v{ver}')
                env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
                done = True
                for step in range(5):
                    if done:
                        state = env.reset()
                    state, reward, done, info = env.step(
                        env.action_space.sample())
                    env.render()
                    time.sleep(1.)
                env.close()