def run(self, solution, level, render, mode):
    env = gym_super_mario_bros.make(level)
    env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)
    done = True
    reason_finish = "no_more_commands"
    pos = 0
    total_r = 0
    for step in range(len(solution)):
        if done:
            state = env.reset()
        state, reward, done, info = env.step(solution[pos])
        pos += 1
        if reward == -15:  # Mario died
            reason_finish = "death"
            break
        if mode == "level" and info['flag_get']:
            reason_finish = "win"
            break
        total_r = total_r + reward
        if render == "true":
            env.render()
    env.close()
    return total_r, pos, info, reason_finish
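# Hypothetical usage sketch for run() above: it is written as a method, so we
# attach it to a minimal host class here; the class name and the random
# command sequence are illustrative, not part of the original code.
import random

class SolutionRunner:
    run = run  # reuse the evaluator defined above

runner = SolutionRunner()
solution = [random.randrange(len(COMPLEX_MOVEMENT)) for _ in range(200)]
total_r, pos, info, reason = runner.run(solution, 'SuperMarioBros-1-1-v0',
                                        render="false", mode="level")
print(reason, 'after', pos, 'commands, total reward', total_r)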
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U

    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

    env = gym_super_mario_bros.make('SuperMarioBros-v1')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    env = ProcessFrame84(env)
    env = FrameMemoryWrapper(env)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    def render_callback(lcl, _glb):
        env.render()

    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=int(num_timesteps * 1.1),
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2,
                        entcoeff=0.01,
                        optim_epochs=4,
                        optim_stepsize=1e-3,  # 3e-4
                        optim_batchsize=64,  # 256
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear',
                        callback=render_callback)
    env.close()
def mariocontext(marioEnv):
    mario_env = 'SuperMarioBros' + marioEnv.noFrameSkip + '-' + str(
        marioEnv.world) + '-' + str(marioEnv.stage) + '-v' + str(
        marioEnv.version)
    env = gym_super_mario_bros.make(mario_env)
    env = BinarySpaceToDiscreteSpaceEnv(env, marioEnv.action_encoding)
    yield env
    env.close()
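# mariocontext() is a generator, so it can be turned into a context manager;
# a sketch assuming a config object with the attributes it reads (noFrameSkip,
# world, stage, version, action_encoding). _Config is a hypothetical stand-in.
from contextlib import contextmanager

class _Config:
    noFrameSkip = ''            # '' or 'NoFrameskip'
    world, stage, version = 1, 1, 0
    action_encoding = SIMPLE_MOVEMENT

with contextmanager(mariocontext)(_Config()) as env:
    state = env.reset()
    state, reward, done, info = env.step(env.action_space.sample())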
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--checkpoint-freq', type=int, default=10)
    parser.add_argument('--checkpoint-path', type=str, default=None)
    args = parser.parse_args()

    logger.configure()
    set_global_seeds(args.seed)

    env = gym_super_mario_bros.make('SuperMarioBros-v3')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    env = ProcessFrame84(env)
    print("logger.get_dir():", logger.get_dir())
    print("PROJ_DIR:", PROJ_DIR)
    env = bench.Monitor(env, logger.get_dir())

    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=args.num_timesteps,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        prioritized_replay_alpha=args.prioritized_replay_alpha,
        checkpoint_freq=args.checkpoint_freq,
        print_freq=1)
    print("Saving model to mario_model.pkl")
    act.save("../models/mario_model_{}.pkl".format(
        datetime.datetime.now().isoformat()))
    env.close()
class MarioEnv:
    def __init__(self, os='mac', display=False):
        self.display = display
        if os == 'mac' or os == 'linux':
            env = gym_super_mario_bros.make('SuperMarioBros-v0')
            self.env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
        else:
            raise Exception("bad os")
        self.act_dim = self.env.action_space.n
        self.obs_dim = (1, 128, 128)
        print("env created with act_dim", self.act_dim, "obs_dim", self.obs_dim)
        # chain two transforms: to tensor, then normalize to roughly [-1, 1]
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
        ])

    def reset(self):
        state = self.env.reset()
        return self.__resize_image(state)

    def step(self, action):
        state, reward, done, info = self.env.step(action)
        if reward == 0:
            reward = -0.5  # penalize standing still
        state_t = self.__resize_image(state)
        return state_t, \
            np.reshape(reward, -1), \
            np.reshape(done, -1)

    def close(self):
        self.env.close()

    def __resize_image(self, state):
        state_new = cv2.resize(state, (128, 128))
        img = Image.fromarray(state_new)
        state_t = self.transform(img)[0, :, :].unsqueeze(0)  # keep a single channel
        state_t = state_t.float().to(DEVICE)
        return state_t.unsqueeze(0)  # add a batch dimension

    def render(self):
        if self.display:
            self.env.render()
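# A short usage sketch for MarioEnv, assuming DEVICE and the imports used by
# the class are in scope; the uniform-random policy is illustrative.
env = MarioEnv(os='linux', display=True)
state = env.reset()                         # tensor of shape (1, 1, 128, 128)
for _ in range(100):
    action = np.random.randint(env.act_dim)
    state, reward, done = env.step(action)  # reward and done are length-1 arrays
    env.render()
    if done[0]:
        state = env.reset()
env.close()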
def main():
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    done = True
    max_step = 5000
    print(env.observation_space.shape)
    # on Windows, pass ascii=True so the progress bar does not wrap onto new lines
    qbar = tqdm(total=max_step, ascii=True)
    for step in range(max_step):
        qbar.update()
        if done:
            state = env.reset()
        action = get_action(state, env.action_space)
        state, reward, done, info = env.step(action)
        if done:
            print(str(step) + " Our hero has fallen; try again. " + str(info))
        env.render()
    env.close()
    qbar.close()
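# get_action() is not shown above; a minimal stand-in that ignores the state
# and samples uniformly is enough to drive the loop (a learned policy could
# replace it with the same signature).
def get_action(state, action_space):
    return action_space.sample()  # placeholder: random discrete action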
## Base model to run the game, using random movements
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
from aux import *
import gym_super_mario_bros
from gym_super_mario_bros.actions import COMPLEX_MOVEMENT

env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)

done = True
oldi = {
    'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1,
    'status': 'small', 'time': 400, 'world': 1, 'x_pos': 40
}
for step in range(100):
    if done:
        state = env.reset()
    state, rwd, done, info = env.step(1)  # or env.action_space.sample()
    print(reward(info, oldi), "vs", rwd)  # compare custom reward (from aux) against env reward
    print(env.observation_space.shape)
    oldi = info
    env.render()
env.close()
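# The reward(info, oldi) helper comes from aux and is not shown. A plausible
# sketch, mirroring the documented gym-super-mario-bros reward (x-position
# delta + clock penalty + death penalty) so the printed comparison lines up;
# the real aux implementation may differ.
def reward(info, oldi):
    v = info['x_pos'] - oldi['x_pos']               # progress to the right
    c = info['time'] - oldi['time']                 # clock difference (<= 0)
    d = -15 if info['life'] < oldi['life'] else 0   # death penalty
    return v + c + d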
def main():
    movement = SIMPLE_MOVEMENT
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    # channels acts as the number of frames in history;
    # if resize_height and final_height differ, the image will be cropped
    channels = 3
    frames = 4
    width = 128
    resize_height = 180
    final_height = 128
    bottom_chop = 15
    epsilon = 0.0

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")
    model = simple_net(channels, len(movement), device).to(device)
    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))

    max_steps = 5000
    num_eps = 1
    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width, 3], final_height, bottom_chop)
        state = torch.cat((state, state, state, state))
        action = 0
        episode_reward = 0
        for step in range(max_steps):
            if step % 3 == 0:  # pick a new action every third frame
                if random.random() < epsilon:
                    action = random.randint(0, len(movement) - 1)
                else:
                    q_val, action, q_vals = maxQ(state, model, device)
            next_state, reward, done, info = env.step(int(action))
            reward = 1 if reward > 0 else -1  # binarize the reward
            episode_reward += reward
            next_state = preprocess(next_state, [resize_height, width, 3],
                                    final_height, bottom_chop)
            next_state = torch.cat((state[3:, :, :], next_state))  # slide the 4-frame window
            state = next_state
            env.render()
            time.sleep(0.03)
            if done:
                break
    env.close()
def replay_genome(genome, movements, gen):
    env_expanded = gym_super_mario_bros.SuperMarioBrosEnv(frames_per_step=1, rom_mode='vanilla')
    env = BinarySpaceToDiscreteSpaceEnv(env_expanded, movements)

    print('Number of genes: ', len(genome.connection_genes))
    for gene in genome.connection_genes:
        print(gene.in_node, gene.out_node, gene.weight,
              gene.innovation_number, gene.type, gene.enabled)

    done = True
    # bookkeeping for an optional 30 FPS throttle (the throttle loop itself is disabled)
    unticked = 0
    tick_interval = 1 / 30
    last_tick_time = time.time()
    frames = 0

    for _ in range(500000):
        unticked += time.time() - last_tick_time
        last_tick_time = time.time()

        if done:
            state = env.reset()
        state_downscaled = get_sensor_map(env_expanded)
        action = genome.calculate_action(state_downscaled)
        action = np.argmax(action)
        print('\rtaking action', movements[action], end='', flush=True)
        state, reward, done, info = env.step(action)

        # render the downscaled sensor map as a small color-coded image
        save_state = np.full((13, 10, 3), 255, dtype=int)
        COLORS = [[250, 250, 250], [0, 0, 0], [196, 0, 0], [0, 0, 196]]
        for i in range(13):
            for j in range(10):
                if state_downscaled[(i, j)] == -1:
                    save_state[(i, j)] = COLORS[3]
                elif state_downscaled[(i, j)] == 0:
                    save_state[(i, j)] = COLORS[0]
                else:
                    save_state[(i, j)] = COLORS[1]
        save_state[(7, 2)] = COLORS[2]  # mark Mario's own cell

        env.render()
        if info["life"] <= 2:  # stop the replay once Mario loses a life
            break
        frames += 1
        unticked -= tick_interval
    env.close()
def main():
    movement = SIMPLE_MOVEMENT
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    # channels acts as the number of frames in history;
    # if resize_height and final_height differ, the image will be cropped
    channels = 4
    width = 128
    resize_height = 168
    final_height = 128
    size = [channels, final_height, width]

    batch_size = 16
    replay_capacity = 100000
    replay_dir = '/home/hansencb/mario_replay/'
    gamma = 0.95
    start_epsilon = 0.3
    stop_epsilon = 0.01
    epsilon_decay = 0.00025

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")
    model = simple_net(channels, len(movement), device).to(device)
    target_model = simple_net(channels, len(movement), device).to(device)

    data_file = 'data_loader'
    model_file = 'mario_agent'
    continue_train = True
    model.load_state_dict(torch.load(model_file))
    if continue_train:
        target_model.load_state_dict(torch.load(model_file))

    lr = 0.00005
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_reward_file = 'total_reward.txt'
    if not continue_train:
        with open(total_reward_file, 'w') as f:
            f.write('Reward\tSteps\n')

    max_steps = 5000
    num_eps = 5000

    if continue_train:
        with open(data_file, 'rb') as f:
            data = pickle.load(f)
        data.batch_size = batch_size
    else:
        data = dataset(replay_capacity, batch_size, replay_dir, size)

    # initialize replay memory with 100 experiences
    done = True
    for i in range(100):
        if done:
            state = env.reset()
            state = preprocess(state, [resize_height, width], final_height)
            state = torch.cat((state, state, state, state))
        action = random.randint(0, len(movement) - 1)
        next_state, reward, done, info = env.step(int(action))
        reward /= 15  # scale the raw reward into roughly [-1, 1]
        if reward == 0:
            reward = -0.1  # penalize standing still
        next_state = preprocess(next_state, [resize_height, width], final_height)
        next_state = torch.cat((state[1:, :, :], next_state))
        trans = transition(state, action, reward, next_state, done)
        data.add(trans)
        state = next_state

    tau = 0
    max_tau = 2000
    decay_step = 0
    farthest = 3000
    cur_x = 1

    # training loop
    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width], final_height)
        state = torch.cat((state, state, state, state))
        action = 0
        episode_reward = 0
        for step in range(max_steps):
            tau += 1
            # anneal epsilon based on how far the agent has ever reached;
            # replaces the standard exponential decay:
            # epsilon = stop_epsilon + (start_epsilon - stop_epsilon) * np.exp(-epsilon_decay * decay_step)
            epsilon = start_epsilon * np.exp(1 - (1 / (cur_x / farthest)))
            if epsilon < stop_epsilon:
                epsilon = stop_epsilon
            if random.random() < epsilon:
                action = random.randint(0, len(movement) - 1)
            else:
                q_val, action, q_vals = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))
            cur_x = info['x_pos']
            if cur_x > farthest:
                farthest = cur_x

            reward /= 15
            if reward == 0:
                reward = -0.1
            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width], final_height)
            next_state = torch.cat((state[1:, :, :], next_state))
            trans = transition(state, action, reward, next_state, done)
            data.add(trans)

            batch = data.get_batch(model, target_model, device, gamma)
            loss, abs_err = train(model, device, optimizer, batch)
            # feed the new TD errors back for prioritized replay
            data.update_batch(batch['idx'],
                              np.squeeze(torch.Tensor.numpy(abs_err)))
            state = next_state
            env.render()

            if tau > max_tau:  # periodically sync the target network
                target_model.load_state_dict(model.state_dict())
                tau = 0
            if done:
                break

        decay_step += step
        with open(total_reward_file, 'a') as f:
            f.write('{}\t{}\n'.format(episode_reward, step))
        if episode % 5 == 0:
            with open(model_file, 'wb') as f:
                torch.save(model.state_dict(), f)
            with open(data_file, 'wb') as f:
                pickle.dump(data, f)
    env.close()
def main():
    movement = SIMPLE_MOVEMENT
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    movement.append(['B'])
    movement.append(['down'])
    movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    # channels acts as the number of frames in history;
    # if resize_height and final_height differ, the image will be cropped
    channels = 4
    width = 84
    resize_height = 110
    final_height = 84
    size = [channels, final_height, width]

    batch_size = 32
    replay_capacity = 100000
    replay_dir = '/home/hansencb/mario_replay/'
    epsilon = 1
    gamma = 0.9

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")
    model = simple_net(channels, len(movement), device).to(device)
    target_model = simple_net(channels, len(movement), device).to(device)
    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))
    target_model.load_state_dict(torch.load(model_file))

    lr = 0.001
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_reward_file = 'total_reward.txt'
    with open(total_reward_file, 'w') as f:
        f.write('Reward\tSteps\n')

    max_steps = 5000
    num_eps = 1000
    data = dataset(replay_capacity, batch_size, replay_dir, 1, size)

    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width], final_height)
        state = torch.cat((state, state, state, state))
        action = 0
        episode_reward = 0
        for step in range(max_steps):
            if step % 3 == 0:  # pick a new action every third frame
                if random.random() < epsilon:
                    action = random.randint(0, len(movement) - 1)
                else:
                    q_val, action = maxQ(state, model, device)
            next_state, reward, done, info = env.step(int(action))
            reward = 1 if reward > 0 else -1  # binarize the reward
            episode_reward += reward
            next_state = preprocess(next_state, [resize_height, width], final_height)
            next_state = torch.cat((state[1:, :, :], next_state))
            trans = transition(state, action, reward, next_state, done)
            data.add(trans)
            train(model, device, optimizer, data.get_batch(model, device, gamma))
            state = next_state
            env.render()
            if done:
                with open(total_reward_file, 'a') as f:
                    f.write('{}\t{}\n'.format(episode_reward, step))
                break
        epsilon -= (1 / num_eps)  # linear epsilon decay per episode
        if episode % 10 == 0:
            target_model.load_state_dict(model.state_dict())
            with open(model_file, 'wb') as f:
                torch.save(model.state_dict(), f)
    env.close()
# NOTE: this snippet is truncated at the top; the Sequential() construction and
# the first Conv2D's filter count (assumed 32 here) are reconstructed from context.
model = Sequential()
model.add(Conv2D(32,
                 kernel_size=(5, 5),
                 strides=(1, 1),
                 activation='relu',
                 input_shape=image_shape))
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
model.add(Conv2D(64, (5, 5), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(128, activation='relu'))  # redundant Flatten() calls after Dense removed
model.add(Dense(7, activation='linear'))  # one Q-value per SIMPLE_MOVEMENT action
model.compile(loss='mse',
              optimizer=Adam(clipnorm=10, lr=1e-4, decay=1e-6, epsilon=1e-4))

"""
done = True
for step in range(5000):
    if done:
        state = env.reset()
    state, reward, done, info = env.step(env.action_space.sample())
env.close()
"""

episodes = 1000
gamma = 0.99
epsilon = 0.1
reward_count = 0
reward_history = []
max_episode_reward = 0
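# A hedged sketch of how the compiled model above might drive an epsilon-greedy
# policy; preprocessing the raw frame to image_shape is assumed to happen
# before the call, and choose_action is not part of the original code.
import numpy as np

def choose_action(model, state, epsilon, n_actions=7):
    # explore with probability epsilon, otherwise act greedily on predicted Q-values
    if np.random.rand() < epsilon:
        return np.random.randint(n_actions)
    q_values = model.predict(state[np.newaxis, ...], verbose=0)  # add batch dim
    return int(np.argmax(q_values[0]))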
class Agent:
    def __init__(self, level_name):
        self.level_name = level_name
        # set up environment
        self.env = gym_super_mario_bros.make(level_name)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, SIMPLE_MOVEMENT)
        # one-hot encoded version of our actions
        self.possible_actions = np.array(
            np.identity(self.env.action_space.n, dtype=int).tolist())
        # reset graph
        tf.reset_default_graph()
        # instantiate the DQNetwork
        self.DQNetwork = DQNetwork(state_size, action_size, learning_rate)
        # instantiate memory
        self.memory = Memory(max_size=memory_size)
        # initialize deque with zero images
        self.stacked_frames = deque(
            [np.zeros((100, 128), dtype=int) for i in range(stack_size)],
            maxlen=4)

        for i in range(pretrain_length):
            # if it's the first step
            if i == 0:
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)
            # get next state, reward, done by taking a random action
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
            next_state, reward, done, _ = self.env.step(choice)
            # stack the frames
            next_state, self.stacked_frames = stack_frames(
                self.stacked_frames, next_state, False)
            # if the episode is finished (we're dead)
            if done:
                # we finished the episode
                next_state = np.zeros(state.shape)
                # add experience to memory
                self.memory.add((state, action, reward, next_state, done))
                # start a new episode
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)
            else:
                # add experience to memory
                self.memory.add((state, action, reward, next_state, done))
                # our new state is now the next_state
                state = next_state

        # saver will help us save our model
        self.saver = tf.train.Saver()
        # set up tensorboard writer
        self.writer = tf.summary.FileWriter("logs/")
        # losses
        tf.summary.scalar("Loss", self.DQNetwork.loss)
        self.write_op = tf.summary.merge_all()

    def predict_action(self, sess, explore_start, explore_stop, decay_rate,
                       decay_step, state, actions):
        # first we randomize a number
        exp_exp_tradeoff = np.random.rand()
        explore_probability = explore_stop + (
            explore_start - explore_stop) * np.exp(-decay_rate * decay_step)
        if explore_probability > exp_exp_tradeoff:
            # make a random action
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
        else:
            # estimate the Q values of the state
            Qs = sess.run(self.DQNetwork.output,
                          feed_dict={
                              self.DQNetwork.inputs_: state.reshape((1, *state.shape))
                          })
            # take the biggest Q value (= best action)
            choice = np.argmax(Qs)
            action = self.possible_actions[choice]
        return action, choice, explore_probability

    def play_notebook(self):
        import matplotlib.pyplot as plt
        # imports to render env to gif
        from JSAnimation.IPython_display import display_animation
        from matplotlib import animation
        from IPython.display import display

        # http://mckinziebrandon.me/TensorflowNotebooks/2016/12/21/openai.html
        def display_frames_as_gif(frames):
            """Displays a list of frames as a gif, with controls."""
            patch = plt.imshow(frames[0])
            plt.axis('off')

            def animate(i):
                patch.set_data(frames[i])

            anim = animation.FuncAnimation(plt.gcf(), animate,
                                           frames=len(frames), interval=50)
            display(display_animation(anim, default_mode='loop'))

        frames = []
        with tf.Session() as sess:
            total_test_rewards = []
            # load the model
            self.saver.restore(sess, "models/{0}.cpkt".format(self.level_name))
            for episode in range(1):
                total_rewards = 0
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)
                print("****************************************************")
                print("EPISODE ", episode)
                while True:
                    # reshape the state
                    state = state.reshape((1, *state_size))
                    # get action from Q-network: estimate the Q values of the state
                    Qs = sess.run(self.DQNetwork.output,
                                  feed_dict={self.DQNetwork.inputs_: state})
                    # take the biggest Q value (= the best action)
                    choice = np.argmax(Qs)
                    # perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    frames.append(self.env.render(mode='rgb_array'))
                    total_rewards += reward
                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break
                    next_state, self.stacked_frames = stack_frames(
                        self.stacked_frames, next_state, False)
                    state = next_state
            self.env.close()
        display_frames_as_gif(frames)

    def play(self):
        with tf.Session() as sess:
            total_test_rewards = []
            # load the model
            self.saver.restore(sess, "models/{0}.cpkt".format(self.level_name))
            for episode in range(1):
                total_rewards = 0
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)
                print("****************************************************")
                print("EPISODE ", episode)
                while True:
                    # reshape the state
                    state = state.reshape((1, *state_size))
                    # get action from Q-network: estimate the Q values of the state
                    Qs = sess.run(self.DQNetwork.output,
                                  feed_dict={self.DQNetwork.inputs_: state})
                    # take the biggest Q value (= the best action)
                    choice = np.argmax(Qs)
                    # perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    self.env.render()
                    total_rewards += reward
                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break
                    next_state, self.stacked_frames = stack_frames(
                        self.stacked_frames, next_state, False)
                    state = next_state
            self.env.close()

    def train(self):
        with tf.Session() as sess:
            # initialize the variables
            sess.run(tf.global_variables_initializer())
            # initialize decay rate (that will be used to reduce epsilon)
            decay_step = 0
            for episode in range(total_episodes):
                # set step to 0
                step = 0
                # initialize rewards of episode
                episode_rewards = []
                # make a new episode and observe the first state
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)
                print("Episode:", episode)

                while step < max_steps:
                    step += 1
                    # increase decay_step
                    decay_step += 1
                    # predict an action
                    action, choice, explore_probability = self.predict_action(
                        sess, explore_start, explore_stop, decay_rate,
                        decay_step, state, self.possible_actions)
                    # perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    if episode_render:
                        self.env.render()
                    # add the reward to total reward
                    episode_rewards.append(reward)

                    # the game is finished
                    if done:
                        print("done")
                        # the episode ends, so there is no next state
                        next_state = np.zeros((110, 84), dtype=int)
                        next_state, self.stacked_frames = stack_frames(
                            self.stacked_frames, next_state, False)
                        # set step = max_steps to end the episode
                        step = max_steps
                        # get total reward of the episode
                        total_reward = np.sum(episode_rewards)
                        print("Episode:", episode,
                              "Total reward:", total_reward,
                              "Explore P:", explore_probability,
                              "Training Loss:", loss)
                        # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add((state, action, reward, next_state, done))
                    else:
                        # stack frame of the next state
                        next_state, self.stacked_frames = stack_frames(
                            self.stacked_frames, next_state, False)
                        # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add((state, action, reward, next_state, done))
                        # s_i := s_{i+1}
                        state = next_state

                    ### Learning part
                    # obtain random mini-batch from memory
                    batch = self.memory.sample(batch_size)
                    states_mb = np.array([each[0] for each in batch], ndmin=3)
                    actions_mb = np.array([each[1] for each in batch])
                    rewards_mb = np.array([each[2] for each in batch])
                    next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                    dones_mb = np.array([each[4] for each in batch])
                    target_Qs_batch = []

                    # get Q values for next_state
                    Qs_next_state = sess.run(
                        self.DQNetwork.output,
                        feed_dict={self.DQNetwork.inputs_: next_states_mb})

                    # set Q_target = r if the episode ends at s+1
                    for i in range(len(batch)):
                        terminal = dones_mb[i]
                        # in a terminal state the target only equals the reward
                        if terminal:
                            target_Qs_batch.append(rewards_mb[i])
                        else:
                            target = rewards_mb[i] + gamma * np.max(Qs_next_state[i])
                            target_Qs_batch.append(target)
                    targets_mb = np.array([each for each in target_Qs_batch])

                    loss, _ = sess.run(
                        [self.DQNetwork.loss, self.DQNetwork.optimizer],
                        feed_dict={
                            self.DQNetwork.inputs_: states_mb,
                            self.DQNetwork.target_Q: targets_mb,
                            self.DQNetwork.actions_: actions_mb
                        })

                    # write tf summaries
                    summary = sess.run(self.write_op,
                                       feed_dict={
                                           self.DQNetwork.inputs_: states_mb,
                                           self.DQNetwork.target_Q: targets_mb,
                                           self.DQNetwork.actions_: actions_mb
                                       })
                    self.writer.add_summary(summary, episode)
                    self.writer.flush()

                # save model every 5 episodes
                if episode % 5 == 0:
                    self.saver.save(sess, "models/{0}.cpkt".format(self.level_name))
                    print("Model Saved")
class MarioEnvironment(AbstractEnvironment):
    """
    Standard Super Mario Bros Environment
    https://github.com/Kautenja/gym-super-mario-bros
    """

    def __init__(self, game_name, task_name, action_mode=SIMPLE_MOVEMENT,
                 state_size=None):
        """
        Args:
            game_name : string, name of the game (e.g. SuperMarioBros-5-1-v0)
            task_name : string, name of the task
            state_size : list or tuple or None, size of state, [h, w] or [h, w, c]
        """
        self.game_name = game_name
        self.task_name = task_name
        self.action_mode = action_mode
        self.state_size = state_size  # was never stored, but get_state() reads it
        self.env = gym_super_mario_bros.make(game_name)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, self.action_mode)
        self.n_action = self.env.action_space.n
        self.actions = [a for a in range(self.n_action)]
        self.new_episode()

    def get_state(self, setting=None):
        """
        Get Current State

        Args:
            setting : dictionary with an optional 'resolution' key,
                resolution of states, [h, w, c] or [h, w]

        Returns:
            state : numpy.ndarray, current screen, shape [h, w, c],
                values in [0, 1]
        """
        if setting is None or 'resolution' not in setting.keys():
            resolution = self.state_size
        else:
            resolution = setting['resolution']
        normalized = False
        if len(resolution) == 3 and resolution[2] == 1:
            state = rgb2grey(self.ob)  # rgb2grey already returns values in [0, 1]
            normalized = True
        else:
            state = self.ob
        if state.ndim == 2:
            state = np.expand_dims(state, axis=-1)
        assert state.ndim == 3, 'shape of screen should be [h, w, c]'
        state = resize(state, resolution[:2], preserve_range=True)
        state = state.astype(float)
        if not normalized:
            state /= 255.
        return state

    def apply_action(self, action, num_repeat):
        """
        Apply an action to the environment and get the reward

        Args:
            action : int, applied action
            num_repeat : int, number of repeated actions

        Returns:
            reward : float, reward of last action
        """
        assert not self.done, 'The episode is done'
        reward = 0
        for _ in range(num_repeat):
            self.ob, reward, self.done, _ = self.env.step(action)
            self.score += reward
            if self.done:
                break
        reward = reward_reshape(reward, self.game_name, self.task_name)
        return reward

    def new_episode(self):
        """Start a new episode"""
        self.ob = self.env.reset()
        self.done = False
        self.score = 0

    def episode_end(self):
        """Return True when the episode has finished"""
        return self.done

    def action_set(self):
        """Return the list of actions"""
        return self.actions

    def available_action(self):
        """Return indices of available actions for the current state"""
        return range(self.n_action)  # was range(self.actions), which raises TypeError

    def episode_total_score(self):
        """Get total score for the last episode"""
        return self.score

    def close(self):
        """Close the environment"""
        self.env.close()
        return True
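# A brief usage sketch for MarioEnvironment; the task name and frame repeat
# are illustrative, and reward_reshape is assumed to exist in the surrounding
# project.
env = MarioEnvironment('SuperMarioBros-1-1-v0', 'level', state_size=[84, 84, 1])
while not env.episode_end():
    action = np.random.choice(env.action_set())
    r = env.apply_action(action, num_repeat=4)  # repeat each action over 4 frames
    s = env.get_state()                         # [84, 84, 1] floats in [0, 1]
print('episode score:', env.episode_total_score())
env.close()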
def main():
    movement = SIMPLE_MOVEMENT
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    # channels acts as the number of frames in history;
    # if resize_height and final_height differ, the image will be cropped
    channels = 3
    frames = 4
    width = 128
    resize_height = 180
    final_height = 128
    bottom_chop = 15
    size = [channels * frames, final_height, width]

    batch_size = 16
    replay_capacity = 100000
    replay_dir = '/home-local/bayrakrg/mario_replay/'
    start_epsilon = 1.0
    stop_epsilon = 0.01
    epsilon_decay = 0.00005
    gamma = 0.75

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")
    model = simple_net(channels, len(movement), device).to(device)
    target_model = simple_net(channels, len(movement), device).to(device)
    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))
    target_model.load_state_dict(torch.load(model_file))

    lr = 0.0001
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_reward_file = 'total_reward.txt'
    with open(total_reward_file, 'w') as f:
        f.write('Reward\tSteps\n')

    max_steps = 500
    num_eps = 10000
    data = dataset(replay_capacity, batch_size, replay_dir, 1, size)

    tau = 0
    max_tau = 10000
    decay_step = 0
    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width, 3], final_height, bottom_chop)
        state = torch.cat((state, state, state, state))
        action = 0
        episode_reward = 0
        for step in range(max_steps):
            tau += 1
            decay_step += 1
            # exponential epsilon decay
            epsilon = stop_epsilon + (start_epsilon - stop_epsilon) * np.exp(
                -epsilon_decay * decay_step)
            if random.random() < epsilon:
                action = random.randint(0, len(movement) - 1)
            else:
                q_val, action, q_vals = maxQ(state, model, device)
            next_state, reward, done, info = env.step(int(action))
            if step == max_steps - 1:
                reward -= 10  # timeout penalty (applied before binarizing below)
            reward = 1 if reward > 0 else -1
            episode_reward += reward
            next_state = preprocess(next_state, [resize_height, width, 3],
                                    final_height, bottom_chop)
            next_state = torch.cat((state[3:, :, :], next_state))
            trans = transition(state, action, reward, next_state, done)
            data.add(trans)
            train(model, device, optimizer,
                  data.get_batch(model, target_model, device, gamma))
            state = next_state
            env.render()
            if tau > max_tau:  # periodically sync the fixed Q-target network
                target_model.load_state_dict(model.state_dict())
                tau = 0
            if done:
                break
        with open(total_reward_file, 'a') as f:
            f.write('{}\t{}\n'.format(episode_reward, step))
        if episode % 5 == 0:
            with open(model_file, 'wb') as f:
                torch.save(model.state_dict(), f)
    env.close()
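# Several snippets above call a preprocess() helper that is not shown; a
# minimal sketch of what it plausibly does (optional grayscale, resize, crop
# to final_height, CHW float tensor). The exact original may differ.
import cv2
import torch

def preprocess(frame, resize_shape, final_height, bottom_chop=0):
    resize_height, width = resize_shape[0], resize_shape[1]
    if len(resize_shape) == 2:
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)  # 2-element shape -> grayscale
    frame = cv2.resize(frame, (width, resize_height))
    if bottom_chop:
        frame = frame[:frame.shape[0] - bottom_chop]     # drop the bottom strip
    frame = frame[:final_height]                         # crop down to final_height
    tensor = torch.from_numpy(frame).float() / 255.0
    if tensor.dim() == 2:
        tensor = tensor.unsqueeze(0)                     # grayscale -> (1, H, W)
    else:
        tensor = tensor.permute(2, 0, 1)                 # HWC -> CHW
    return tensor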
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--dueling', type=int, default=0)
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-path', type=str, default='/.')
    args = parser.parse_args()

    # configure logging dir for tensorboard alongside stdout/log/csv/json
    timestart = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M:%S')
    logger.configure(
        dir=PROJ_DIR + "/../tensorboard/" + str(timestart),
        format_strs=['stdout', 'log', 'csv', 'json', 'tensorboard'])
    logger.set_level(logger.INFO)
    set_global_seeds(args.seed)

    env = gym_super_mario_bros.make('SuperMarioBros-v1')
    # wrap environment
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    # record videos of an episode
    env = VideoRecorderWrapper(env, PROJ_DIR + "/../video", str(timestart), 50)
    # the agent has only one trial
    env = EpisodicLifeEnv(env)  # nes_py
    # preprocess the input frame
    env = DownsampleEnv(env, (84, 84))
    # set death penalty
    env = PenalizeDeathEnv(env, penalty=-25)
    # stack 4 frames as input
    env = FrameStackEnv(env, 4)

    # print tensorboard log information
    print("logger.get_dir():", logger.get_dir())
    print("PROJ_DIR:", PROJ_DIR)

    # enable output in the terminal
    env = bench.Monitor(env, logger.get_dir())

    # callback for rendering during training
    def render_callback(lcl, _glb):
        env.render()

    # experiment log (CNN built with deepq.models.cnn_to_mlp, trained with deepq.learn):
    # 2018-08-12-10:25:50 model 4, 100k, lr 0.0005, alpha 0.6, gamma 0.99, 8 frames, v1
    # 2018-08-12-11:31:59 model 4, 100k, lr 0.0005, alpha 0.8, gamma 0.99, 6 frames, v1

    # model 04: Nature DQN paper plus improvements:
    # Dueling Double DQN, Prioritized Experience Replay, and fixed Q-targets
    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],  # (num_outputs, kernel_size, stride)
        hiddens=[512],
        dueling=bool(1),
    )
    act = deepq.learn(
        env,
        q_func=model,
        lr=0.0001,
        max_timesteps=int(100000),  # 100k -> about 3h
        buffer_size=50000,
        exploration_fraction=0.3,
        exploration_final_eps=0.1,
        train_freq=4,
        learning_starts=25000,
        target_network_update_freq=1000,
        gamma=0.5,
        prioritized_replay=bool(1),
        prioritized_replay_alpha=0.2,
        checkpoint_freq=args.checkpoint_freq,
        callback=render_callback,
        print_freq=1)

    print("Saving model to mario_model.pkl " + timestart)
    act.save("../models/mario_model_{}.pkl".format(timestart))
    env.close()