def Environment():
    """Create the Super Mario Bros environment with its wrapper stack.

    The base environment for ``ENV_NAME`` is wrapped, innermost first, by
    ``JoypadSpace`` (using ``COMPLEX_MOVEMENT``), ``Reward`` and ``SkipFrame``.

    Returns:
        The fully wrapped environment (``SkipFrame`` is the outermost layer).
    """
    base_env = gym_super_mario_bros.make(ENV_NAME)
    joypad_env = JoypadSpace(base_env, COMPLEX_MOVEMENT)
    return SkipFrame(Reward(joypad_env))
import gym_super_mario_bros
from random import random, randrange
from gym_super_mario_bros.actions import RIGHT_ONLY
from nes_py.wrappers import JoypadSpace
from gym import wrappers

# Build a right-only Mario environment and record the run into ./demo.
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, RIGHT_ONLY)
env = wrappers.Monitor(env, 'demo', force=True)

# Play randomly
done = False
env.reset()
step = 0
try:
    while not done:
        action = randrange(len(RIGHT_ONLY))
        state, reward, done, info = env.step(action)
        print(done, step, info)
        if step > 400:
            # Step budget exhausted: leave the loop. The environment is
            # closed exactly once below; the previous code closed it here
            # and then kept rendering/stepping the closed environment.
            break
        env.render()
        step += 1
finally:
    # Always release the emulator and finalise the Monitor recording.
    env.close()
class Env(object):
    """Super Mario Bros environment that records a rollout as it is played.

    Every call to :meth:`step` advances the emulator by up to two frames
    with the same action, stacks the newest grayscale frame in front of the
    previous ones and appends the transition to per-episode history lists
    (``s``, ``a``, ``a_logits``, ``r``, ``v_cur``, ``state_in``).
    """

    def __init__(self, act_space, act_repeats, frames, game):
        """Create the emulator and start the first episode.

        Args:
            act_space: Number of discrete actions; 7 selects
                ``SIMPLE_MOVEMENT`` and 12 selects ``COMPLEX_MOVEMENT``.
                Any other value leaves ``self.env`` unset.
            act_repeats: Candidate action-repeat lengths; one is drawn at
                random whenever the current repeat window ends.
            frames: Number of frames stacked along the last axis.
            game: Environment id passed to ``gym_super_mario_bros.make``.
        """
        self.act_space = act_space
        self.act_repeats = act_repeats
        self.act_repeat = random.choice(self.act_repeats)
        self.frames = frames
        self.max_pos = -10000
        self.count = 0

        raw_env = gym_super_mario_bros.make(game)
        if self.act_space == 7:
            self.env = JoypadSpace(raw_env, SIMPLE_MOVEMENT)
        elif self.act_space == 12:
            self.env = JoypadSpace(raw_env, COMPLEX_MOVEMENT)

        self._begin_episode()

    def _begin_episode(self):
        """Reset the emulator and re-initialise all per-episode buffers."""
        first_frame = self.resize_image(self.env.reset())
        self.s_t = np.tile(first_frame, [1, 1, self.frames])
        self.s = [self.s_t]
        self.a_t = random.randint(0, self.act_space - 1)
        self.a = [self.a_t]
        self.a_logits = []
        self.r = []
        self.pos = []
        self.v_cur = []
        # Two zero vectors of length 256 joined into one initial state
        # (presumably the LSTM cell and hidden state - confirm with model).
        cell_init = np.zeros(256, dtype=np.float32)
        hidden_init = np.zeros(256, dtype=np.float32)
        self.state_in = [np.concatenate([cell_init, hidden_init], axis=-1)]
        self.done = False

    def step(self, a, a_logits, state_in):
        """Advance the game and record the transition.

        Args:
            a: Action proposed by the policy. It only replaces the current
                action when the running repeat window has elapsed.
            a_logits: Policy logits stored alongside the transition.
            state_in: Recurrent state stored alongside the transition.
        """
        self.count += 1
        if self.count % self.act_repeat == 0:
            # Repeat window finished: accept the new action and draw the
            # length of the next window.
            self.a_t = a
            self.count = 0
            self.act_repeat = random.choice(self.act_repeats)

        frame, reward, done, info = self.env.step(self.a_t)
        self.env.render()
        if not done:
            # Play a second frame with the same action and average the
            # two rewards.
            first_reward = reward
            frame, reward, done, info = self.env.step(self.a_t)
            reward += first_reward
            reward /= 2.
        reward /= 15.

        frame = self.resize_image(frame)
        n_channels = frame.shape[-1]
        # Newest frame goes first; the oldest channels fall off the end.
        self.s_t = np.concatenate(
            [frame, self.s_t[:, :, :-n_channels]], axis=-1)
        self.s.append(self.s_t)
        self.a.append(self.a_t)
        self.a_logits.append(a_logits)
        self.r.append(reward)

        x_pos = info["x_pos"]
        self.max_pos = max(self.max_pos, x_pos)
        self.pos.append(x_pos)
        # End the episode when Mario has moved fewer than 5 units compared
        # with his position 500 recorded steps ago.
        if (len(self.pos) > 500) and (x_pos - self.pos[-500] < 5) and (
                self.pos[-500] - x_pos < 5):
            done = True
        self.done = done
        self.state_in.append(state_in)

    def update_v(self, v_cur):
        """Append a value estimate to the episode history."""
        self.v_cur.append(v_cur)

    def reset(self, force=False):
        """Start a new episode if the current one ended or ``force`` is set."""
        if not (self.done or force):
            return
        self.count = 0
        self.act_repeat = random.choice(self.act_repeats)
        self._begin_episode()

    def get_state(self):
        """Return the current stacked observation."""
        return self.s_t

    def get_act(self):
        """Return the action currently being applied."""
        return self.a_t

    def get_max_pos(self):
        """Return the largest x position seen since the last reset of it."""
        return self.max_pos

    def reset_max_pos(self):
        """Reset the tracked maximum x position to its sentinel value."""
        self.max_pos = -10000

    def get_state_in(self):
        """Return the most recently stored recurrent state."""
        return self.state_in[-1]

    def get_history(self, force=False):
        """Return the recorded segment, or ``None`` if none is available.

        A finished episode yields the full history. With ``force`` on an
        unfinished episode the last entry of every list is dropped, and a
        segment is only produced when more than one reward was recorded.
        """
        if self.done:
            return Seg(self.s, self.a, self.a_logits, self.r, self.v_cur,
                       self.state_in)
        if force and len(self.r) > 1:
            return Seg(self.s[:-1], self.a[:-1], self.a_logits[:-1],
                       self.r[:-1], self.v_cur[:-1], self.state_in[:-1])
        return None

    @staticmethod
    def resize_image(image, size=84):
        """Convert a frame to a ``size`` x ``size`` grayscale float32 array.

        Pixel values are divided by 255 and a trailing channel axis is added.
        """
        gray = Image.fromarray(image).convert("L")
        gray = gray.resize((size, size))
        scaled = np.array(gray) / 255.
        scaled = np.array(scaled, np.float32)
        return scaled[:, :, None]
from nes_py.wrappers import JoypadSpace import gym_super_mario_bros from numba import cuda #importing environments from gym_super_mario_bros.actions import SIMPLE_MOVEMENT env = gym_super_mario_bros.make('SuperMarioBros-v0') env = JoypadSpace(env, SIMPLE_MOVEMENT) #importing libraries import numpy as np observation = env.reset() r = [] infos = [] MAX_STEPS = 1500 frames = np.zeros((MAX_STEPS, 240, 256, 3), dtype=np.int64) xs = [] valid_actions = [1, 5, 6] for step in range(MAX_STEPS): # Render into buffer. frames[step] = env.render(mode='rgb_array') observation, reward, done, info = env.step( valid_actions[np.random.randint(3)]) # infos.append(info) r.append(reward) if done: break r = np.array(r) def preprocess(frame): frame = frame.sum(axis=-1) / 765 frame = frame[20:210, :] frame = frame[::2, ::2]
import time
import numpy as np
#from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY
from agent import DQNAgent
from wrappers import wrapper
from utils import get_args

# Parse the command-line arguments.
arg = get_args()

# Build the environment named by ``arg.env``, restrict it to the
# RIGHT_ONLY action set and apply the project's ``wrapper``.
env = gym_super_mario_bros.make(arg.env)
env = JoypadSpace(env, RIGHT_ONLY)
env = wrapper(env)

# State shape handed to the agent and the number of discrete actions.
states = (84, 84, 4)
actions = env.action_space.n

# Double-DQN agent with a replay memory of 100000 entries.
agent = DQNAgent(states=states,
                 actions=actions,
                 max_memory=100000,
                 double_q=True)

# Episode budget and reward log.
# NOTE(review): ``eisodes`` looks like a typo for ``episodes``; it is left
# unchanged because code outside this view may reference the name.
eisodes = 101
rewards = []

# Wall-clock start time.
start = time.time()
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import matplotlib.pyplot as plt
from matplotlib import animation, rc
import numpy as np
from rl.core import Processor

# Custom three-entry action set: every action moves right.
my_action = [
    ['right', 'A'],
    ['right', 'B'],
    ['right', 'A', 'B'],
]

# NOTE(review): ``JoypadSpace`` is not imported in the visible part of the
# file - confirm it is imported above this point.
env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
env = JoypadSpace(env, my_action)

# Number of actions; matches the length of ``my_action`` above.
nb_actions = 3
window_length = 1
# Model input shape: the window length prepended to the observation shape.
input_shape = (window_length, ) + env.observation_space.shape
print(input_shape)

# Reset the game environment.
env.reset()

from keras.models import Sequential
from keras.layers import *
from keras.initializers import he_normal

# Start the network by flattening the windowed observation.
model = Sequential()
print('input_shape' + str(input_shape))
model.add(Flatten(input_shape=input_shape))
class Env(object):
    """Single-transition Super Mario Bros environment wrapper.

    Unlike a rollout recorder, every :meth:`step` overwrites the stored
    transition lists (``s``, ``a``, ``a_logits``, ``r``, ``v_cur``,
    ``state_in``) with the latest values only. Finished episodes are reset
    automatically at the start of the next :meth:`step`.
    """

    def __init__(self, game, **kwargs):
        """Create the emulator and start the first episode.

        Args:
            game: Environment id passed to ``gym_super_mario_bros.make``.
            **kwargs: Optional settings read with ``kwargs.get``:
                ``act_space`` (7 selects ``SIMPLE_MOVEMENT``, 12 selects
                ``COMPLEX_MOVEMENT``), ``state_size``, ``burn_in``,
                ``seqlen``, ``n_step`` and ``frames``.
        """
        read = kwargs.get
        self.act_space = read("act_space")
        self.state_size = read("state_size")
        self.burn_in = read("burn_in")
        self.seqlen = read("seqlen")
        self.n_step = read("n_step")
        self.frames = read("frames")
        self.game = game
        self.count = 0
        self.count_maxpos = []

        raw_env = gym_super_mario_bros.make(game)
        if self.act_space == 7:
            self.env = JoypadSpace(raw_env, SIMPLE_MOVEMENT)
        elif self.act_space == 12:
            self.env = JoypadSpace(raw_env, COMPLEX_MOVEMENT)

        self.max_pos = -10000
        # Marking the episode as finished makes reset() initialise it.
        self.done = True
        self.reset()

    def step(self, a, a_logits, v_cur, state_in):
        """Advance the game by up to two frames using action ``a``.

        Args:
            a: Action index to apply.
            a_logits: Policy logits stored with the transition.
            v_cur: Value estimate stored with the transition.
            state_in: Recurrent state stored with the transition.

        Returns:
            The list of per-episode maximum positions if the previous
            episode had ended (and was therefore reset), otherwise ``None``.
        """
        reset_result = self.reset()
        self.count += 1
        self.a_t = a

        frame, reward, done, info = self.env.step(self.a_t)
        if not done:
            # Play a second frame with the same action and average the
            # two rewards.
            first_reward = reward
            frame, reward, done, info = self.env.step(self.a_t)
            reward += first_reward
            reward /= 2.
        reward /= 15.0

        frame = self.resize_image(frame)
        n_channels = frame.shape[-1]
        # Newest frame goes first; the oldest channels fall off the end.
        self.s_t = np.concatenate(
            [frame, self.s_t[:, :, :-n_channels]], axis=-1)

        # Only the latest transition is kept.
        self.s = [self.s_t]
        self.a = [self.a_t]
        self.a_logits = [a_logits]
        self.r = [reward]
        self.v_cur = [v_cur]
        self.state_in = [state_in]

        x_pos = info["x_pos"]
        self.max_pos = max(self.max_pos, x_pos)
        self.pos.append(x_pos)

        self.done = done
        # Mask is 0 on the terminal step and 1 otherwise.
        self.mask.append(0 if self.done else 1)
        return reset_result

    def reset(self):
        """Start a new episode if the current one has ended.

        Returns:
            ``self.count_maxpos`` when a reset happened, else ``None``.
        """
        if not self.done:
            return None

        self.count_maxpos.append(self.max_pos)
        # The first entry is the initial sentinel, so it is left out of the
        # printed statistics.
        print(self.game, self.max_pos, len(self.count_maxpos[1:]),
              np.mean(self.count_maxpos[1:]))

        self.count = 0
        first_frame = self.resize_image(self.env.reset())
        self.s_t = np.tile(first_frame, [1, 1, self.frames])
        self.s = [self.s_t]
        self.a_t = random.randint(0, self.act_space - 1)
        self.a = [self.a_t]
        self.a_logits = []
        self.r = [0]
        self.v_cur = []
        self.mask = [1]
        self.max_pos = -10000
        self.pos = []
        self.state_in = [np.zeros(self.state_size, dtype=np.float32)]
        self.done = False
        return self.count_maxpos

    def get_state(self):
        """Return the current stacked observation."""
        return self.s_t

    def get_act(self):
        """Return the most recently applied action."""
        return self.a_t

    def get_reward(self):
        """Return the most recently stored reward."""
        return self.r[-1]

    def get_max_pos(self):
        """Return the largest x position reached in the current episode."""
        return self.max_pos

    def get_state_in(self):
        """Return the most recently stored recurrent state."""
        return self.state_in[-1]

    @staticmethod
    def resize_image(image, size=84):
        """Convert a frame to a ``size`` x ``size`` grayscale uint8 array.

        A trailing channel axis is added to the result.
        """
        gray = Image.fromarray(image).convert("L")
        gray = gray.resize((size, size))
        pixels = np.array(gray, np.uint8)
        return pixels[:, :, None]
def main():
    """Run four evaluation episodes of the DQN agent on Super Mario Bros.

    Each episode starts with a random number of no-op steps (action 1),
    then the agent acts on a stack of the four most recent preprocessed
    frames until the episode ends. Progress is printed every 1000 steps
    and a summary is printed at the end of every episode.
    """
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    agent = DQNAgent(action_size=7)

    global_step = 0
    global_start = datetime.now()
    local_start = datetime.now()

    print()
    print("=" * 100)
    print("RL environment initialized")
    print("=" * 100)
    print()

    for e in range(1, 5):
        done = False
        step, score = 0, 0
        observe = env.reset()

        # Randomise the starting point with a few steps of action 1.
        for _ in range(random.randint(1, agent.no_op_steps)):
            observe, _, _, _ = env.step(1)

        # The initial history is the first frame repeated four times.
        state = agent.pre_processing(observe)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (1, 240, 256, 4))

        count_epsilon = 0
        count_greedy = 0

        while not done:
            env.render()
            global_step += 1
            step += 1

            # Select an action from the previous four states.
            action, res = agent.get_action(history)
            if res:
                count_epsilon += 1
            else:
                count_greedy += 1

            # Advance the environment one timestep with the chosen action.
            observe, reward, done, info = env.step(action)

            # Preprocess the new frame and push it to the front of the
            # history, dropping the oldest frame.
            next_state = agent.pre_processing(observe)
            next_state = np.reshape([next_state], (1, 240, 256, 1))
            next_history = np.append(next_state,
                                     history[:, :, :, :3],
                                     axis=3)

            agent.avg_q_max += np.amax(
                agent.model.predict(np.float32(history / 255.))[0])

            score += reward
            history = next_history

            if global_step % 1000 == 0:
                print("local step : {}, time : {} sec, epsilon : {}".format(
                    global_step, (datetime.now() - local_start).seconds,
                    agent.epsilon))
                local_start = datetime.now()

            if done:
                # One placeholder per argument: the previous format string
                # had five placeholders for six values, so epsilon was
                # printed as "step" and the average loss was dropped.
                print(
                    "episode : {}, score : {}, epsilon : {}, step : {}, "
                    "avg q : {}, avg loss : {}".format(
                        e, score, agent.epsilon, global_step,
                        agent.avg_q_max / float(step),
                        agent.avg_loss / float(step)))
                print("epsilon : {}, greedy : {}".format(
                    count_epsilon, count_greedy))
                print()
                print("time elapsed : {} sec".format(
                    (datetime.now() - global_start).seconds))
                global_start = datetime.now()
                print()
                print()

                # Reset the per-episode statistics.
                agent.avg_q_max, agent.avg_loss, global_step = 0, 0, 0
def load_model(self, name): self.actor.load_weights(name) def pre_processing(next_observe, observe): processed_observe = np.maximum(next_observe, observe) processed_observe = np.uint8( resize(rgb2gray(processed_observe), (240, 256), mode='constant') * 255) return processed_observe if __name__ == "__main__": # env = gym.make(env_name) env = gym_super_mario_bros.make('SuperMarioBros-v0') env = JoypadSpace(env, SIMPLE_MOVEMENT) agent = TestAgent(action_size=7) agent.load_model("a3c_actor.h5") step = 0 while episode < EPISODES: done = False dead = False score, start_life = 0, 5 observe = env.reset() next_observe = observe # for _ in range(random.randint(1, 20)): # observe = next_observe
def evaluate_folder(root_dir, file_name_list, folder_index, param_list, alpha,
                    model_name_list, num_evals):
    """Rebuild an agent from stored hyperparameters and evaluate it offline.

    The hyperparameter JSON ``param_list[folder_index]`` (relative to
    ``root_dir``) is loaded, the matching environment, networks and
    ``DQNAgent`` are constructed, and ``eval_offline`` is run on the model
    files named in ``model_name_list[folder_index]``.

    Args:
        root_dir: Directory containing the hyperparameter file; also passed
            to ``eval_offline`` as ``path_of_run``.
        file_name_list: Not used by the active code in this function.
        folder_index: Index into ``param_list`` and ``model_name_list``.
        param_list: Hyperparameter JSON file names relative to ``root_dir``.
        alpha: Forwarded to ``eval_offline``.
        model_name_list: Per-folder model file names; the entry at
            ``folder_index`` is forwarded to ``eval_offline``.
        num_evals: Forwarded to ``eval_offline``.

    Raises:
        NotImplementedError: If the configured environment is not one of
            the entries of ``envs`` handled below.
        ValueError: If the configured model name is unknown.
        KeyError: If a required hyperparameter or activation is missing.
    """
    # NOTE(review): ``inspect.currentframe()`` is handed to
    # ``setup_experiment_folder_writer`` below, which presumably reads this
    # function's local variables by name (see ``args_for_filename``) - keep
    # the local names unchanged; confirm against that helper.
    json_file_name = os.path.join(root_dir, param_list[folder_index])
    # Close the file deterministically instead of leaking the handle.
    with open(json_file_name) as json_file:
        json_str = json_file.read()
    hyperparam = json.loads(json_str)

    num_episodes = hyperparam['num_episodes']
    eval_cycle = hyperparam['eval_cycle']
    num_eval_episodes = hyperparam['num_eval_episodes']
    train_every_n_steps = hyperparam['train_every_n_steps']
    train_n_times = hyperparam['train_n_times']
    batch_size = hyperparam['batch_size']
    learning_rate = hyperparam['learning_rate']
    capacity = hyperparam['capacity']
    gamma = hyperparam['gamma']
    epsilon = hyperparam['epsilon']
    tau = hyperparam['tau']
    soft_update = hyperparam['soft_update']
    history_length = hyperparam['history_length']
    skip_frames = hyperparam['skip_frames']
    ddqn = hyperparam['ddqn']
    model = hyperparam['model']
    environment = hyperparam['environment']
    map = hyperparam['map']
    activation = hyperparam['activation']
    render_training = hyperparam['render_training']
    max_timesteps = hyperparam['max_timesteps']
    normalize_images = hyperparam['normalize_images']
    non_uniform_sampling = hyperparam['non_uniform_sampling']
    n_step_reward = hyperparam['n_step_reward']
    mu_intrinsic = hyperparam['mu_intrinsic']
    beta_intrinsic = hyperparam['beta_intrinsic']
    lambda_intrinsic = hyperparam['lambda_intrinsic']
    intrinsic = hyperparam['intrinsic']
    residual_icm_forward = hyperparam['residual_icm_forward']
    use_history_in_icm = hyperparam['use_history_in_icm']
    extrinsic = hyperparam['extrinsic']
    update_q_target = hyperparam['update_q_target']
    epsilon_schedule = hyperparam['epsilon_schedule']
    epsilon_start = hyperparam['epsilon_start']
    epsilon_end = hyperparam['epsilon_end']
    epsilon_decay = hyperparam['epsilon_decay']
    virtual_display = hyperparam['virtual_display']
    seed = hyperparam['seed']
    pre_intrinsic = hyperparam['pre_intrinsic']
    experience_replay = hyperparam['experience_replay']
    prio_er_alpha = hyperparam['prio_er_alpha']
    prio_er_beta_start = hyperparam['prio_er_beta_start']
    prio_er_beta_end = hyperparam['prio_er_beta_end']
    prio_er_beta_decay = hyperparam['prio_er_beta_decay']
    init_prio = hyperparam['init_prio']
    fixed_encoder = hyperparam['fixed_encoder']
    duelling = hyperparam['duelling']
    iqn = hyperparam['iqn']
    iqn_n = hyperparam['iqn_n']
    iqn_np = hyperparam['iqn_np']
    iqn_k = hyperparam['iqn_k']
    iqn_tau_embed_dim = hyperparam['iqn_tau_embed_dim']
    iqn_det_max_train = hyperparam['iqn_det_max_train']
    iqn_det_max_act = hyperparam['iqn_det_max_act']
    huber_kappa = hyperparam['huber_kappa']
    state_height = hyperparam['state_height']
    state_width = hyperparam['state_width']
    number_model_files = hyperparam['number_model_files']
    simple_coverage_threshold = hyperparam['simple_coverage_threshold']
    geometric_coverage_gamma = hyperparam['geometric_coverage_gamma']
    num_total_steps = hyperparam['num_total_steps']
    store_cycle = hyperparam['store_cycle']
    adam_epsilon = hyperparam['adam_epsilon']
    # Optional key: defaults to False when absent from the file.
    gradient_clip = hyperparam.get('gradient_clip', False)

    # Set seed
    torch.manual_seed(seed)

    # Create experiment directory with run configuration
    args_for_filename = [
        'environment', 'map', 'extrinsic', 'intrinsic', 'fixed_encoder',
        'ddqn', 'duelling', 'iqn', 'experience_replay', 'soft_update',
        'n_step_reward', 'seed'
    ]

    if environment == envs[0]:
        from vizdoom_env.vizdoom_env import DoomEnv
        env = DoomEnv(map_name=map, render=render_training)
        writer = setup_experiment_folder_writer(
            inspect.currentframe(),
            name='Vizdoom',
            log_dir='vizdoom_eval',
            args_for_filename=args_for_filename,
            additional_param=hyperparam)
        # placeholder for non uniform action probabilities. change to
        # something sensible if wanted.
        nu_action_probs = np.ones(env.action_space.n,
                                  dtype=np.float32) / env.action_space.n
    else:
        if virtual_display:
            if render_training:
                print(
                    """On the tfpool computers this will probably not work together. Better deactivate render_training when using the virtual display."""
                )
            from pyvirtualdisplay import Display
            display = Display(visible=0, size=(224, 240))
            display.start()

        if environment == envs[1]:
            from nes_py.wrappers import JoypadSpace
            import gym_super_mario_bros
            from gym_super_mario_bros.actions import COMPLEX_MOVEMENT
            # env = retro.make(game='SuperMarioBros-Nes')
            env = gym_super_mario_bros.make('SuperMarioBros-v0').unwrapped
            env = JoypadSpace(env, COMPLEX_MOVEMENT)
            writer = setup_experiment_folder_writer(
                inspect.currentframe(),
                name='Mario',
                log_dir='mario',
                args_for_filename=args_for_filename)
            nu_action_probs = np.ones(env.action_space.n,
                                      dtype=np.float32) / env.action_space.n
        elif environment == envs[2]:
            import gym_minigrid
            from src.train_gridworld import ClassicalGridworldWrapper
            grid_size = 100
            env = gym_minigrid.envs.EmptyEnv(size=grid_size)
            env = ClassicalGridworldWrapper(env)
            writer = setup_experiment_folder_writer(
                inspect.currentframe(),
                name='GridWorld',
                log_dir='gridworld',
                args_for_filename=args_for_filename)
            nu_action_probs = np.ones(env.action_space.n,
                                      dtype=np.float32) / env.action_space.n
        elif environment == envs[3]:
            import gym
            env = gym.make('Pong-v0')
            writer = setup_experiment_folder_writer(
                inspect.currentframe(),
                name='Pong',
                log_dir='pong',
                args_for_filename=args_for_filename)
            nu_action_probs = np.ones(env.action_space.n,
                                      dtype=np.float32) / env.action_space.n
        else:
            raise NotImplementedError()

    num_actions = env.action_space.n
    channels = 1  # greyscale images
    # not taking history_length into account. handled later.
    state_dim = (channels, state_height, state_width)

    # Define Q network, target network and DQN agent
    if model == 'Resnet':
        CNN = ResnetVariant
    elif model == 'Lenet':
        CNN = LeNetVariant
    elif model == 'DeepQNetwork':
        CNN = DeepQNetwork
    else:
        raise ValueError('{} not implemented'.format(model))

    # Map the configured activation name to its torch module class.
    activation = {
        'ReLU': torch.nn.ReLU,
        'ELU': torch.nn.ELU,
        'LeakyReLU': torch.nn.LeakyReLU
    }[activation]

    Q_net = CNN(in_dim=state_dim,
                num_actions=num_actions,
                history_length=history_length,
                duelling=duelling,
                iqn=iqn,
                activation=activation,
                embedding_dim=iqn_tau_embed_dim).to(device)
    Q_target_net = CNN(in_dim=state_dim,
                       num_actions=num_actions,
                       history_length=history_length,
                       duelling=duelling,
                       iqn=iqn,
                       activation=activation,
                       embedding_dim=iqn_tau_embed_dim).to(device)

    state_encoder = Encoder(in_dim=state_dim,
                            history_length=history_length,
                            use_history=use_history_in_icm).to(device)
    # push a dummy input through state_encoder to get output dimension which
    # is needed to build dynamics models.
    tmp_inp = torch.zeros(
        size=(1, channels * (history_length if use_history_in_icm else 1),
              state_height, state_width))
    tmp_out = state_encoder(tmp_inp.to(device))
    inverse_dynamics_model = InverseModel(num_actions=num_actions,
                                          input_dim=2 *
                                          tmp_out.shape[1]).to(device)
    forward_dynamics_model = ForwardModel(
        num_actions=num_actions, state_dim=tmp_out.shape[1]).to(device)
    intrinsic_reward_network = IntrinsicRewardGenerator(
        state_encoder=state_encoder,
        inverse_dynamics_model=inverse_dynamics_model,
        forward_dynamics_model=forward_dynamics_model,
        num_actions=num_actions,
        fixed_encoder=fixed_encoder,
        residual_forward=residual_icm_forward,
        use_history=use_history_in_icm)

    agent = DQNAgent(Q=Q_net,
                     Q_target=Q_target_net,
                     intrinsic_reward_generator=intrinsic_reward_network,
                     num_actions=num_actions,
                     gamma=gamma,
                     batch_size=batch_size,
                     tau=tau,
                     epsilon=epsilon,
                     capacity=capacity,
                     train_every_n_steps=train_every_n_steps,
                     history_length=history_length,
                     soft_update=soft_update,
                     ddqn=ddqn,
                     n_step_reward=n_step_reward,
                     train_n_times=train_n_times,
                     non_uniform_sampling=non_uniform_sampling,
                     epsilon_schedule=epsilon_schedule,
                     mu=mu_intrinsic,
                     beta=beta_intrinsic,
                     update_q_target=update_q_target,
                     lambda_intrinsic=lambda_intrinsic,
                     intrinsic=intrinsic,
                     epsilon_start=epsilon_start,
                     epsilon_end=epsilon_end,
                     lr=learning_rate,
                     epsilon_decay=epsilon_decay,
                     extrinsic=extrinsic,
                     pre_intrinsic=pre_intrinsic,
                     experience_replay=experience_replay,
                     prio_er_alpha=prio_er_alpha,
                     huber_kappa=huber_kappa,
                     prio_er_beta_start=prio_er_beta_start,
                     prio_er_beta_end=prio_er_beta_end,
                     init_prio=init_prio,
                     prio_er_beta_decay=prio_er_beta_decay,
                     state_dim=state_dim,
                     iqn=iqn,
                     iqn_n=iqn_n,
                     iqn_np=iqn_np,
                     iqn_k=iqn_k,
                     iqn_det_max_train=iqn_det_max_train,
                     iqn_det_max_act=iqn_det_max_act,
                     nu_action_probs=nu_action_probs,
                     adam_epsilon=adam_epsilon,
                     gradient_clip=gradient_clip)

    try:
        eval_offline(env=env,
                     agent=agent,
                     writer=writer,
                     num_episodes=num_episodes,
                     eval_cycle=eval_cycle,
                     num_eval_episodes=num_eval_episodes,
                     soft_update=soft_update,
                     skip_frames=skip_frames,
                     history_length=history_length,
                     rendering=render_training,
                     max_timesteps=max_timesteps,
                     normalize_images=normalize_images,
                     state_dim=state_dim,
                     init_prio=init_prio,
                     num_model_files=number_model_files,
                     simple_coverage_threshold=simple_coverage_threshold,
                     geometric_coverage_gamma=geometric_coverage_gamma,
                     num_total_steps=num_total_steps,
                     store_cycle=store_cycle,
                     model_name_list=model_name_list[folder_index],
                     alpha=alpha,
                     num_evals=num_evals,
                     path_of_run=root_dir)
    finally:
        # Close the writer even when the evaluation raises.
        writer.close()
import numpy as np import time import random from tqdm import tqdm import cv2 import matplotlib.pyplot as plt from nes_py.wrappers import JoypadSpace import gym_tetris from gym_tetris.actions import MOVEMENT env = gym_tetris.make('TetrisA-v3') env = JoypadSpace(env, MOVEMENT) num_bins = 12 MOVELEFT = 6 MOVERIGHT = 3 MOVEDOWN = 9 RENDER = True P = { 0: { # I 0: [(0,0), (1,0), (2,0), (3,0)], 90: [(0,0), (0,1), (0,2), (0,3)], 180: [(3,0), (2,0), (1,0), (0,0)], 270: [(1,3), (1,2), (1,1), (1,0)], }, 1: { # T 0: [(1,0), (0,1), (1,1), (2,1)], 90: [(0,1), (1,2), (1,1), (1,0)], 180: [(1,1), (2,0), (1,0), (0,0)], 270: [(1,1), (0,0), (0,1), (0,2)],
action='store_true', help='Store the model') parser.add_argument("--save_freq", default=5e4, type=int, help="How often the models' weights are saved") args = parser.parse_args() # Create a store path for results and debug_summaries save_time = dt.datetime.now().strftime("%Y%m%d-%H%M%S") reward_writer = tf.summary.create_file_writer('./logs/' + save_time) # Initialise the environment env = gym_super_mario_bros.make(args.env) env = JoypadSpace(env, RIGHT_ONLY) env = wrapper(env) num_actions = env.action_space.n observation_space = args.frame_size num_frames = 4 # Initialise the agent kwargs = { "observation_space": observation_space, "num_actions": num_actions, "num_frames": num_frames, "delay_timesteps": args.delay_timesteps, "beta_decay_iter": args.beta_decay, "min_epsilon": args.min_epsilon, "epsilon_decay_iter": args.epsilon_decay,
# Run-level flags and counters.
showEnviornment = False
episodeNum = 0

# if gpu is to be used
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
print(device)
Tensor = torch.Tensor
LongTensor = torch.LongTensor

# Mario environment restricted to the SIMPLE_MOVEMENT action set.
env = gym_super_mario_bros.make('SuperMarioBros-v0')
#env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
env = JoypadSpace(env, SIMPLE_MOVEMENT)

# Record a video only for episodes whose id is a multiple of 5000.
#directory = './MarioVideos/'
directory = './MarioVideosLong/'
env = gym.wrappers.Monitor(
    env, directory, video_callable=lambda episode_id: episode_id % 5000 == 0)

# Seed the environment, torch and Python's random with the same value.
seed_value = 23
env.seed(seed_value)
torch.manual_seed(seed_value)
random.seed(seed_value)

###### PARAMS ######
learning_rate = 0.0001
#num_episodes = 5000
num_episodes = 9999999999
def run_agent(agent, rendering=False, monitoring=False, print_reward=False):
    """Play one Super Mario Bros run with ``agent`` and return its reward.

    The agent is called on a stack of four converted frames and must return
    action probabilities from which the action is sampled. The run ends
    after 10000 steps, when the environment reports ``done``, when Mario
    loses a life (``info["life"] < 2``), or when his x position has not
    changed for 100 consecutive steps (which also costs 100 reward).

    Args:
        agent: Torch module mapping the frame stack to action
            probabilities; it is switched to eval mode.
        rendering: Render the environment after every step.
        monitoring: Wrap the environment in ``Monitor`` writing to
            ``./video``.
        print_reward: Print the accumulated reward before returning.

    Returns:
        The accumulated reward of the run.
    """
    env = gym_super_mario_bros.make("SuperMarioBros-v0")
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    env.seed(42)
    if monitoring:
        env = Monitor(env, './video', force=True)

    global_reward = 0
    try:
        agent.eval()
        state = env.reset()
        if rendering:
            env.render()

        # Conv2d input: frames are kept unflattened.
        state = convert_image(state)
        state_list = [state, state, state, state]

        position = -1
        stuck = 0  # consecutive steps without any change in x position

        for _ in range(10000):
            # Conv2d input
            net_input = torch.from_numpy(np.array(state_list)) \
                .type('torch.FloatTensor').unsqueeze(0)
            # Linear input
            # net_input = torch.tensor(state_list).type("torch.FloatTensor").view(1,-1)
            output_probabilities = agent(net_input).detach().numpy()[0]
            action = np.random.choice(range(action_count), 1,
                                      p=output_probabilities).item()

            new_state, reward, done, info = env.step(action)
            global_reward += reward
            if rendering:
                env.render()

            # NOTE(review): pop() removes the NEWEST frame, so only the last
            # slot is ever replaced and the first three keep the initial
            # frame. A rolling stack would use pop(0). Left unchanged
            # because agents evaluated with this function may depend on it -
            # confirm the intent.
            state_list.pop()
            state_list.append(convert_image(new_state))

            # if mario gets stuck, it gets punished and the loop gets broken
            if position == info["x_pos"]:
                stuck += 1
                if stuck == 100:
                    global_reward -= 100
                    break
            else:
                stuck = 0
            position = info["x_pos"]

            # Mario died, or the environment finished; a finished
            # environment must not be stepped again.
            if info["life"] < 2 or done:
                break
    finally:
        # A fresh emulator is created on every call, so release it here.
        env.close()

    if print_reward:
        print(global_reward)
    return global_reward
argument_parser.add_argument("-b", "--batch-size", type=int, default=64)
argument_parser.add_argument("-l", "--length", type=int, default=None)
argument_parser.add_argument("--enable-cuda", action="store_true")
args = argument_parser.parse_args()

# Use CUDA only when it was requested AND is actually available.
if args.enable_cuda:
    # ``torch.cuda.is_available`` is the real API; the previous
    # ``torch.cuda_is_available`` raised AttributeError as soon as
    # --enable-cuda was passed.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
        warnings.warn("cuda is not available. Defaulting to cpu")
else:
    device = "cpu"

# Mario environment with the SIMPLE_MOVEMENT action set and the project's
# MarioEnv wrapper.
env = gym_super_mario_bros.make("SuperMarioBros-v0")
env = JoypadSpace(env, SIMPLE_MOVEMENT)
env = MarioEnv(env)

agent = DQN("cnn", env, replay_size=100000, epsilon_decay=100000)
trainer = AdversariaTrainer(
    agent=agent,
    env=env,
    dataset=args.input_path,
    possible_actions=SIMPLE_MOVEMENT,
    device=device,
    length=args.length,
    off_policy=True,
    evaluate_episodes=1,
)

# Train, then run a rendered evaluation.
trainer.train(epochs=args.epochs, lr=args.lr, batch_size=args.batch_size)
trainer.evaluate(render=True)
from nes_py.wrappers import JoypadSpace
import gym
from Contra.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT, RIGHT_ONLY

# Contra environment restricted to the RIGHT_ONLY action set.
env = gym.make('Contra-v0')
env = JoypadSpace(env, RIGHT_ONLY)
print("actions", env.action_space)
print("observation_space ", env.observation_space.shape[0])

done = False
a = env.reset()
print("a ", a)

# Play random actions for at most 5000 steps, stopping once the episode
# reports done.
for step in range(5000):
    if done:
        print("Over")
        break
    state, reward, done, info = env.step(env.action_space.sample())
    # print("state ", state)
    # print("reward ", reward)
    # print("Done ", done)
    print("score ", info['score'])
    env.render()

env.close()
import gym
import gym_super_mario_bros
from gym.wrappers import FrameStack, GrayScaleObservation, TransformObservation
from nes_py.wrappers import JoypadSpace
from metrics import MetricLogger
from agent import Mario
from wrappers import ResizeObservation, SkipFrame

# Initialize Super Mario environment
env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')

# Limit the action-space to
#   0. walk right
#   1. jump right
env = JoypadSpace(env, [['right'], ['right', 'A']])

# Apply Wrappers to environment: frame skipping, grayscale conversion,
# resize to 84, scaling by 1/255 and stacking of 4 frames.
env = SkipFrame(env, skip=4)
env = GrayScaleObservation(env, keep_dim=False)
env = ResizeObservation(env, shape=84)
env = TransformObservation(env, f=lambda x: x / 255.)
env = FrameStack(env, num_stack=4)
env.reset()

# Timestamped checkpoint directory; mkdir raises if it already exists.
# NOTE(review): ``Path`` and ``datetime`` are not imported in the visible
# part of the file - confirm they are imported above this point.
save_dir = Path('checkpoints') / datetime.datetime.now().strftime(
    '%Y-%m-%dT%H-%M-%S')
save_dir.mkdir(parents=True)

# No checkpoint is loaded; the commented path shows the expected form.
checkpoint = None  # Path('checkpoints/2020-10-21T18-25-27/mario.chkpt')
LEARNING_STARTS = 50000 LEARNING_FREQ = 4 TARGER_UPDATE_FREQ = 10000 LEARNING_RATE = 0.00075 ALPHA = 0.95 ALPHA_P = 0.6 EPS = 0.01 env = gym_super_mario_bros.make('SuperMarioBros-1-1-v1') env.seed(SEED) torch.manual_seed(SEED) np.random.seed(SEED) random.seed(SEED) env = wrap_deepmind(env) env = JoypadSpace(env, COMPLEX_MOVEMENT) expt_dir = 'Game_video' env = wrappers.Monitor(env, expt_dir, force=True, video_callable=False) optimizer_spec = OptimizerSpec( constructor=optim.RMSprop, kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS), ) exploration_schedule = LinearSchedule(2000000, 0.05, 0.05) annelation_schedule = LinearSchedule(2000000, 1.0, 0.4) dqn_learn( env=env, q_func=DQN, optimizer_spec=optimizer_spec,
from nes_py.wrappers import JoypadSpace
import gym_tetris
from gym_tetris.actions import MOVEMENT

# Tetris environment using the MOVEMENT action set.
env = gym_tetris.make('TetrisA-v0')
env = JoypadSpace(env, MOVEMENT)

# Starting with done=True forces a reset before the first step; the
# environment is reset again whenever an episode ends.
done = True
for step in range(5000):
    if done:
        state = env.reset()
    # Take a uniformly sampled random action and draw the frame.
    state, reward, done, info = env.step(env.action_space.sample())
    env.render()

env.close()
# figsize = (15., 5. * len(keys)) # f, axarr = plt.subplots(len(keys), sharex=True, figsize=figsize) # for idx, key in enumerate(keys): # axarr[idx].plot(episodes, data[key]) # axarr[idx].set_ylabel(key) # plt.xlabel('episodes') # plt.tight_layout() # if output is None: # plt.show() # else: # plt.savefig(output) ENV_NAME = 'CustomContra-v2' # Get the environment and extract the number of actions. env = gym.make(ENV_NAME) env = JoypadSpace(env, CUSTOM_MOVEMENT) np.random.seed(123) env.seed(123) nb_actions = env.action_space.n print(nb_actions) print(env.observation_space.shape) obs_dim = env.observation_space.shape[0] # Next, we build a very simple model. model = Sequential() model.add(Flatten(input_shape=(1, ) + env.observation_space.shape)) model.add(Dense(16)) model.add(Activation('relu')) model.add(Dense(16)) model.add(Activation('relu')) model.add(Dense(16))
len(MOVEMENT)])), axis=0) highest_objective = info['objective'] else: state, reward, done, info = env.step(0) # env.render() def get_discrete_state(state): x_i = state[0] // 16 y_i = (state[1] - 61) // 16 return tuple((x_i, y_i)) env = gym_zelda_1.make('Zelda1-v0') env = JoypadSpace(env, MOVEMENT) # Create models folder if not os.path.isdir('Q_tables'): os.makedirs('Q_tables') # The area where Link can be is approximately 255*175 pixels (x:0-255, y:64-239). # If we divide these dimensions by 16, we get a (16, 11) matrix. This matrix will represent each discrete position Link can be in, # and for each of these discrete positions, he can perform len(MOVEMENT) distinct actions. Therefore, the Q matrix will have the dimensions [16,11,len(MOVEMENT)]. Q = np.random.uniform(low=-15, high=15, size=([1, 16, 11, len(MOVEMENT)])) start_in_level_1 = 0 state = env.reset() LEARNING_RATE = 0.1 DISCOUNT = 0.95
def test_env(env, model, device, deterministic=True):
    """Play one evaluation episode of Super Mario Bros and print the result.

    A fresh, fully wrapped environment is created inside the function, so
    the ``env`` argument is ignored. Actions are sampled from the softmax
    of the model's first output. The episode runs for at most 2000 steps
    and stops early when the environment reports ``done``. The total reward
    and the largest x position reached are printed.

    Args:
        env: Ignored; kept for interface compatibility.
        model: Callable returning ``(logits, _)`` for a state tensor.
        device: Torch device the state tensor is moved to.
        deterministic: Currently unused.
    """
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    env = RewardScalar(env)
    env = WarpFrame(env)
    env = FrameStack(env, 4)
    env = StochasticFrameSkip(env, 4, 0.5)
    env = ScaledFloatFrame(env)
    # env=gym.wrappers.Monitor(env, 'recording/PPORB5/{}'.format(str(num)), video_callable=lambda episode_id: True, force=True)

    total_reward = 0
    distance = []
    try:
        state = env.reset()
        done = False
        print("yes")
        for i in range(2000):
            state = torch.FloatTensor(state).to(device)
            state = state.float()
            # Move the last axis of the observation to the front.
            state = state.permute(3, 0, 1, 2)

            logits, _ = model(state)
            policy = Categorical(F.softmax(logits, dim=-1).data.cpu())
            action = policy.sample().numpy()

            next_state, reward, done, info = env.step(action[0])
            distance.append(info['x_pos'])
            state = next_state
            total_reward += reward
            env.render()

            # Stop at the end of the episode: a finished environment must
            # be reset before it can be stepped again.
            if done:
                break
    finally:
        # The environment is created locally, so release it here.
        env.close()

    print(total_reward)
    print(max(distance))
class Game:
    """Frame-skipping, frame-stacking Super Mario Bros environment.

    Each :meth:`step` repeats the action for ``skip_frame`` emulator frames,
    takes the pixel-wise maximum of the processed frames recorded during
    the skip and pushes it onto a stack of four observations. Losing a life
    ends the episode.
    """

    def __init__(self, game_id, obs_size, skip_frame=4, mode='train'):
        """Create the emulator and the observation buffers.

        Args:
            game_id: Environment id passed to ``gym_super_mario_bros.make``.
            obs_size: Side length of the square processed observation.
            skip_frame: Number of emulator frames per :meth:`step`.
            mode: ``'play'`` additionally records raw frames to a
                ``Monitor``.
        """
        self.game_id = game_id
        self.obs_size = obs_size
        env = gym_super_mario_bros.make(game_id)
        temp_obs = env.reset()
        height, width, _ = temp_obs.shape
        self.env = JoypadSpace(env, COMPLEX_MOVEMENT)

        self.obs_last2max = np.zeros((2, obs_size, obs_size, 1), np.uint8)
        self.obstack = np.zeros((obs_size, obs_size, 4))
        self.rewards = []
        self.lives = 2
        self.skip = skip_frame
        self.mode = mode
        if self.mode == 'play':
            self.monitor = Monitor(width=width, height=height)

    def step(self, action, monitor=False):
        """Repeat ``action`` for ``self.skip`` frames.

        Args:
            action: Index into ``COMPLEX_MOVEMENT``.
            monitor: Unused.

        Returns:
            Tuple ``(obstack, reward, done, episode_info)``. ``reward`` is
            the sum of the per-frame rewards divided by 15, and
            ``episode_info`` is a dict with the episode's total ``reward``
            and ``length`` when the episode ended, else ``None``.
        """
        print(self.lives)
        reward = 0.0
        done = False
        for i in range(self.skip):
            obs, r, done, info = self.env.step(action)
            if self.mode == 'play':
                print('Take Action: \t', COMPLEX_MOVEMENT[action])
                self.monitor.record(obs)
            # Only frames from index 2 onwards are kept for the max.
            if i >= 2:
                self.obs_last2max[i % 2] = self._process_obs(
                    obs, self.obs_size)
            # super mario's reward is clipped in [-15.0, 15.0]
            reward += r / 15.0

            # Treat the loss of a life as the end of the episode.
            lives = info['life']
            if lives < self.lives:
                print(lives, self.lives)
                done = True
                print(done)
            self.lives = lives
            if done:
                break

        self.rewards.append(reward)
        if done:
            episode_info = {
                "reward": sum(self.rewards),
                "length": len(self.rewards)
            }
            self.reset()
        else:
            episode_info = None

        obs = self.obs_last2max.max(axis=0)
        # Shift the stack by one channel and write the newest observation
        # into the last slot.
        self.obstack = np.roll(self.obstack, shift=-1, axis=-1)
        self.obstack[..., -1:] = obs
        return self.obstack, reward, done, episode_info

    def reset(self):
        """Reset the emulator and fill the stack with the first frame.

        Returns:
            The observation stack.
        """
        obs = self._process_obs(self.env.reset(), self.obs_size)
        # Broadcast the single-channel frame into all four slots.
        self.obstack[...] = obs
        self.rewards = []
        self.lives = 2
        return self.obstack

    @staticmethod
    def _process_obs(obs, size=84):
        """Convert an RGB frame to a ``size`` x ``size`` grayscale array.

        The size was previously fixed at 84, which made every ``obs_size``
        other than 84 fail when the frame was written into the buffers.
        A trailing channel axis is added to the result.
        """
        obs = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
        obs = cv2.resize(obs, (size, size), interpolation=cv2.INTER_AREA)
        return obs[:, :, None]
# Script fragment: builds the TensorFlow session, the Mario environment and
# the GPT2 curiosity actor-critic model.  `tf`, `gym_super_mario_bros`,
# `JoypadSpace` and `COMPLEX_MOVEMENT` are expected to be imported earlier in
# the file (not visible here).
from ann_utils.agents.curiosity_driven_agent import Curiosity_AC_GPT_Agent
from ann_utils.models.specialists.vae_transformer_model import GPT2_Curiosity_AC

# auxiliary helpers
from ann_utils.manager import tf_global_initializer, tf_load, tf_save
from ann_utils.sess import TfSess
from tqdm import tqdm

# Session configuration: no device-placement logging, GPU memory growth
# enabled, per-process GPU memory fraction set to 0.9.
config = tf.ConfigProto(log_device_placement=False)
config.gpu_options.allow_growth = True
config.gpu_options.per_process_gpu_memory_fraction = 0.9
sess = TfSess("mario_ac_transformer", gpu=True, config=config)

# Environment restricted to the COMPLEX_MOVEMENT action list.
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, COMPLEX_MOVEMENT)

# Probe the environment once to learn the observation shape and action count.
state_info = env.reset()
action_info = env.action_space.sample()  # not used again in the visible code
action_size = env.action_space.n
chp = "./saved/mario/"  # presumably the checkpoint directory -- confirm against its users

# Despite the "len" label, the first print shows the observation *shape*.
print('states len {}'.format(state_info.shape))
print('actions len {}'.format(action_size))

# Hyper-parameters; their consumers are outside this fragment.
size = [192, 192, 3]
bs = 8
state_size = 256
sequence_size = 6

# NOTE(review): the meaning of the positional arguments is not visible here;
# the literal 256 equals `state_size` but is not taken from it -- confirm.
model = GPT2_Curiosity_AC(action_size, 256, 128, 3, 4)
# Script fragment: the leading `]` closes a list literal that starts outside
# this view.  `model`, `bs`, `i_s`, `s_s`, `act_s`, `dvc`, `sess`,
# `num_worker` and `global_writer` are defined elsewhere in the file.
]

# Global agent, built under the name 'mario_global'.
# NOTE(review): the meaning of the positional boolean flags passed to
# build_agent_brain is not visible here -- confirm against its definition.
global_agent = Curiosity_AC_Context_Agent(model, 1000, bs, 100000).build_agent_brain(
    i_s, s_s, act_s, dvc[0], dvc[2], sess, True, False, True, 'mario_global', False, None)

# One local worker agent per worker index; each one is handed the global
# agent's model variables as its last argument.
workers = [
    Curiosity_AC_Context_Agent(model, 1000, bs, 100000).build_agent_brain(
        i_s, s_s, act_s, dvc[0], dvc[2], sess, False, False, False,
        'mario_local_{}'.format(w), True, global_agent.model.variables)
    for w in range(num_worker)
]

# The global environment uses the fixed-stage game, while every worker gets
# its own random-stages environment; all use the SIMPLE_MOVEMENT action list.
genv = JoypadSpace(gym_super_mario_bros.make('SuperMarioBros-v0'), SIMPLE_MOVEMENT)
envs = [
    JoypadSpace(gym_super_mario_bros.make('SuperMarioBrosRandomStages-v0'), SIMPLE_MOVEMENT)
    for x in range(num_worker)
]

# `bs` is passed as the number of local steps.
agents_controller = A3C(genv, envs, global_agent, workers, global_writer, num_local_steps=bs)

# Initialise variables and export the session graph through the writer.
tf_global_initializer(sess)
global_writer.add_graph(sess.get_session().graph)
def __init__(self, config):
    """Configure TensorFlow, create the output directories and build the wrapped environments.

    Args:
        config: Mapping read with the keys ``'Number of worker threads'``,
            ``'Environment Name'``, ``'Wrapper class'`` and ``'Grayscale'``.
    """
    self._config = config

    def get_available_gpus():
        # Names of all local devices whose type is 'GPU'.
        # Not called anywhere in the visible part of this method.
        local_device_protos = device_lib.list_local_devices()
        return [
            x.name for x in local_device_protos if x.device_type == 'GPU'
        ]

    print("GPU Available: ", tf.test.is_gpu_available())

    # GPU configuration
    gpus = tf.config.experimental.list_physical_devices('GPU')
    tf.config.threading.set_inter_op_parallelism_threads(0)
    tf.config.threading.set_intra_op_parallelism_threads(0)
    if gpus:
        try:
            # Currently, memory growth needs to be the same across GPUs
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as e:
            print(e)

    # One environment name per worker thread.
    env_names = (
        [self._config['Environment Name']]
        * self._config['Number of worker threads']
    )

    # Configuration.  Paths are built with os.path.join instead of literals
    # such as '.\Model', which contain invalid escape sequences and only work
    # as directory paths on Windows.
    self._model_save_path = os.path.join('.', 'Model')
    self._video_save_path = os.path.join('.', 'Videos')
    stats_path = os.path.join('.', 'stats')
    self.record = True

    # Create the output directories up front so they exist before the Monitor
    # wrappers below are constructed with `savePath`.
    for directory in (self._video_save_path, self._model_save_path, stats_path):
        os.makedirs(directory, exist_ok=True)

    # Make the super mario gym environments and apply wrappers
    self._envs = []
    collector = Collector()
    collector.set_dimensions(
        ["CMA", "EMA", "SMA", "LENGTH", "LOSS", 'TOTAL_EPISODE_REWARDS'])
    self._plot = AsynchronousPlot(collector, live=False)

    # Apply env wrappers
    for env_name in env_names:
        env = gym.make(env_name)
        if env_name == 'SuperMarioBros-v0':
            env = JoypadSpace(env, COMPLEX_MOVEMENT)
        env = Stats(env, collector)
        # Load the optional user-configured wrapper class
        if self._config['Wrapper class'] != '':
            env = env_wrapper_import(self._config['Wrapper class'], env)
        env = Monitor(env,
                      env.observation_space.shape,
                      savePath=self._video_save_path,
                      record=self.record)
        env = preprocess.GrayScaleImage(
            env, height=84, width=84, grayscale=self._config['Grayscale'])
        env = preprocess.FrameStack(env, 4)
        self._envs.append(env)

    # Observation shape is read from the outermost wrapper; the action space
    # is read from the environment it wraps.
    self.NUM_STATE = self._envs[0].observation_space.shape
    self.NUM_ACTIONS = self._envs[0].env.action_space.n
    self.ACTION_SPACE = self._envs[0].env.action_space
# Fragment: the four methods below belong to a class whose header (and its
# `_force` method) are outside this view.
def __len__(self):
    """Return the length of the array produced by ``self._force()``."""
    return len(self._force())

def __getitem__(self, i):
    """Index into the array produced by ``self._force()``."""
    return self._force()[i]

def count(self):
    """Return the size of the last axis of the forced array."""
    frames = self._force()
    return frames.shape[frames.ndim - 1]

def frame(self, i):
    """Return slice ``i`` along the last axis of the forced array."""
    return self._force()[..., i]

# Build the World 1-1 environment and apply the wrapper chain in this order.
env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
env = JoypadSpace(env, COMPLEX_MOVEMENT)
env = EpisodicLifeEnv(env)
env = RewardScaler(env)
env = PreprocessFrame(env)
env = StochasticFrameSkip(env, 4, 0.5)
env = ScaledFloatFrame(env)
env = FrameStack(env, 4)

# NOTE(review): this definition is truncated in the visible source; only the
# "Stochastic" branch is shown and no return statement is visible.  The
# default value "deterministic" is lower-case while the branch tests
# "Stochastic" -- confirm the intended spelling with the callers.
def get_action(state, actions_type="deterministic"):
    if actions_type == "Stochastic":
        # Take the model output for the first batch element, pick the two
        # highest-scoring action indices and choose one of them at random.
        action_probability_distribution = (training_model.predict(state))[0]
        top_actions = action_probability_distribution.argsort()[-2:][::-1]
        action = random.choice(top_actions)
import numpy as np
from pathlib import Path
from collections import deque
import random, os, copy, datetime
import gym
from gym.spaces import Box
from gym.wrappers import FrameStack
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

# Smoke test: create World 1-1, restrict the action set to two button
# combinations ("right" and "right" + "A"), take a single step with action 0
# and print what the environment returned.
env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")
env = JoypadSpace(env, [["right"], ["right", "A"]])
env.reset()
next_state, reward, done, info = env.step(action=0)
print(f"{next_state.shape}, \n {reward }, \n {done}, \n {info}")


class SkipFrame(gym.Wrapper):
    """Wrapper that repeats an action for ``skip`` consecutive environment steps.

    NOTE(review): the class is truncated in the visible source; the rest of
    ``step`` (reward accumulation, early exit, return value) is not shown.
    """

    def __init__(self, env, skip):
        """Store the number of steps each action is repeated for."""
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        total_reward = 0
        done = False
        # Apply the same action once per skipped frame.
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
from model import generate_complex_model
import tensorflow as tf
import os
import numpy as np
import time
import random
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
from auto_everything.base import IO

io = IO()

# Environment restricted to the SIMPLE_MOVEMENT action list.
env = gym_super_mario_bros.make('SuperMarioBros-v2')
env = JoypadSpace(env, SIMPLE_MOVEMENT)
env.reset()

model_file_path = './nn_model.HDF5'
final_model_file_path = './final_nn_model.HDF5'

# Resume from a previously saved model when one exists, otherwise build a new one.
if os.path.exists(model_file_path):
    model = tf.keras.models.load_model(model_file_path)
else:
    model = generate_complex_model()


# NOTE(review): this definition is truncated in the visible source; the
# `train_on_batch` call is cut off after its first input entry.
def train_once(last_state, history_actions, history_x_pos, history_y_pos, action, reward):
    global model
    model.train_on_batch(
        x={
            # A leading axis of size 1 is added to `action`.
            'action': np.expand_dims(action, axis=0),
# super mairo 환경을 만들기 위해서는 꼭 gym_super_mario_bros를 import해야합니다. # gym_super_mario_bros 환경은 256의 모든 NES action space actions를 사용합니다 import gym_super_mario_bros from gym_super_mario_bros.smb_env import SuperMarioBrosEnv # NES action에 제약을 걸기 위해서는, gym_super_mario_bros.actions을 사용해야합니다. # 먼저 ~.actions는 아래의 3가지 action list를 제공합니다. # RIGHT_ONLY, SIMPLE_MOVEMENT, COMPLEX_MOVEMENT # 위 세가지는 nes_py.wrappers.JoypadSpace의 wrapper로 사용됩니다. # 그 중에서 여기는 SIMPLE_MOVEMENT를 사용하기로 하였습니다. # super_mario_bros 환경 만들기 env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0') # action_space 재정의(nes-py) # env = JoypadSpace(env, SIMPLE_MOVEMENT) env = JoypadSpace(env, COMPLEX_MOVEMENT) # observation_space 재정의(gym) env = PreprocessFrame(env) # life cycle 재정의 # env = EpisodicLifeEnv(env) # reward 재정의 필요 episode = 3000 INITIAL_BUFFER_SIZE = 1000 # 초기 inital 값을 버퍼에 채우기 전까지는 학습하지 않습니다. # 최소의 explore를 하기 위한 value EPS = 1.00 # 3 프로의 확률로 랜덤 EPS_THRESHOLD = 0.01 # 무한번 반복 시켜도 1%의 최소 확률을 남겨놓음 EPS_DECAY = 0.99 # 0.99를 곱한다. # keep track of progress