def train(env_id, num_timesteps, seed):
    """Train a PPO agent on Super Mario Bros, one worker per MPI rank.

    Args:
        env_id: accepted for API compatibility but ignored — the environment
            is hard-coded to 'SuperMarioBros-v1' below (NOTE(review): confirm
            whether env_id should be honored).
        num_timesteps: approximate number of environment steps to train for
            (learn() is given a 10% margin on top of this).
        seed: base random seed; each MPI rank derives its own worker seed.
    """
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U

    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()

    # Only rank 0 writes log output; the other workers are silenced.
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    # Distinct seed per MPI worker so rollouts differ across ranks.
    # (fix: reuse `rank` instead of calling MPI.COMM_WORLD.Get_rank() again)
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)

    # Build and wrap the environment: discrete simple-movement action set,
    # 84x84 grayscale frames, frame-memory stacking.
    env = gym_super_mario_bros.make('SuperMarioBros-v1')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    env = ProcessFrame84(env)
    env = FrameMemoryWrapper(env)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        # CNN policy matching the processed (image) observation space.
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)  # fix: original seeded the env twice in a row

    def render_callback(lcl, _glb):
        # Called by learn() every iteration; render so training can be watched.
        # (fix: dropped an unused `total_steps` local and dead commented-out
        # model-saving code)
        env.render()

    pposgd_simple.learn(
        env, policy_fn,
        max_timesteps=int(num_timesteps * 1.1),
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.01,
        optim_epochs=4,
        optim_stepsize=1e-3,   # 3e-4 in the PPO paper
        optim_batchsize=64,    # 256 in some configs
        gamma=0.99,
        lam=0.95,
        schedule='linear',
        callback=render_callback,
    )
    env.close()
exploration=exploration_schedule,
    replay_buffer_size=REPLAY_BUFFER_SIZE,
    batch_size=BATCH_SIZE,
    gamma=GAMMA,
    learning_starts=LEARNING_STARTS,
    learning_freq=LEARNING_FREQ,
    frame_history_len=FRAME_HISTORY_LEN,
    target_update_freq=TARGET_UPDATE_FREQ
)
# NOTE(review): the keyword arguments above continue a call whose opening is
# outside this chunk — presumably a DQN learn()/train() entry point; confirm.


if __name__ == '__main__':
    # Initialize the environment using gym_super_mario_bros
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v3')
    env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)

    # set global seeds: env, torch, numpy and random are all seeded with SEED
    # for reproducible runs.
    env.seed(SEED)
    torch.manual_seed(SEED)
    np.random.seed(SEED)
    random.seed(SEED)

    # monitor & wrap the game; record a video every 10th episode.
    env = wrap_mario(env)
    expt_dir = 'video/mario'
    env = wrappers.Monitor(env, expt_dir, force=True, video_callable=lambda count: count % 10 == 0)

    # main
    main(env)
['NOOP'], ['right'], ['right', 'A'], ['right', 'B'], ['right', 'A', 'B'], ['A'], ['left'], ]"""
# NOTE(review): the line above closes a triple-quoted (commented-out) action
# table whose opening is outside this chunk; its contents are left byte-identical.

# Report the observation/action space the agent will be built for.
state_size = env.observation_space.shape
action_size = env.action_space.n
print("state_size", state_size)
print("action_size", action_size)

"""Set random seed"""
# Seed env, numpy and tensorflow for reproducibility.
env.seed(random_seed)
np.random.seed(random_seed)
tf.set_random_seed(random_seed)

# create dqn agent inside a fresh TF session, then initialize variables.
sess = tf.Session()
dqn = DQNAgent(sess, state_size, action_size)
sess.run(tf.global_variables_initializer())

# Optionally resume from a saved checkpoint given on the command line.
if args.load_from is not None:
    dqn.load_model(args.load_from)


def train():
    # NOTE(review): train() continues past the end of this chunk; only its
    # first statement is visible here.
    total_step = 1
def process_reward(self, reward):
    # NOTE(review): takes `self` — presumably a method of a keras-rl
    # Processor subclass whose class header is outside this chunk; confirm.
    # Clip rewards to [-1, 1] (standard DQN reward clipping).
    return np.clip(reward, -1., 1.)


# Command-line interface: train or test, which env, optional weights file.
parser = argparse.ArgumentParser()
parser.add_argument('--mode', choices=['train', 'test'], default='train')
parser.add_argument('--env-name', type=str, default='SuperMarioBros-v0')
parser.add_argument('--weights', type=str, default=None)
args = parser.parse_args()

# Get the environment and extract the number of actions.
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Next, we build our model. We use the same model that was described by Mnih et al. (2015).
input_shape = (WINDOW_LENGTH, ) + INPUT_SHAPE
model = Sequential()
if K.image_dim_ordering() == 'tf':
    # (width, height, channels): move the frame-stack axis last.
    model.add(Permute((2, 3, 1), input_shape=input_shape))
elif K.image_dim_ordering() == 'th':
    # (channels, width, height): input already channel-first; Permute is identity.
    model.add(Permute((1, 2, 3), input_shape=input_shape))
else:
    raise RuntimeError('Unknown image_dim_ordering.')
model.add(Convolution2D(32, (8, 8), strides=(4, 4)))
model.add(Activation('relu'))
# Run on GPU when one is available; keep handy Tensor aliases.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
Tensor = torch.Tensor
LongTensor = torch.LongTensor

# Build the Mario environment with the simple discrete action set.
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)

directory = './resVideos/'
# NOTE(review): episode_id % 1 == 0 is always true, so every episode is
# recorded — confirm that is intended.
env = gym.wrappers.Monitor(env, directory, video_callable=lambda episode_id: episode_id%1==0)

# Seed env, torch and random for reproducibility.
# NOTE(review): numpy is not seeded here, unlike the other scripts — confirm.
seed_value = 23
env.seed(seed_value)
torch.manual_seed(seed_value)
random.seed(seed_value)

###### PARAMS ######
learning_rate = 0.0001
num_episodes = 5000
startNum = 500        # presumably the episode/checkpoint number to resume from — verify
newModel = False
gamma = 0.99
hidden_layer = 512
replay_mem_size = 100000
batch_size = 32