def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'mario':
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == 'robopong':
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()
    elif args["env_kind"] == "my_games":
        env = gym.make(args['env'])
        env = MaxAndSkipEnv(env, skip=4)
        env = WarpFrame(env)
        env = FrameStack(env, 4)
    if add_monitor:
        env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
    return env
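# A hypothetical invocation of the builder above; the args keys come from the
# branches it reads ('env_kind', 'env', 'noop_max', 'max_episode_steps'), and
# the concrete values are illustrative assumptions.
args = {
    "env_kind": "atari",
    "env": "BreakoutNoFrameskip-v4",
    "noop_max": 30,
    "max_episode_steps": 4500,
}
env = make_env_all_params(rank=0, add_monitor=False, args=args)
obs = env.reset()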
def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == "atari":
        env = gym.make(args["env"])
        assert "NoFrameskip" in env.spec.id
        # from "Self-Supervised Exploration via Disagreement"
        if args["stickyAtari"] == "true":
            env = StickyActionEnv(env)
            env._max_episode_steps = args["max_episode_steps"] * 4
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args["max_episode_steps"])
        if "Montezuma" in args["env"]:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
        if args["noisy_tv"] == "true":
            env = NoisyTVEnvWrapper(env)
        # assert env.action_space == spaces.Discrete(7)
    elif args["env_kind"] == "mario":
        env = make_mario_env()
        if args["noisy_tv"] == "true":
            env = NoisyTVEnvWrapperMario(env)
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == "robopong":
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()
    if add_monitor:
        env = Monitor(env, osp.join(logger.get_dir(), "%.2i" % rank))
    return env
def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        if args["stickyAtari"]:  # add randomness to the agent's action execution
            env._max_episode_steps = args['max_episode_steps'] * 4
            env = StickyActionEnv(env)
        else:
            env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)  # repeat each action for 4 consecutive steps
        env = ProcessFrame84(env, crop=False)  # preprocess the observation
        env = FrameStack(env, 4)  # stack 4 consecutive frames as the input
        if not args["stickyAtari"]:
            env = ExtraTimeLimit(env, args['max_episode_steps'])  # cap the maximum timesteps per episode
        if 'Montezuma' in args['env']:
            # record the agent's position, current room, and visited rooms
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'mario':  # Super Mario
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":  # multi-agent game, Multi-Pong
        env = make_multi_pong()
    elif args["env_kind"] == 'robopong':
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()
    if add_monitor:
        env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
    return env
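# The StickyActionEnv used in the sticky-Atari branches is not shown in this
# excerpt. A minimal sketch consistent with the sticky-actions scheme of
# Machado et al. (repeat the previous action with probability p = 0.25); the
# class internals here are an assumption.
import gym
import numpy as np

class StickyActionEnv(gym.Wrapper):
    def __init__(self, env, p=0.25):
        super(StickyActionEnv, self).__init__(env)
        self.p = p
        self.last_action = 0

    def reset(self, **kwargs):
        self.last_action = 0
        return self.env.reset(**kwargs)

    def step(self, action):
        # with probability p, ignore the chosen action and repeat the last one
        if np.random.uniform() < self.p:
            action = self.last_action
        self.last_action = action
        return self.env.step(action)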
def make_env_all_params(rank, add_monitor, args, logdir):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'mario':
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == 'robopong':
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()
    elif args["env_kind"] == "dm_suite":
        env = make_dm_suite(task=args["env"],
                            logdir=logdir,
                            to_record=args["to_record"])
    if add_monitor:
        env = TempMonitor(env)
    return env
def make_env_all_params(rank, add_monitor, args, sleep_multiple=2):
    if args["env_kind"] == 'ObstacleTowerEnv':
        env = _make_obs_env(rank, add_monitor, args, sleep_multiple)
    elif args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
        if rank == 2:
            env = RenderWrapper(env)
    elif args["env_kind"] == 'mario':
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == 'robopong':
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()
    if add_monitor:
        logdir = osp.join('summaries', args["exp_name"])
        logger.configure(logdir)
        env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
    return env
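# NoopResetEnv, used in the non-sticky Atari branches, follows the standard
# OpenAI Baselines pattern: on reset, take a random number of no-op steps so
# episodes start from varied initial states. A sketch under that assumption:
import gym
import numpy as np

class NoopResetEnv(gym.Wrapper):
    def __init__(self, env, noop_max=30):
        super(NoopResetEnv, self).__init__(env)
        self.noop_max = noop_max
        self.noop_action = 0  # action 0 is NOOP in ALE

    def reset(self, **kwargs):
        self.env.reset(**kwargs)
        noops = np.random.randint(1, self.noop_max + 1)
        obs = None
        for _ in range(noops):
            obs, _, done, _ = self.env.step(self.noop_action)
            if done:
                obs = self.env.reset(**kwargs)
        return obs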
def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'mario':
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == 'robopong':
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()
    if args["env_kind"] == 'atari' and add_monitor:
        # env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
        env = Monitor(env,
                      os.path.join(os.getcwd(), 'test_video'),
                      force=True,
                      video_callable=lambda episode_id: episode_id % 20 == 0)
        # env = Monitor(env, os.path.join(os.getcwd(), 'test_video'),
        #               video_callable=lambda episode_id: True)  # , force=True)
    return env
def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'field':
        import gym_fieldedmove
        env = gym.make('FieldedMove-v0')
        # env = FrameStack(env, 4)
    elif args["env_kind"] == "ple":
        import gym_ple
        env = gym.make(args['env'])
        env._max_episode_steps = args['max_episode_steps']
        # env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
    if add_monitor:
        env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
    return env
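# ExtraTimeLimit caps episode length on top of (or instead of) gym's built-in
# TimeLimit, which is why several variants above also bump _max_episode_steps
# on the raw env. A minimal sketch of such a wrapper (internals assumed):
import gym

class ExtraTimeLimit(gym.Wrapper):
    def __init__(self, env, max_episode_steps=None):
        super(ExtraTimeLimit, self).__init__(env)
        self._max_episode_steps = max_episode_steps
        self._elapsed_steps = 0

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self._elapsed_steps += 1
        if self._elapsed_steps > self._max_episode_steps:
            done = True  # force episode end once the step budget is spent
        return obs, reward, done, info

    def reset(self, **kwargs):
        self._elapsed_steps = 0
        return self.env.reset(**kwargs)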
def evaluate_in_environment(net):
    env = gym.make("Pong-v0")
    env = MaxAndSkipEnv(env, skip=4)
    env = PreproWrapper(env, prepro=greyscale, shape=(80, 80, 1),
                        overwrite_render=True)
    evaluate(net, env=env)
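# `greyscale` is referenced above but not defined in this excerpt. A sketch of
# a Pong-style preprocessor matching the (80, 80, 1) shape passed to
# PreproWrapper; the crop bounds and luminance weights are assumptions.
import numpy as np

def greyscale(state):
    state = state[35:195]    # crop scoreboard/borders from the 210x160 frame
    state = state[::2, ::2]  # downsample by 2 -> 80x80x3
    state = np.dot(state, np.array([0.299, 0.587, 0.114]))  # RGB -> luminance
    return state.astype(np.uint8)[:, :, None]  # shape (80, 80, 1)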
def make_env_all_params(rank, add_monitor=True):
    env = gym.make("MontezumaRevengeNoFrameskip-v4")
    assert 'NoFrameskip' in env.spec.id
    env._max_episode_steps = 4500 * 4
    env = StickyActionEnv(env)
    env = MaxAndSkipEnv(env, skip=4)  # repeat each action for 4 consecutive steps
    env = ProcessFrame84(env, crop=False)  # preprocess the observation
    env = FrameStack(env, 4)  # stack 4 consecutive frames as the input
    return env
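# MaxAndSkipEnv, shared by nearly every builder here, follows the standard
# Baselines pattern: repeat the action `skip` times, sum the rewards, and
# max-pool the last two raw frames to suppress Atari sprite flicker. A sketch
# under that assumption:
import gym
import numpy as np

class MaxAndSkipEnv(gym.Wrapper):
    def __init__(self, env, skip=4):
        super(MaxAndSkipEnv, self).__init__(env)
        # buffer for the two most recent raw observations (for max pooling)
        self._obs_buffer = np.zeros((2,) + env.observation_space.shape,
                                    dtype=np.uint8)
        self._skip = skip

    def step(self, action):
        total_reward, done, info = 0.0, False, {}
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            if i == self._skip - 2:
                self._obs_buffer[0] = obs
            if i == self._skip - 1:
                self._obs_buffer[1] = obs
            total_reward += reward
            if done:
                break
        # pixel-wise max over the last two frames removes single-frame flicker
        return self._obs_buffer.max(axis=0), total_reward, done, info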
def make_env_all_params(rank, args):
    env = gym.make(GAME_NAME)
    env = NoopResetEnv(env, noop_max=NOOP_MAX)
    env = MaxAndSkipEnv(env, skip=4)
    env = ProcessFrame84(env, crop=False)
    env = FrameStack(env, 4)
    # env = ExtraTimeLimit(env, 10000)
    env = AddRandomStateToInfo(env)
    env = Monitor(
        env,
        os.path.join(
            'C:/Users/Elias/OneDrive/Winfo Studium/SS19/Masterarbeit/logs',
            '%.2i' % rank))
    return env
def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'mario':
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == 'robopong':
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()
    if add_monitor:
        # print(osp.join(logger.get_dir(), '%.2i' % rank + '.monitor.csv'))
        env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
        # env = DummyVecEnv([lambda: env])
        # env = VecVideoRecorder(env, directory='./vid',
        #                        record_video_trigger=lambda step: step == 0,
        #                        video_length=100)
        # env.reset()
        # env = wrappers.Monitor(env, './vid/', force=True, write_upon_reset=True,
        #                        video_callable=lambda episode: True)
        # print(osp.join(logger.get_dir()))
        # env = Monitor(env, osp.join(logger.get_dir()))
        # env = Monitor(env, "./vid", video_callable=lambda episode_id: True, force=True)
    return env
def make_atari_env(self, args):
    """Duplicated code hack due to relative import errors."""
    env = gym.make(args["env"])
    assert "NoFrameskip" in env.spec.id
    # from "Self-Supervised Exploration via Disagreement"
    if args["stickyAtari"] == "true":
        env = StickyActionEnv(env)
        env._max_episode_steps = args["max_episode_steps"] * 4
    env = MaxAndSkipEnv(env, skip=4)
    env = ProcessFrame84(env, crop=False)
    env = FrameStackNoLazy(env, 4)
    env = ExtraTimeLimit(env, args["max_episode_steps"])
    if "Montezuma" in args["env"]:
        env = MontezumaInfoWrapper(env)
    env = AddRandomStateToInfo(env)
    if args["noisy_tv"] == "true":
        env = NoisyTVEnvWrapper(env)
    return env
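# FrameStackNoLazy is presumably a FrameStack variant that returns a concrete
# ndarray instead of a LazyFrames view; the body below is an assumption based
# only on the name.
import gym
import numpy as np
from collections import deque
from gym import spaces

class FrameStackNoLazy(gym.Wrapper):
    def __init__(self, env, k):
        super(FrameStackNoLazy, self).__init__(env)
        self.k = k
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape
        self.observation_space = spaces.Box(
            low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8)

    def reset(self, **kwargs):
        obs = self.env.reset(**kwargs)
        for _ in range(self.k):
            self.frames.append(obs)
        return np.concatenate(self.frames, axis=-1)

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self.frames.append(obs)
        return np.concatenate(self.frames, axis=-1), reward, done, info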
def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        if args["stickyAtari"]:
            env._max_episode_steps = args['max_episode_steps'] * 4
            env = StickyActionEnv(env)
        else:
            env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        if not args["stickyAtari"]:
            env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'mario':
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == 'unity':
        env = make_unity_maze(args["env"], seed=args["seed"], rank=rank,
                              ext_coeff=args["ext_coeff"],
                              recordUnityVid=args['recordUnityVid'],
                              expID=args["unityExpID"],
                              startLoc=args["startLoc"],
                              door=args["door"], tv=args["tv"],
                              testenv=args["testenv"],
                              logdir=logger.get_dir())
    if add_monitor:
        env = Monitor(env, osp.join(logger.get_dir(), '%.2i' % rank))
    return env
def main():
    # Make OpenAI gym environment + wrappers
    date_time = now.strftime("_%H:%M:%S_%m-%d-%Y")
    env = gym.make("PongNoFrameskip-v4")
    env = gym.wrappers.Monitor(env, './data_dqn_ataripong' + date_time)
    assert 'NoFrameskip' in env.spec.id
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)  # skip 4 frames & max over last_obs
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)

    # obs shape = num_channels x width x height
    obs_space_shape = env.observation_space.shape[0]
    action_space_shape = env.action_space.n

    # Set random seeds
    seed = 6582
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    env.seed(seed)

    # Initialize Replay Memory (Line 1)
    replay_memory = ReplayMemory(max_size=100000)

    # Make Q-Network and Target Q-Network (Lines 2 & 3)
    qnet = Atari_Dueling_DQN(obs_space_shape, action_space_shape).to(device)
    target_qnet = Atari_Dueling_DQN(obs_space_shape, action_space_shape).to(device)
    target_qnet.load_state_dict(qnet.state_dict())

    # Training parameters (changes from Mnih et al. outlined in README.md)
    optimizer = optim.Adam(qnet.parameters())
    num_frames = 1400000
    gamma = 0.99
    replay_start_size = 50000
    target_network_update_freq = 10000

    # Train
    obs = env.reset()
    num_episodes = 0
    for t in range(1, num_frames + 1):
        epsilon = epsilon_at_t(t)

        # -------------------------------------------------------------------
        # Take one step in the environment & add to Replay Memory (Lines 7-11)
        # -------------------------------------------------------------------
        torch.set_grad_enabled(False)
        # Select action with epsilon-greedy exploration (Lines 7, 8)
        if random.random() > epsilon:
            ts_obs = torch.from_numpy(obs.astype(np.float32)).unsqueeze(0).to(device)
            ts_qvals = qnet(ts_obs)
            action = ts_qvals.max(-1)[1].item()
        else:
            action = random.randrange(action_space_shape)
        torch.set_grad_enabled(True)

        # Execute action and get reward + next_obs (Lines 9, 10)
        next_obs, reward, done, _ = env.step(action)

        # Store transition in Replay Memory
        replay_memory.add(obs, next_obs, action, reward, done)

        obs = next_obs
        if done:
            obs = env.reset()
            num_episodes += 1

        # Populate Replay Memory with <replay_start_size> experiences before learning
        if t > replay_start_size:
            # ---------------------------------------------------------------
            # Sample batch & compute loss & update network (Lines 12 - 15)
            # ---------------------------------------------------------------
            (obs_minibatch, next_obs_minibatch, actions_minibatch,
             rewards_minibatch, done_minibatch) = replay_memory.sample()

            ts_obs, ts_rewards, ts_next_obs, ts_done = map(
                lambda x: torch.FloatTensor(x).to(device),
                [obs_minibatch, rewards_minibatch, next_obs_minibatch, done_minibatch])
            ts_actions = torch.LongTensor(actions_minibatch).to(device)

            torch.set_grad_enabled(False)
            # Compute target values (as per the Double-DQN update rule)
            ts_next_qvals_outer = qnet(ts_next_obs)         # (32, 2) (outer Q-net, evaluates value)
            ts_next_qvals_inner = target_qnet(ts_next_obs)  # (32, 2) (inner Q-net, evaluates action)
            ts_next_action_inner = ts_next_qvals_inner.argmax(-1, keepdim=True)  # (32, 1)
            # use inner actions to evaluate outer Q-values
            ts_next_action_qvals_outer = ts_next_qvals_outer.gather(
                -1, ts_next_action_inner).view(-1)          # (32,)
            ts_target_q = ts_rewards + gamma * ts_next_action_qvals_outer * (1 - ts_done)
            torch.set_grad_enabled(True)

            # Compute predicted Q-values
            ts_pred_q = qnet(ts_obs).gather(-1, ts_actions).view(-1)  # (32,)

            # Calculate loss & perform gradient descent (Line 14)
            loss = F.smooth_l1_loss(ts_pred_q, ts_target_q)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # Update target network every <target_network_update_freq> steps (Line 15)
        if t % target_network_update_freq == 0:
            target_qnet.load_state_dict(qnet.state_dict())

            # Log to terminal
            # unwrap the nested wrappers to reach the Monitor's episode statistics
            episode_rewards = env.env.env.env.env.env.env.env.get_episode_rewards()
            print('Timesteps', t, 'Episode', num_episodes,
                  'Mean Reward', np.mean(episode_rewards[-100:]))

    env.env.close()
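# `epsilon_at_t` is called in the training loop above but not defined in this
# excerpt. A minimal sketch of a linearly annealed schedule in the style of
# Mnih et al.; the start/end values and annealing horizon are assumptions.
def epsilon_at_t(t, eps_start=1.0, eps_end=0.02, anneal_frames=100000):
    # linearly anneal from eps_start to eps_end, then hold at eps_end
    fraction = min(float(t) / anneal_frames, 1.0)
    return eps_start + fraction * (eps_end - eps_start)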
    ActionLoggingWrapper,
)


def get_car_mask(frames, car_color=np.array([223, 183, 85])):
    # count, per pixel, in how many frames that pixel matches the car's RGB color
    mask = np.zeros(shape=frames[0].shape)
    for a_frame in frames:
        for i in range(a_frame.shape[0]):
            for j in range(a_frame.shape[1]):
                if np.array_equal(a_frame[i][j], car_color):
                    mask[i][j] += 1
    return mask


env = gym.make("BankHeistNoFrameskip-v4")
env = gym.wrappers.Monitor(env, "./video/", force=True)
env._max_episode_steps = 4000 * 4
env = MaxAndSkipEnv(env, skip=4)
env = ProcessFrame84(env, crop=False)
env = FrameStack(env, 4)
env = ExtraTimeLimit(env, 4000)
env = AddRandomStateToInfo(env)

obs = env.reset()
for _ in range(100):
    obs, reward, done, info = env.step(env.action_space.sample())

import pdb
pdb.set_trace()
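# get_car_mask is defined above but never called in this excerpt. A
# hypothetical usage: the wrapped observations are preprocessed greyscale
# stacks, so raw RGB frames are collected via render during a rollout and
# matched per pixel against the car's color.
raw_frames = []
obs = env.reset()
for _ in range(100):
    obs, reward, done, info = env.step(env.action_space.sample())
    raw_frames.append(env.render(mode='rgb_array'))  # raw HxWx3 RGB frame

car_mask = get_car_mask(raw_frames)
print((car_mask > 0).sum())  # pixels the car occupied in at least one frame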