def make_atari_env(env_id, seed, monitor=False, monitor_dir=None):
    r"""Create an Atari environment with all necessary preprocessing.

    Args:
        env_id (str): Atari game name without version, e.g. Pong, Breakout
        seed (int): random seed for the environment
        monitor (bool, optional): If ``True``, then wrap the environment with Monitor for video recording.
        monitor_dir (str, optional): directory to save all data from Monitor.

    Returns:
        env (Env): lagom-compatible environment
    """
    env = gym.make(env_id + 'NoFrameskip-v4')
    # Remove gym's TimeLimit wrapper (which caps episodes at 100k frames);
    # we want to cap at 108k frames (30 min) instead.
    env = env.env
    if monitor:
        env = Monitor(env, monitor_dir)
    env = GymWrapper(env)
    env = ResizeObservation(env, 84)
    env = GrayScaleObservation(env, keep_dim=False)
    env = AtariPreprocessing(env)
    env = ScaleImageObservation(env)
    env = ClipReward(env)
    env = FrameStack(env, 4)
    env.seed(seed)
    return env
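A minimal usage sketch for the factory above, assuming the lagom wrappers it references are importable; the game name and rollout length are illustrative only.

env = make_atari_env('Pong', seed=0)
obs = env.reset()  # stack of four 84x84 grayscale frames
for _ in range(100):
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()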
def setup_env_agent(env, monitor, reward_shaping, frame_stack, train):
    env = gym.make(env)
    if monitor:
        if not os.path.exists('./monitor_dir'):
            os.makedirs('./monitor_dir')
        env = Monitor(env, './monitor_dir/', force=True)
    # Only shape rewards when training and when stipulated.
    reward_shaping = bool(reward_shaping and train)
    env = RewardClipWrapper(env, reward_shaping)
    if len(env.observation_space.shape) == 1:
        # A rank-1 observation is a 1D space, so no convolutions are needed.
        conv = False
        input_dim = int(env.observation_space.shape[0])
    else:
        # Otherwise the observation is an image, so add convolutional layers.
        conv = True
        input_dim = 84
    env.seed(0)
    agent = DQNAgent(env.action_space,
                     frame_stack=frame_stack,
                     conv=conv,
                     input_dim=input_dim)
    return env, agent
def make_gym_env(env_id, seed, monitor=False, monitor_dir=None):
    r"""Create an OpenAI Gym environment and wrap it into a lagom-compatible :class:`Env`.

    Example::

        >>> env = make_gym_env(env_id='CartPole-v1', seed=1, monitor=False)
        >>> env
        <GymWrapper, <TimeLimit<CartPoleEnv<CartPole-v1>>>>
        >>> env.reset()
        array([ 0.03073904,  0.00145001, -0.03088818, -0.03131252])

    Args:
        env_id (str): OpenAI Gym environment ID, e.g. 'Pendulum-v0', 'Ant-v2'
        seed (int): random seed for the environment
        monitor (bool, optional): If ``True``, then wrap the environment with Monitor for video recording.
        monitor_dir (str, optional): directory to save all data from Monitor.

    Returns:
        env (Env): lagom-compatible environment
    """
    env = gym.make(env_id)
    if monitor:
        env = Monitor(env, monitor_dir)
    env = GymWrapper(env)
    env.seed(seed)
    return env
def main(): """ Orchestrates agent and environment interactions. """ # Create environment environment = gym.make(ENVIRONMENT) if RECORD: environment = Monitor(env=environment, directory=VIDEO_DIRECTORY, video_callable=lambda episode_id: True, force=True) # Set random seeds environment.seed(0) np.random.seed(0) # Get action and state space sizes action_space = environment.action_space.n state_space = environment.observation_space.shape[0] # Instantiate agent agent = Agent(action_space, state_space) # Load model weights if path.exists(CHECKPOINT_DIRECTORY): agent.load(CHECKPOINT_DIRECTORY) # Initialise list of all rewards rewards = [] for episode in range(EPISODES): # Get initial state state = environment.reset() state = np.reshape(state, (1, state_space)) # Reset score for this episode score = 0 for _ in range(STEPS): if RENDER: environment.render() # Agent selects action from state action = agent.act(state) # Agent performs action and makes an observation of the environment next_state, reward, done, _ = agent.observe(environment, action) next_state = np.reshape(next_state, (1, state_space)) observation = (state, action, reward, next_state, done) # Agent remembers parameters of this time step agent.remember(observation) state = next_state # Agent retrains model agent.learn() score += reward if done: print("Episode: {}/{}. Reward: {:.2f}".format( episode + 1, EPISODES, score)) break rewards.append(score) # Average reward over the last 100 episodes average_reward = np.mean(rewards[-100:]) print("Average reward: {:.2f}\n".format(average_reward)) # Terminate environment environment.close() # Save model agent.save(CHECKPOINT_DIRECTORY) # Display performance over time summary(rewards)
def thunk():
    env = gym.make(gym_id)
    env = gym.wrappers.RecordEpisodeStatistics(env)
    if args.capture_video:
        if idx == 0:
            env = Monitor(env, f'videos/{experiment_name}')
    env.seed(seed)
    env.action_space.seed(seed)
    env.observation_space.seed(seed)
    return env
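The closure above follows the thunk pattern: environment construction is deferred so each vectorized worker builds its own copy. A sketch of how such thunks are typically consumed with gym's built-in vector API (the factory name and env ID are illustrative):

import gym

def make_thunk(gym_id, seed, idx):
    def thunk():
        env = gym.make(gym_id)
        env = gym.wrappers.RecordEpisodeStatistics(env)
        env.seed(seed + idx)
        env.action_space.seed(seed + idx)
        env.observation_space.seed(seed + idx)
        return env
    return thunk

# one sub-environment per thunk, each with its own seed
envs = gym.vector.SyncVectorEnv([make_thunk('CartPole-v1', 1, idx) for idx in range(4)])
obs = envs.reset()  # batched observations, one row per sub-environment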
def make_env(seed=None, monitor=False, monitor_dir=None):
    env = gym.make('Acrobot-v1')
    if monitor:
        env = Monitor(env, directory=monitor_dir)
    env = GymEnv(env)
    if seed is not None:
        env.seed(seed)
    return env
def make_env(env_id, use_monitor=False, monitor_dir='recordings', seed=None):
    """Instantiates the OpenAI Gym environment.

    Args:
        env_id (string): OpenAI Gym environment ID
        use_monitor (bool): whether or not to use gym.wrappers.Monitor
        monitor_dir (string): directory where Monitor saves its recordings
        seed (int): random seed for the environment
    """
    env = gym.make(env_id)  # instantiate the environment
    if use_monitor:
        env = Monitor(env, monitor_dir)
    env.seed(seed)
    return env
def main():
    config = EvolutionConfig()
    env = gym.make('LunarLander-v2')
    env = Monitor(env, '/tmp/evolution', force=True)
    env.seed(config.seed)
    np.random.seed(config.seed)

    policy = EvolutionPolicy(env, config)
    # Continue training from saved weights:
    # policy.W = np.load('weights.npy')
    rewards, n_generations = train(policy, config)
    env.close()
    create_plot(rewards, n_generations)
def get_new_env(env_name, cmdl):
    """Configure the training environment and return an instance."""
    import logging
    import gym
    import gym_fast_envs  # noqa
    from gym.wrappers import Monitor

    # Undo the default logger and configure a new one.
    gym.undo_logger_setup()
    logger = logging.getLogger()
    logger.setLevel(logging.WARNING)

    # Configure environment
    outdir = '/tmp/nec/%s-results' % cmdl.label
    env = gym.make(env_name)
    env = Monitor(env, directory=outdir, force=True, video_callable=False)
    env.seed(cmdl.seed)
    return env
def thunk():
    env = gym.make(gym_id)
    env = wrap_atari(env)
    env = gym.wrappers.RecordEpisodeStatistics(env)
    if args.capture_video:
        if idx == 0:
            env = Monitor(env, f'videos/{experiment_name}')
    env = wrap_pytorch(
        wrap_deepmind(
            env,
            clip_rewards=True,
            frame_stack=True,
            scale=False,
        ))
    env.seed(seed)
    env.action_space.seed(seed)
    env.observation_space.seed(seed)
    return env
def _thunk():
    random_seed(seed)
    if env_id.startswith('bsuite'):
        id = env_id.split('bsuite-')[1]
        self.video_enabled = False
        bsuite_env = bsuite.load_from_id(id)
        env = gym_wrapper.GymFromDMEnv(bsuite_env)
    elif env_id.startswith("dm"):
        import dm_control2gym
        _, domain, task = env_id.split('-')
        env = dm_control2gym.make(domain_name=domain, task_name=task)
    else:
        if special_args is not None and 'NChain' in special_args[0]:
            print('starting chain N = ', special_args[1])
            env = gym.make(env_id, n=special_args[1])
        else:
            env = gym.make(env_id)

    if self.video_enabled:
        env = Monitor(env, self.log_dir, video_callable=self.video_callable)

    is_atari = hasattr(gym.envs, 'atari') and isinstance(
        env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = make_atari(env_id)
    env.seed(seed + rank)
    env = OriginalReturnWrapper(env)
    if is_atari:
        env = wrap_deepmind(env,
                            episode_life=episode_life,
                            clip_rewards=False,
                            frame_stack=False,
                            scale=False)
        obs_shape = env.observation_space.shape
        if len(obs_shape) == 3:
            env = TransposeImage(env)
        env = FrameStack(env, 4)
    return env
def thunk():
    env = gym.make(gym_id)
    env = wrap_atari(env, sticky_action=args.sticky_action)
    env = gym.wrappers.RecordEpisodeStatistics(env)
    if args.capture_video:
        if idx == 0:
            env = Monitor(env, f'videos/{experiment_name}',
                          video_callable=lambda episode_id: episode_id % args.video_interval == 0)
    env = wrap_pytorch(
        wrap_deepmind(
            env,
            episode_life=True,
            clip_rewards=True,
            frame_stack=True,
            scale=False,
        ))
    env.seed(seed)
    env.action_space.seed(seed)
    env.observation_space.seed(seed)
    return env
def _thunk():
    if env_id.startswith("dm"):
        _, domain, task = env_id.split('.')
        env = dm_control2gym.make(domain_name=domain, task_name=task)
    else:
        env = gym.make(env_id)
    if save_video is not None:
        env = Monitor(env, save_video, force=True)
    env.seed(seed + rank)

    obs_shape = env.observation_space.shape
    if add_timestep and len(obs_shape) == 1 and str(env).find('TimeLimit') > -1:
        env = AddTimestep(env)

    # If the input has shape (W, H, 3), wrap for PyTorch convolutions
    obs_shape = env.observation_space.shape
    if len(obs_shape) == 3 and obs_shape[2] in [1, 2, 3]:
        env = WrapPyTorch(env)

    return env
def make_gym_env(env_id, seed, monitor=False, monitor_dir=None):
    """Create a gym environment, wrap it with GymEnv and seed it.

    Args:
        env_id (str): environment ID, e.g. 'Pendulum-v0'
        seed (int): random seed
        monitor (bool, optional): Whether to wrap the environment with Monitor for video recording.
        monitor_dir (str, optional): directory to save all data from Monitor.

    Returns:
        env (GymEnv): created environment
    """
    # Create the gym environment
    env = gym.make(env_id)
    # Wrap the environment with Monitor if required
    if monitor:
        env = Monitor(env, monitor_dir)
    # Wrap the environment with GymEnv
    env = GymEnv(env)
    # Seed the environment
    env.seed(seed)
    return env
makedirs(output_dir)

# register environment in Gym according to env_config
helper.register_gym_environment(True, FPS, SHOW_SCORE_BAR)

# saves / copies configs to file
config.save_json(join(output_dir, 'config.json'))
helper.save_state_features(join(output_dir, 'state_features.csv'))

# create environment and monitor
env = gym.make(config.gym_env_id)
env = Monitor(env, directory=output_dir, force=True,
              video_callable=lambda _: True)
env.seed(config.seed)

# adds a reference to the monitor to allow gym environments to update video frames
env.env.env.monitor = env

# create the agent
agent, exploration_strategy = create_agent(helper, AgentType.Manual, None)
behavior_tracker = BehaviorTracker(config.num_episodes)

# tries to load agent info
if agent_t != AgentType.Manual and exists(agent_dir):
    agent.load(agent_dir)

window_still_open = True
e = 0
save_features = save_environment = False
def collect_data(cfg, plot=False):  # Creates horizon^2/2 points
    """Collect data for the environment model.

    :param cfg: Hydra config with the env name, number of trials and horizon
    :param plot: whether to plot the collected trajectories
    :return: an array of DotMaps, where each DotMap contains info about a trajectory
    """
    env_model = cfg.env.name
    env = gym.make(env_model)
    if cfg.video:
        env = Monitor(env,
                      hydra.utils.get_original_cwd() + '/trajectories/reacher/video',
                      video_callable=lambda episode_id: episode_id == 1,
                      force=True)
    log.info('Initializing env: %s' % env_model)

    # logs is an array of DotMaps; each DotMap contains 2D np arrays with data
    # about <horizon> steps of actions, rewards and states
    logs = []

    if cfg.PID_test:
        target = np.random.rand(5) * 2 - 1

    for i in range(cfg.num_trials):
        log.info('Trial %d' % i)
        if cfg.PID_test:
            env.seed(0)
        else:
            env.seed(i)
        s0 = env.reset()

        # P = np.array([4, 4, 1, 1, 1])
        P = np.random.rand(5) * 5
        I = np.zeros(5)
        # D = np.array([0.2, 0.2, 2, 0.4, 0.4])
        D = np.random.rand(5)

        # Sample target uniformly from [-1, 1]
        if not cfg.PID_test:
            target = np.random.rand(5) * 2 - 1

        policy = PID(dX=5, dU=5, P=P, I=I, D=D, target=target)
        dotmap = run_controller(env,
                                horizon=cfg.trial_timesteps,
                                policy=policy,
                                video=cfg.video)
        dotmap.target = target
        dotmap.P = P / 5
        dotmap.I = I
        dotmap.D = D
        logs.append(dotmap)

    if plot:
        import plotly.graph_objects as go
        fig = go.Figure()
        fig.update_layout(
            width=1500,
            height=800,
            autosize=False,
            scene=dict(
                camera=dict(
                    up=dict(x=0, y=0, z=1),
                    eye=dict(x=0, y=1.0707, z=1),
                ),
                aspectratio=dict(x=1, y=1, z=0.7),
                aspectmode='manual',
            ),
            paper_bgcolor='rgba(0,0,0,0)',
            plot_bgcolor='rgba(0,0,0,0)',
        )
        for d in logs:
            states = d.states
            actions = d.actions
            plot_reacher(states, actions)

    return logs
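The PID class used above is not shown; a minimal sketch of an elementwise PID control law matching the constructor signature (all internals here are assumptions):

import numpy as np

class PID:
    """Hypothetical elementwise PID controller matching PID(dX, dU, P, I, D, target)."""

    def __init__(self, dX, dU, P, I, D, target):
        self.P, self.I, self.D = P, I, D
        self.target = target
        self.integral = np.zeros(dX)
        self.prev_error = np.zeros(dX)

    def act(self, state):
        # classic PID: u = P*e + I*sum(e) + D*de, computed elementwise
        error = self.target - state
        self.integral += error
        derivative = error - self.prev_error
        self.prev_error = error
        return self.P * error + self.I * self.integral + self.D * derivative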
class Task():
    """Problem domain to be solved by neural network. Uses OpenAI Gym patterns.
    """

    def __init__(self, game, paramOnly=False, nReps=1, record_path="./test_videos"):
        """Initializes task environment

        Args:
            game - (string) - dict key of task to be solved (see domain/config.py)

        Optional:
            paramOnly - (bool) - only load parameters instead of launching task?
            nReps     - (int)  - number of trials to get average fitness
        """
        # Network properties
        self.nInput = game.input_size
        self.nOutput = game.output_size
        self.actRange = game.h_act
        self.absWCap = game.weightCap
        self.layers = game.layers
        self.activations = np.r_[np.full(1, 1), game.i_act, game.o_act]

        # Environment
        self.maxEpisodeLength = game.max_episode_length
        self.actSelect = game.actionSelect
        if not paramOnly:
            self.env = make_env(game.env_name)
            if record_path:
                self.env_to_wrap = self.env
                self.env = Monitor(self.env_to_wrap, record_path, force=True)

        # Special needs...
        self.needsClosed = (game.env_name.startswith("CartPoleSwingUp"))

    def testInd(self, wVec, aVec, view=False, seed=-1):
        """Evaluate individual on task

        Args:
            wVec - (np_array) - weight matrix as a flattened vector [N**2 X 1]
            aVec - (np_array) - activation function of each node [N X 1]
                   - stored as ints (see applyAct in ann.py)

        Optional:
            view - (bool) - view trial?
            seed - (int)  - starting random seed for trials

        Returns:
            fitness - (float) - reward earned in trial
        """
        if seed >= 0:
            random.seed(seed)
            np.random.seed(seed)
            self.env.seed(seed)

        state = self.env.reset()
        self.env.t = 0
        annOut = act(wVec, aVec, self.nInput, self.nOutput, state)
        action = selectAct(annOut, self.actSelect)
        state, reward, done, info = self.env.step(action)

        if self.maxEpisodeLength == 0:
            return reward
        else:
            totalReward = reward

        for tStep in range(self.maxEpisodeLength):
            annOut = act(wVec, aVec, self.nInput, self.nOutput, state)
            action = selectAct(annOut, self.actSelect)
            state, reward, done, info = self.env.step(action)
            totalReward += reward
            if view:
                # time.sleep(0.01)
                if self.needsClosed:
                    self.env.render(close=done)
                else:
                    self.env.render()
            if done:
                break
        return totalReward

    # -- 'Weight Agnostic Network' evaluation -------------------------------- -- #
    def setWeights(self, wVec, wVal):
        """Set single shared weight of network

        Args:
            wVec - (np_array) - weight matrix as a flattened vector [N**2 X 1]
            wVal - (float)    - value to assign to all weights

        Returns:
            wMat - (np_array) - weight matrix with single shared weight [N X N]
        """
        # Create connection matrix
        wVec[np.isnan(wVec)] = 0
        dim = int(np.sqrt(np.shape(wVec)[0]))
        cMat = np.reshape(wVec, (dim, dim))
        cMat[cMat != 0] = 1.0

        # Assign value to all weights
        wMat = np.copy(cMat) * wVal
        return wMat

    def getDistFitness(self, wVec, aVec, hyp,
                       seed=-1, nRep=False, nVals=6, view=False, returnVals=False):
        """Get fitness of a single individual with distribution of weights

        Args:
            wVec - (np_array) - weight matrix as a flattened vector [N**2 X 1]
            aVec - (np_array) - activation function of each node [N X 1]
                   - stored as ints (see applyAct in ann.py)
            hyp  - (dict)     - hyperparameters
                ['alg_wDist']   - weight distribution [standard;fixed;linspace]
                ['alg_absWCap'] - absolute value of highest weight for linspace

        Optional:
            seed  - (int) - starting random seed for trials
            nRep  - (int) - number of trials to get average fitness
            nVals - (int) - number of weight values to test

        Returns:
            fitness - (float) - mean reward over all trials
        """
        if nRep is False:
            nRep = hyp['alg_nReps']

        # Set weight values to test WANN with
        if (hyp['alg_wDist'] == "standard") and nVals == 6:
            # Double, constant, and half signal
            wVals = np.array((-2, -1.0, -0.5, 0.5, 1.0, 2))
        else:
            wVals = np.linspace(-self.absWCap, self.absWCap, nVals)

        # Get reward from 'reps' rollouts -- test population on same seeds
        reward = np.empty((nRep, nVals))
        for iRep in range(nRep):
            for iVal in range(nVals):
                monitor_name = "./cartpole_{}".format(iVal)
                self.env = Monitor(self.env_to_wrap, monitor_name)
                wMat = self.setWeights(wVec, wVals[iVal])
                if seed == -1:
                    reward[iRep, iVal] = self.testInd(wMat, aVec, seed=seed, view=view)
                else:
                    reward[iRep, iVal] = self.testInd(wMat, aVec, seed=seed + iRep, view=view)

        if returnVals is True:
            return np.mean(reward, axis=0), wVals
        return np.mean(reward, axis=0)
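A standalone check of the shared-weight logic in setWeights above: NaNs mark absent connections, and every remaining connection receives the same value (the numbers are illustrative).

import numpy as np

wVec = np.array([0.3, 0.0, np.nan, -1.2])  # flattened 2x2 weight matrix
wVec[np.isnan(wVec)] = 0                   # NaN marks an absent connection
dim = int(np.sqrt(wVec.shape[0]))
cMat = np.reshape(wVec, (dim, dim))
cMat[cMat != 0] = 1.0                      # connectivity mask
wMat = np.copy(cMat) * 2.0                 # assign shared weight 2.0
# wMat is now [[2., 0.], [0., 2.]]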
def main(argv=None):
    try:
        options, args = getopt.getopt(sys.argv[1:], "s:x:b:u:mh", [
            "step=", "max_eps=", "buffer_size=", "hidden_unit=", "monitor", "help"])
    except getopt.GetoptError as err:
        print(str(err))
        print(usage.__doc__)
        sys.exit(1)

    GAME_NAME = 'CartPole-v1'
    AGENT_NAME = 'DQN-lr_1_e-3'
    MONITOR = False
    print_step = 10
    max_eps = 500
    buffer_size = 1000000
    hidden_unit = 16
    lr = 1e-3

    print(options)
    for o, v in options:
        if o in ("-h", "--help"):
            print(usage.__doc__)
            sys.exit()
        elif o in ("-m", "--monitor"):
            MONITOR = True
        elif o in ("-s", "--step"):
            print_step = int(v)
        elif o in ("-x", "--max_eps"):
            max_eps = int(v)
        elif o in ("-b", "--buffer_size"):
            buffer_size = int(v)
        elif o in ("-u", "--hidden_unit"):
            hidden_unit = int(v)
        else:
            print(usage.__doc__)
            sys.exit()

    print('process game: %s\tusing agent: %s' % (GAME_NAME, AGENT_NAME))

    # -------------------------------- loop for training -----------------------------
    # prepare env and output directory
    output_dir = '%s/%s' % (GAME_NAME, AGENT_NAME)
    os.makedirs(output_dir, exist_ok=True)

    env = gym.make(GAME_NAME)
    if MONITOR:
        env = Monitor(env, directory=output_dir, force=True,
                      video_callable=lambda ep: ep % 10 == 0,
                      write_upon_reset=True, mode='training')
    env.seed(0)

    state_num = len(env.reset())
    print(state_num)
    action_sample = env.action_space.sample()
    action_num = env.action_space.n if isinstance(action_sample, int) else len(action_sample)
    print('state_num: %d\taction_num: %d' % (state_num, action_num))

    device = torch.device('cpu')
    agent = DQNAgent(state_num, action_num, buffer_size=buffer_size,
                     batch_size=128, device=device,
                     hidden_unit=hidden_unit, lr=lr)

    scores_window = deque(maxlen=print_step)  # most recent scores
    avg_scores = []
    for i_episode in range(max_eps):
        score = 0
        state = env.reset()
        while True:
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break
        scores_window.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f} '.format(
            i_episode, np.mean(scores_window)), end="")
        if i_episode % print_step == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
            # save model
            agent.save_model_params(output_dir, i_episode)
            avg_scores.append(np.mean(scores_window))
        sys.stdout.flush()
    env.close()
def act(args, i, rb, q_network, lock, queues, queue, stats_queue,
        global_step, device, writer):
    env = gym.make(args.gym_id)
    env = wrap_atari(env)
    # records episode reward in `info['episode']['r']`
    env = gym.wrappers.RecordEpisodeStatistics(env)
    if args.capture_video:
        env = Monitor(env, f'videos/{experiment_name}')
    env = wrap_pytorch(
        wrap_deepmind(
            env,
            clip_rewards=True,
            frame_stack=True,
            scale=False,
        ))
    env.seed(args.seed + i)
    env.action_space.seed(args.seed + i)

    # TRY NOT TO MODIFY: start the game
    obs = env.reset()
    episode_reward = 0
    while global_step < args.total_timesteps:
        # global_step *= args.num_actor
        # ALGO LOGIC: put action logic here
        epsilon = linear_schedule(args.start_e, args.end_e,
                                  args.exploration_fraction * args.total_timesteps,
                                  global_step)
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            logits = q_network.forward(torch.Tensor(obs.reshape((1,) + obs.shape)))
            action = torch.argmax(logits, dim=1).tolist()[0]

        # TRY NOT TO MODIFY: execute the game and log data.
        next_obs, reward, done, info = env.step(action)
        episode_reward += reward

        # TRY NOT TO MODIFY: record rewards for plotting purposes
        with lock:
            global_step += 1
        if 'episode' in info.keys():
            stats_queue.put((info['episode']['r'], info['episode']['l']))
            # writer.add_scalar("charts/episode_reward", info['episode']['r'], global_step)
            # writer.add_scalar("charts/epsilon", epsilon, global_step)

        # ALGO LOGIC: training.
        rb.put((obs, action, reward, next_obs, done))
        if global_step > args.learning_starts and global_step % args.train_frequency == 0:
            s = rb.sample(args.batch_size)
            queue.put([torch.Tensor(item) for item in s])
            # for idx, queue in enumerate(queues):
            #     queue.put(torch.Tensor(s[idx]))

        # TRY NOT TO MODIFY: CRUCIAL step easy to overlook
        obs = next_obs
        if done:
            # because the `EpisodicLifeEnv` wrapper is applied, the real episode reward
            # is the sum of the episode rewards of 5 lives, which we record through
            # `info['episode']['r']` provided by gym.wrappers.RecordEpisodeStatistics
            obs, episode_reward = env.reset(), 0
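linear_schedule is referenced above but not shown; in CleanRL-style code it linearly anneals epsilon from start_e to end_e over the given duration. A sketch:

def linear_schedule(start_e, end_e, duration, t):
    # linear interpolation, clipped at end_e once the duration has elapsed
    slope = (end_e - start_e) / duration
    return max(slope * t + start_e, end_e)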
def act(args, experiment_name, i, worker_models, lock, rollouts_queue,
        stats_queue, global_step, device):
    actor, qf1, qf2, qf1_target, qf2_target, target_actor = worker_models
    env = gym.make(args.gym_id)
    # records episode reward in `info['episode']['r']`
    env = gym.wrappers.RecordEpisodeStatistics(env)
    if args.capture_video:
        if i == 0:
            env = Monitor(env, f'videos/{experiment_name}')
    env.seed(args.seed + i)
    env.action_space.seed(args.seed + i)

    # TRY NOT TO MODIFY: start the game
    obs = env.reset()
    max_action = float(env.action_space.high[0])
    storage = []
    episode_reward = 0
    update_step = 0
    while True:
        update_step += 1
        # ALGO LOGIC: put action logic here
        if global_step < args.learning_starts // args.num_actors:
            action = env.action_space.sample()
        else:
            action = actor.forward(obs.reshape((1,) + obs.shape))
            action = (action.tolist()[0] +
                      np.random.normal(0, max_action * args.exploration_noise,
                                       size=env.action_space.shape[0])
                      ).clip(env.action_space.low, env.action_space.high)

        # TRY NOT TO MODIFY: execute the game and log data.
        next_obs, reward, done, info = env.step(action)
        episode_reward += reward
        storage += [(obs, action, reward, next_obs, float(done))]
        with lock:
            global_step += 1
        if 'episode' in info.keys():
            stats_queue.put(("charts/episode_reward",
                             info['episode']['r'], info['episode']['l']))

        if len(storage) == args.actor_buffer_size:
            obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], []
            for data in storage:
                obs_t, action, reward, obs_tp1, done = data
                obses_t.append(np.array(obs_t, copy=False))
                actions.append(np.array(action, copy=False))
                rewards.append(reward)
                obses_tp1.append(np.array(obs_tp1, copy=False))
                dones.append(done)
            s_obs, s_actions, s_rewards, s_next_obses, s_dones = (
                np.array(obses_t), np.array(actions), np.array(rewards),
                np.array(obses_tp1), np.array(dones))

            # compute initial priorities from the TD errors of the first critic
            with torch.no_grad():
                clipped_noise = (torch.randn_like(torch.Tensor(action)) *
                                 args.policy_noise).clamp(-args.noise_clip, args.noise_clip)
                next_state_actions = (target_actor.forward(s_next_obses) +
                                      clipped_noise.to(device)).clamp(
                                          env.action_space.low[0], env.action_space.high[0])
                qf1_next_target = qf1_target.forward(s_next_obses, next_state_actions)
                qf2_next_target = qf2_target.forward(s_next_obses, next_state_actions)
                min_qf_next_target = torch.min(qf1_next_target, qf2_next_target)
                next_q_value = torch.Tensor(s_rewards).to(device) + \
                    (1 - torch.Tensor(s_dones).to(device)) * args.gamma * \
                    min_qf_next_target.view(-1)
                qf1_a_values = qf1.forward(s_obs, torch.Tensor(s_actions).to(device)).view(-1)
                td_errors = qf1_a_values - next_q_value
            new_priorities = np.abs(td_errors.tolist()) + args.pr_eps
            rollouts_queue.put((storage, new_priorities))
            storage = []

        # TRY NOT TO MODIFY: CRUCIAL step easy to overlook
        obs = next_obs
        if done:
            obs, episode_reward = env.reset(), 0
def run_trial(args):
    # get agent type
    agent_t = args.agent
    if agent_t == AgentType.Testing:
        # try to load a pre-trained agent configuration file
        config, results_dir = load_agent_config(args.results, args.trial)
    else:
        # try to load env config from the provided file path
        config_file = args.config_file_path
        config = args.default_frogger_config if config_file is None or not exists(config_file) \
            else EnvironmentConfiguration.load_json(config_file)

    # create env helper
    helper = create_helper(config)

    # check for provided output dir
    output_dir = args.output if args.output is not None else \
        get_agent_output_dir(config, agent_t, args.trial)
    if not exists(output_dir):
        makedirs(output_dir)

    # save / copy configs to file
    config.save_json(join(output_dir, 'config.json'))
    helper.save_state_features(join(output_dir, 'state_features.csv'))

    # register environment in Gym according to env config
    env_id = '{}-{}-v0'.format(config.gym_env_id, args.trial)
    helper.register_gym_environment(env_id, False, args.fps, args.show_score_bar)

    # create environment and monitor
    env = gym.make(env_id)
    config.num_episodes = args.num_episodes
    video_callable = video_schedule(config, args.record)
    env = Monitor(env, directory=output_dir, force=True, video_callable=video_callable)

    # add a reference to the monitor to allow gym environments to update video frames
    if video_callable(0):
        env.env.monitor = env

    # initialize seeds (one for the environment, another for the agent)
    env.seed(config.seed + args.trial)
    agent_rng = np.random.RandomState(config.seed + args.trial)

    # create the agent
    agent, exploration_strategy = create_agent(helper, agent_t, agent_rng)

    # if testing, load tables from file (some will be filled by the agent during the interaction)
    if agent_t == AgentType.Testing:
        agent.load(results_dir)

    # run episodes
    behavior_tracker = BehaviorTracker(config.num_episodes)
    recorded_episodes = []
    for e in range(config.num_episodes):
        # check whether to activate video monitoring
        env.env.monitor = env if video_callable(e) else None

        # reset environment
        old_obs = env.reset()
        old_s = helper.get_state_from_observation(old_obs, 0, False)

        if args.verbose:
            print(f'Episode: {e}')

        # helper.update_stats_episode(e)
        exploration_strategy.update(e)  # update for learning agent

        t = 0
        done = False
        while not done:
            # select action
            a = agent.act(old_s)

            # observe transition
            obs, r, done, _ = env.step(a)
            s = helper.get_state_from_observation(obs, r, done)
            r = helper.get_reward(old_s, a, r, s, done)

            # update agent and stats
            agent.update(old_s, a, r, s)
            behavior_tracker.add_sample(old_s, a)
            helper.update_stats(e, t, old_obs, obs, old_s, a, r, s)

            old_s = s
            old_obs = obs
            t += 1

        # add to recorded episodes list
        if video_callable(e):
            recorded_episodes.append(e)

        # signal new episode to tracker
        behavior_tracker.new_episode()

    # write results to files
    agent.save(output_dir)
    behavior_tracker.save(output_dir)
    write_table_csv(recorded_episodes, join(output_dir, 'rec_episodes.csv'))
    helper.save_stats(join(output_dir, 'results'), args.clear_results)
    print('\nResults of trial {} written to:\n\t\'{}\''.format(args.trial, output_dir))

    env.close()
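video_schedule is not defined in this excerpt; a plausible, purely hypothetical sketch of a Monitor video_callable factory that records every episode when recording is requested and nothing otherwise:

def video_schedule(config, record):
    # Hypothetical: return a predicate deciding, per episode, whether to record video.
    def _callable(episode_id):
        return bool(record)
    return _callable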
params.log_dir = "../../logs/logs/self_R_DDPG/{}-mu{}".format(
    str(params.env_name.split("-")[0]), mu)
params.actor_model_dir = "../../logs/models/self_R_DDPG/{}/actor-mu{}/".format(
    str(params.env_name.split("-")[0]), mu)
params.critic_model_dir = "../../logs/models/self_R_DDPG/{}/critic-mu{}/".format(
    str(params.env_name.split("-")[0]), mu)
params.video_dir = "../../logs/video/self_R/{}-mu{}".format(
    str(params.env_name.split("-")[0]), mu)
params.plot_path = "../../logs/plots/self_R/{}-mu{}/".format(
    str(params.env_name.split("-")[0]), mu)

env = gym.make(params.env_name)
env = Monitor(env, params.video_dir)

# set seeds
env.seed(params.seed)
tf.random.set_random_seed(params.seed)

replay_buffer = ReplayBuffer(params.memory_size)
reward_buffer = deque(maxlen=params.reward_buffer_ep)
summary_writer = tf.contrib.summary.create_file_writer(params.log_dir)
# random_process = OrnsteinUhlenbeckProcess(size=env.action_space.shape[0], theta=0.15, mu=0.9, sigma=0.05)
random_process = GaussianNoise(mu=params.mu, sigma=params.sigma)
agent = DDPG(Actor, Critic, env.action_space.shape[0], random_process, params)

get_ready(agent.params)
global_timestep = tf.compat.v1.train.get_or_create_global_step()
time_buffer = deque(maxlen=agent.params.reward_buffer_ep)
log = logger(agent.params)
action_buffer, distance_buffer, eval_epochs = list(), list(), list()
class GymTask():
    """Problem domain to be solved by neural network. Uses OpenAI Gym patterns.
    """

    def __init__(self, game, paramOnly=False, nReps=1, record=False):
        """Initializes task environment

        Args:
            game - (string) - dict key of task to be solved (see domain/config.py)

        Optional:
            paramOnly - (bool) - only load parameters instead of launching task?
            nReps     - (int)  - number of trials to get average fitness
        """
        # Network properties
        self.nInput = game.input_size
        self.nOutput = game.output_size
        self.actRange = game.h_act
        self.absWCap = game.weightCap
        self.layers = game.layers
        self.activations = np.r_[np.full(1, 1), game.i_act, game.o_act]

        # Environment
        self.nReps = nReps
        self.maxEpisodeLength = game.max_episode_length
        self.actSelect = game.actionSelect
        if not paramOnly:
            if record:
                env_to_wrap = make_env(game.env_name)
                self.env = Monitor(env_to_wrap, "trial_recording/", force=True)
            else:
                self.env = make_env(game.env_name)

        # Special needs...
        self.needsClosed = (game.env_name.startswith("CartPoleSwingUp"))

    def getFitness(self, wVec, aVec, view=False, nRep=False, seed=-1):
        """Get fitness of a single individual.

        Args:
            wVec - (np_array) - weight matrix as a flattened vector [N**2 X 1]
            aVec - (np_array) - activation function of each node [N X 1]
                   - stored as ints (see applyAct in ann.py)

        Optional:
            view - (bool) - view trial?
            nRep - (int)  - number of trials to get average fitness
            seed - (int)  - starting random seed for trials

        Returns:
            fitness - (float) - mean reward over all trials
        """
        if nRep is False:
            nRep = self.nReps
        wVec[np.isnan(wVec)] = 0
        reward = np.empty(nRep)
        for iRep in range(nRep):
            if seed > 0:
                seed = seed + iRep
            reward[iRep] = self.testInd(wVec, aVec, view=view, seed=seed)
        fitness = np.mean(reward)
        return fitness

    def testInd(self, wVec, aVec, hyp=None, view=False, seed=-1):
        """Evaluate individual on task

        Args:
            wVec - (np_array) - weight matrix as a flattened vector [N**2 X 1]
            aVec - (np_array) - activation function of each node [N X 1]
                   - stored as ints (see applyAct in ann.py)

        Optional:
            view - (bool) - view trial?
            seed - (int)  - starting random seed for trials

        Returns:
            fitness - (float) - reward earned in trial
        """
        if seed >= 0:
            random.seed(seed)
            np.random.seed(seed)
            self.env.seed(seed)

        state = self.env.reset()
        self.env.t = 0
        annOut = act(wVec, aVec, self.nInput, self.nOutput, state)
        action = selectAct(annOut, self.actSelect)
        state, reward, done, info = self.env.step(action)

        if self.maxEpisodeLength == 0:
            if view:
                if self.needsClosed:
                    self.env.render(close=done)
                else:
                    self.env.render()
            return reward
        else:
            totalReward = reward

        for tStep in range(self.maxEpisodeLength):
            annOut = act(wVec, aVec, self.nInput, self.nOutput, state)
            action = selectAct(annOut, self.actSelect)
            state, reward, done, info = self.env.step(action)
            totalReward += reward
            if view:
                if self.needsClosed:
                    self.env.render(close=done)
                else:
                    self.env.render()
            if done:
                break
        return totalReward
for test_mode in test_modes:
    # Generate environment
    if "_n" in args.env:
        env = gym.make(args.env,
                       pairs_dict=pairs_dict,
                       test_instr_mode=test_mode,
                       num_dists=args.num_dists)
    else:
        env = gym.make(args.env)
    demo_path = os.path.join(model_path, test_mode)
    env = Monitor(env, demo_path, _check_log_this, force=True)
    env.seed(args.seed)

    # Define agent
    agent = utils.load_agent(env=env,
                             model_name=args.model,
                             argmax=args.argmax,
                             env_name=args.env,
                             instr_arch=args.instr_arch)
    utils.seed(args.seed)

    print('\n')
    print(f'=== EVALUATING MODE: {test_mode} ===')

    # Run the agent
    done = False
    action = None
device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu')
env = gym.make(args.gym_id)
env = wrap_atari(env)
# records episode reward in `info['episode']['r']`
env = gym.wrappers.RecordEpisodeStatistics(env)
if args.capture_video:
    env = Monitor(env, f'videos/{experiment_name}')
env = wrap_pytorch(
    wrap_deepmind(
        env,
        clip_rewards=True,
        frame_stack=True,
        scale=False,
    ))
env.seed(args.seed)
env.action_space.seed(args.seed)
env.observation_space.seed(args.seed)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic

# respect the default time limit
assert isinstance(env.action_space, Discrete), "only discrete action space is supported"

rb = ReplayBuffer(args.buffer_size)
q_network = QNetwork()
q_network.share_memory()
target_network = QNetwork().to(device)
import gym
from gym.wrappers import Monitor

env = Monitor(gym.make('LunarLanderContinuous-v2'), './video',
              force=True, video_callable=lambda episode_id: True)
env.seed(9756745635)

fitness = 0
for _ in range(10):
    observation = env.reset()
    done = False
    while not done:
        #################################
        x = observation[0]
        y = observation[1]
        vel_x = observation[2]
        vel_y = observation[3]
        ang = observation[4]
        vel_ang = observation[5]
        l_left = observation[6]
        l_right = observation[7]
        input = [0., 0.]
        #################################
        input = [
            max(min(i, 1.0), -1.0)
            for i in [(((observation[3] * -78.0698466187944) -
def runExperiment(experiment):
    import numpy as np
    from collections import deque
    import gym
    from gym.wrappers import Monitor
    from agents.dqnagent import DQNAgent

    # environment parameters
    gym_id = experiment["gym_id"]
    sliding_window_solved_score = experiment["sliding_window_solved_score"]
    sliding_window_score_length = experiment["sliding_window_score_length"]
    env_seed = experiment["env_seed"]
    max_episode = experiment["max_episode"]

    env = gym.make(gym_id)
    env = Monitor(env, "{}".format(experiment['folder']),
                  video_callable=False, force=True, resume=False,
                  write_upon_reset=False, uid=None, mode=None)
    env.seed(env_seed)
    scores = deque()
    sw_scores = deque(maxlen=sliding_window_score_length)

    # agent parameters
    agent_seed = experiment["agent_seed"]
    activation = experiment["activation"]
    min_episode_before_acting = experiment["min_episode_before_acting"]
    epsilon = experiment["epsilon"]
    nb_hidden_layer = experiment["nb_hidden_layer"]
    layer_width = experiment["layer_width"]
    memory_length = experiment["memory_length"]
    batch_size = experiment["batch_size"]

    agent = DQNAgent(env.observation_space, env.action_space, agent_seed,
                     min_episode_before_acting, activation, epsilon,
                     layer_width, nb_hidden_layer, memory_length)

    current_episode = 0
    while (len(sw_scores) == 0 or np.mean(sw_scores) < sliding_window_solved_score) and \
            (max_episode is None or current_episode < max_episode):
        state = env.reset()
        current_episode += 1
        reward = 0
        done = False
        episode_score = 0
        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            episode_score += reward
            # if np.mean(sw_scores) > 180:
            #     env.render()
            if done:
                scores.append(episode_score)
                sw_scores.append(episode_score)
                print('Episode: {}\t Epsilon: {}\t Score: {}\t Mean Score:{}\t Sliding Score:{}\t'
                      .format(current_episode, agent.epsilon, episode_score,
                              np.mean(scores), np.mean(sw_scores)))
        agent.train(batch_size=batch_size)
    env.close()
parser.add_argument('--sd_min', type=float, default=0.01)
parser.add_argument('--sd_steps', type=int, default=50000000)
parser.add_argument('--gpu_memory', type=float, default=0.1)
parser.add_argument('--loss_type', type=str, default='kl')
parser.add_argument('--device', type=str, default='/cpu:0')
parser.add_argument('--alg', choices=['dqn', 'adfq'], default='dqn')
parser.add_argument('--record', type=int, default=0)

args = parser.parse_args()

# Get the environment and extract the number of actions.
env = gym.make(args.env_name)
if args.record == 1:
    env = Monitor(env, directory=args.log_dir)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Next, we build our model. We use the same model that was described by Mnih et al. (2015).
input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE
with tf.device(args.device):
    model = Sequential()
    if K.image_dim_ordering() == 'tf':
        # (width, height, channels)
        model.add(Permute((2, 3, 1), input_shape=input_shape))
    elif K.image_dim_ordering() == 'th':
        # (channels, width, height)
        model.add(Permute((1, 2, 3), input_shape=input_shape))
    else:
        raise RuntimeError('Unknown image_dim_ordering.')
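The excerpt stops after the input permutation; for reference, a sketch of the remaining Mnih et al. (2015) layers as they are commonly written in Keras (assumes Convolution2D, Activation, Flatten and Dense are imported from keras.layers):

model.add(Convolution2D(32, (8, 8), strides=(4, 4)))
model.add(Activation('relu'))
model.add(Convolution2D(64, (4, 4), strides=(2, 2)))
model.add(Activation('relu'))
model.add(Convolution2D(64, (3, 3), strides=(1, 1)))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))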
logger = logging.getLogger()
logger.setLevel(logging.INFO)

env = gym.make('FlappyBird-v0' if len(sys.argv) < 2 else sys.argv[1])

# You provide the directory to write to (can be an existing
# directory, including one with existing data -- all monitor files
# will be namespaced). You can also dump to a tempdir if you'd
# like: tempfile.mkdtemp().
outdir = '/tmp/random-agent-results'
env = Monitor(env, directory=outdir, force=True)

# This declaration must go *after* the monitor call, since the
# monitor's seeding creates a new action_space instance with the
# appropriate pseudorandom number generator.
env.seed(0)
agent = RandomAgent(env.action_space)

episode_count = 100
reward = 0
done = False

for i in range(episode_count):
    ob = env.reset()
    while True:
        action = agent.act(ob, reward, done)
        ob, reward, done, _ = env.step(action)
        if done:
            break
        # Note there's no env.render() here. But the environment still can open a
        # window and render if asked by the monitor, which calls
        # env.render('rgb_array') to record video.
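The RandomAgent referenced above is not included in the excerpt; a minimal sketch matching the act(ob, reward, done) call signature, mirroring the canonical gym example agent:

class RandomAgent(object):
    """The world's simplest agent: samples a random action regardless of input."""

    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, observation, reward, done):
        return self.action_space.sample()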
import os

import gym
import numpy as np
import torch
from gym.wrappers import Monitor

import BCQ
import DDPG
import utils

# %%
ENV_NAME = "LunarLanderContinuous-v2"
SEED = 0

# %%
env = gym.make(ENV_NAME)
env = Monitor(env, 'videos/', force=True)

# %%
env.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

# %%
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

# %%
# Loading
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bcq = BCQ.BCQ(state_dim, action_dim, max_action, device, 0.99, 0.005, 0.75, 0.05)
bcq.load(f"./models/bcq_{ENV_NAME}_{SEED}")
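A short rollout cell to exercise the loaded policy with the Monitor attached; select_action follows the reference BCQ implementation's interface, but treat it as an assumption here.

# %%
# Rollout sketch (assumes BCQ exposes select_action, as in the reference implementation).
state, done = env.reset(), False
episode_reward = 0.0
while not done:
    action = bcq.select_action(np.array(state))
    state, reward, done, _ = env.step(action)
    episode_reward += reward
print("episode reward: {:.2f}".format(episode_reward))
env.close()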