class ALGEnv(gym.Env):
    metadata = {
        'render.modes': ['human', 'rgb_array', 'tiny_human', 'tiny_rgb_array', 'np_array']
    }

    def __init__(self,
                 dim_room=(10, 10),
                 num_boxes=4,
                 reset=True,
                 log_interval=1000,
                 alg_version=0,
                 train_mode='cnn',
                 agent_lb_path=None,
                 agent_ub_path=None,
                 init_probs=[0.5, 0.5, 0.5]):
        assert train_mode in TRAIN_MODES
        self.train_mode = train_mode
        self.log_train_info = log_interval > 0

        # 0: basic playable map
        # 1: playable map
        # 2: hardness-adjustable map
        self.alg_version = alg_version
        if alg_version != 0:
            env_li = [
                lambda: SokobanEnv(dim_room=dim_room,
                                   max_steps=50,
                                   num_boxes=num_boxes,
                                   train_mode=train_mode,
                                   log_train_info=False)
            ]
            self.soko_env = DummyVecEnv(env_li)
            self.agent_ub = PPO.load(agent_ub_path, env=self.soko_env)
            print('loaded', agent_ub_path, 'as ub')
            if alg_version == 2:
                self.agent_lb = PPO.load(agent_lb_path, env=self.soko_env)
                print('loaded', agent_lb_path, 'as lb')

        # General configuration
        self.dim_room = dim_room
        self.num_boxes = num_boxes
        self.num_players = 1

        # Training hyperparams
        self.max_prefer_subs = dim_room[0] * dim_room[1] // 2
        self.place_target_prob = init_probs[0]
        self.place_box_prob = init_probs[1]
        self.place_player_prob = init_probs[2]

        # Log info
        self.start_time = time.time()
        self.train_result_summary = {-1: 0, 0: 0, 1: 0, 2: 0}
        self.fail_type_summary = {-1: 0, 0: 0, 1: 0, 2: 0}
        # self.sample_map = False
        self.episode_reward = 0
        self.total_reward_per_log_interval = 0
        self.total_steps_per_log_interval = 0
        self.total_subs_per_log_interval = 0
        self.log_interval = log_interval
        self.reseted = False
        self.train_counter = 0

        # Env properties
        self.map = None

        # Penalties and rewards
        self.penalty_sub_wrong_tile = -5
        self.penalty_exc_btp_tiles = -10
        self.penalty_bad_map_design = -50
        self.penalty_generation_fail = -50
        self.penalty_exc_subs = -10
        self.reward_neighbor_valid_tiles = 2
        self.reward_place_btp_tiles = 5
        self.reward_basic_playable = 40
        if alg_version == 1:
            # "thou" = too hard or unsolvable
            self.penalty_agent_ub_thou = -30
            self.reward_agent_ub_solvable = 50
        elif alg_version == 2:
            self.penalty_agent_lb_solvable = -30
            self.penalty_agent_ub_thou = -30
            self.reward_agent_ub_solvable = 10
            self.reward_agent_lb_thou = 50

        # Generation tracking
        self.placed_player = 0
        self.placed_boxes = 0
        self.placed_target = 0
        self.env_steps = 0

        # Env settings
        self.viewer = None
        self.max_steps = dim_room[0] * dim_room[1]
        self.action_space = MultiDiscrete([dim_room[0], dim_room[1], 5])
        if train_mode == 'cnn':
            self.scale = 6
            screen_height, screen_width = (dim_room[0] * self.scale,
                                           dim_room[1] * self.scale)
            self.observation_space = Box(low=0, high=255,
                                         shape=(screen_height, screen_width, 3),
                                         dtype=np.uint8)
        else:
            self.observation_space = Box(low=0, high=6,
                                         shape=(dim_room[0], dim_room[1]),
                                         dtype=np.uint8)

        if reset:
            # Initialize room
            _ = self.reset()

    def random_init_map(self):
        room = np.zeros((self.dim_room[0], self.dim_room[1]), dtype=np.uint8)
        for _ in range(self.num_boxes):
            if np.random.rand(1) < self.place_target_prob:
                x, y = np.random.randint(1, self.dim_room[0] - 1, size=2)
                room[x, y] = 2
            if np.random.rand(1) < self.place_box_prob:
                x, y = np.random.randint(1, self.dim_room[0] - 1, size=2)
                room[x, y] = 4
        for _ in range(self.num_players):
            if np.random.rand(1) < self.place_player_prob:
                x, y = np.random.randint(1, self.dim_room[0] - 1, size=2)
                room[x, y] = 5
        self.placed_target += np.count_nonzero(room == 2)
        self.placed_boxes += np.count_nonzero(room == 4)
        self.placed_player += np.count_nonzero(room == 5)
        return room

    def reset(self):
        self.placed_player = 0
        self.placed_boxes = 0
        self.placed_target = 0
        self.map = self.random_init_map()
        self.env_steps = 0
        self.episode_subs = 0
        self.episode_reward = 0
        self.reseted = True
        if self.train_mode == 'cnn':
            starting_observation = self.render('tiny_rgb_array', scale=self.scale)
        else:
            starting_observation = self.render('np_array')
        return starting_observation

    def soko_agent_test(self):
        reward = 0
        # v1
        if self.alg_version == 1:
            done = False
            obs = self.soko_env.env_method('manual_reset', self.map)
            while not done:
                action, _ = self.agent_ub.predict(obs, deterministic=True)
                obs, _, done, info = self.soko_env.step(action)
            # agent_ub solvable
            if info[0]["all_boxes_on_target"]:
                reward += self.reward_agent_ub_solvable
                train_result = 0  # good map
            else:
                reward += self.penalty_agent_ub_thou
                train_result = 2  # thou map
        # v2
        else:
            done = False
            obs = self.soko_env.env_method('manual_reset', self.map)
            while not done:
                action, _ = self.agent_ub.predict(obs, deterministic=True)
                obs, _, done, info = self.soko_env.step(action)
            # agent_ub thou
            if not info[0]["all_boxes_on_target"]:
                reward += self.penalty_agent_ub_thou
                train_result = 2  # thou
            # agent_ub solvable
            else:
                reward += self.reward_agent_ub_solvable
                done = False
                obs = self.soko_env.env_method('manual_reset', self.map)
                while not done:
                    action, _ = self.agent_lb.predict(obs, deterministic=True)
                    obs, _, done, info = self.soko_env.step(action)
                # agent_lb solvable
                if info[0]["all_boxes_on_target"]:
                    reward += self.penalty_agent_lb_solvable
                    train_result = 1  # too easy
                else:
                    reward += self.reward_agent_lb_thou
                    train_result = 0  # good map
        return reward, train_result

    def step(self, action):
        '''
        Tile types:
            0: Wall
            1: Floor
            2: Target
            3: Box on Target
            4: Box
            5: Player
            6: Player on Target
        act:
            0: Finish generation
            1: Floor
            2: Box target
            3: Box
            4: Player
        '''
        x, y, act = action
        reward = 0
        done = False
        self.env_steps += 1
        # generation not finished yet
        if act != 0:
            if self.map[x][y] != 0:
                reward += self.penalty_sub_wrong_tile
            # is a wall tile, can substitute
            else:
                for (_x, _y) in [(x - 1, y), (x, y - 1), (x, y + 1), (x + 1, y)]:
                    if _x in range(self.dim_room[0]) and _y in range(self.dim_room[1]):
                        if self.map[_x, _y] != 0:
                            reward += self.reward_neighbor_valid_tiles
                if act == 1:
                    self.map[x][y] = 1
                    self.episode_subs += 1
                    if self.episode_subs >= self.max_prefer_subs:
                        reward += self.penalty_exc_subs
                    # print(self.episode_subs)
                # place box target
                elif act == 2:
                    if self.placed_target >= self.num_boxes:
                        reward += self.penalty_exc_btp_tiles
                    else:
                        self.placed_target += 1
                        self.map[x][y] = 2
                        self.episode_subs += 1
                        reward += self.reward_place_btp_tiles
                        if self.episode_subs >= self.max_prefer_subs:
                            reward += self.penalty_exc_subs
                        # print(self.episode_subs)
                # place box
                elif act == 3:
                    if self.placed_boxes >= self.num_boxes:
                        reward += self.penalty_exc_btp_tiles
                    else:
                        self.placed_boxes += 1
                        self.map[x][y] = 4
                        self.episode_subs += 1
                        reward += self.reward_place_btp_tiles
                        if self.episode_subs >= self.max_prefer_subs:
                            reward += self.penalty_exc_subs
                        # print(self.episode_subs)
                # place player
                elif act == 4:
                    if self.placed_player >= self.num_players:
                        reward += self.penalty_exc_btp_tiles
                    else:
                        self.placed_player += 1
                        self.map[x][y] = 5
                        self.episode_subs += 1
                        reward += self.reward_place_btp_tiles
                        if self.episode_subs >= self.max_prefer_subs:
                            reward += self.penalty_exc_subs
                        # print(self.episode_subs)
            if self.is_maxsteps():
                done = True
        # finished generation
        else:
            done = True

        if done:
            _train_result = -1  # not used for training
            _fail_type = -1  # not failed
            if (self.placed_player != self.num_players
                    or self.placed_boxes != self.num_boxes
                    or self.placed_target != self.num_boxes):
                reward += self.penalty_generation_fail
                _fail_type = 0  # wrong number of btp tiles
            else:
                if not self.basic_playable(self.map):
                    reward += self.penalty_bad_map_design
                    _fail_type = 1  # not basic playable
                else:
                    reward += self.reward_basic_playable
                    if self.alg_version == 0:
                        _train_result = 0
                    else:
                        _train_reward, _train_result = self.soko_agent_test()
                        reward += _train_reward

        self.episode_reward += reward

        # Convert the observation to an RGB frame
        if self.train_mode == 'cnn':
            observation = self.render(mode='tiny_rgb_array', scale=self.scale)
        else:
            observation = self.render(mode='np_array')

        info = {
            "coordinate": (x, y),
            "action": act,
            "curr_steps": self.env_steps,
        }
        if self.reseted:
            self.reseted = False
            self.train_counter += 1
        if done:
            info["total_steps"] = self.env_steps
            info["train_result"] = _train_result
            info['fail_type'] = _fail_type
            self.train_result_summary[_train_result] += 1
            self.fail_type_summary[_fail_type] += 1
            self.total_reward_per_log_interval += self.episode_reward
            self.total_steps_per_log_interval += self.env_steps
            self.total_subs_per_log_interval += self.episode_subs
            # if _fail_type == -1 and self.sample_map:
            #     print('Sample map:')
            #     print(self.map)
            #     print('*********************************************')
            #     self.sample_map = False
            if self.log_train_info and self.train_counter % self.log_interval == 0:
                end_time = time.time()
                duration = end_time - self.start_time
                avg_reward = self.total_reward_per_log_interval / self.log_interval
                avg_steps = self.total_steps_per_log_interval / self.log_interval
                avg_subs = self.total_subs_per_log_interval / self.log_interval
                print('[{}] Summary'.format(self.train_counter))
                print('Duration: %.2fs' % duration)
                print('Average reward current log interval: ', avg_reward)
                print('Average steps current log interval: ', avg_steps)
                print('Average subs current log interval: ', avg_subs)
                print('Good Map :', self.train_result_summary[0])
                if self.alg_version == 2:
                    print('Too easy map :', self.train_result_summary[1])
                if self.alg_version != 0:
                    print('Too hard or unsolvable map:', self.train_result_summary[2])
                print('Not for training map :', self.train_result_summary[-1])
                print('Generated wrong number of btp tiles:', self.fail_type_summary[0])
                print('Generated not basic playable map :', self.fail_type_summary[1])
                print('Unable to finish by max step :', self.fail_type_summary[2])
                print('Succeeded generate map for training:', self.fail_type_summary[-1])
                print('*********************************************')
                self.total_reward_per_log_interval = 0
                self.total_steps_per_log_interval = 0
                self.total_subs_per_log_interval = 0
                self.train_result_summary = {-1: 0, 0: 0, 1: 0, 2: 0}
                self.fail_type_summary = {-1: 0, 0: 0, 1: 0, 2: 0}
                self.sample_map = True
                self.start_time = time.time()

        return observation, reward, done, info

    def render(self, mode=None, close=None, scale=16):
        if mode is None:
            mode = 'human' if self.train_mode == 'cnn' else 'np_array'
        assert mode in RENDERING_MODES
        if 'rgb_array' in mode:
            img = self.get_image(mode, scale)
            return img
        elif 'np_array' in mode:
            return self.map
        elif 'human' in mode:
            from gym.envs.classic_control import rendering
            if self.viewer is None or not self.viewer.isopen:
                self.viewer = rendering.SimpleImageViewer()
            img = self.get_image(mode, scale)
            self.viewer.imshow(img)
            return self.viewer.isopen
        else:
            super(ALGEnv, self).render(mode=mode)  # just raise an exception

    def get_image(self, mode, scale=1):
        if mode.startswith('tiny_'):
            img = room_to_tiny_world_rgb(self.map, scale=scale)
        else:
            img = room_to_rgb(self.map)
        return img

    def basic_playable(self, room):
        # # player can reach all boxes and all targets
        # for player_coord in np.argwhere(room == 5):
        #     des = np.concatenate((np.argwhere(room == 2), np.argwhere(room == 4)), axis=0)
        #     if not self.contaminate(room, player_coord, des):
        #         return False
        # player can reach all non-wall tiles
        if not self.contaminate_room(room):
            return False
        # no three walls around any box
        if self.box_stuck(room):
            return False
        return True

    def box_stuck(self, room):
        room = deepcopy(room)
        room = np.pad(room, 1, 'constant', constant_values=0)
        for (x, y) in np.argwhere(room == 4):
            # box wedged into a corner by two adjacent walls
            if (room[x - 1, y] == room[x, y - 1] == 0
                    or room[x - 1, y] == room[x, y + 1] == 0
                    or room[x + 1, y] == room[x, y - 1] == 0
                    or room[x + 1, y] == room[x, y + 1] == 0):
                return True
            num_wall = 0
            for (_x, _y) in [(x - 1, y), (x, y - 1), (x, y + 1), (x + 1, y)]:
                if room[_x, _y] == 0:
                    num_wall += 1
            if num_wall >= 3:
                return True
        return False

    # player can reach any non-wall tile within the room
    def contaminate_room(self, room):
        room = deepcopy(room)
        room = np.pad(room, 1, 'constant', constant_values=0)
        (x, y) = np.argwhere(room == 5)[0]
        room[room != 0] = 1
        room[x, y] = 5
        fixpoint = False
        while not fixpoint:
            fixpoint = True
            for (x, y) in np.argwhere(room == 5):
                for (_x, _y) in [(x - 1, y), (x, y - 1), (x, y + 1), (x + 1, y)]:
                    if room[_x, _y] not in [0, 5]:
                        room[_x, _y] = 5
                        fixpoint = False
        for i in [1, 2, 4]:
            if i in room:
                return False
        return True

    def contaminate(self, room, src, des):
        room = deepcopy(room)
        (x, y) = src
        src_tile = room[x, y]
        room[room != 0] = 1
        room[x, y] = src_tile
        fixpoint = False
        while not fixpoint:
            fixpoint = True
            for (x, y) in np.argwhere(room == src_tile):
                for (_x, _y) in [(x - 1, y), (x, y - 1), (x, y + 1), (x + 1, y)]:
                    if _x in range(self.dim_room[0]) and _y in range(self.dim_room[1]):
                        if room[_x, _y] not in [0, src_tile]:
                            room[_x, _y] = src_tile
                            fixpoint = False
        reachable = True
        for (x, y) in des:
            if room[x, y] != src_tile:
                reachable = False
                break
        return reachable

    def is_maxsteps(self):
        return self.env_steps >= self.max_steps

    def deconstruct_map(self, obs_map):
        state_map = copy.deepcopy(obs_map)
        fix_map = copy.deepcopy(obs_map)
        state_map[state_map == 6] = 5
        fix_map[(fix_map == 3) | (fix_map == 6)] = 2
        fix_map[(fix_map == 4) | (fix_map == 5)] = 1
        return fix_map, state_map

    def assemble_map(self, state_map, fix_map):
        obs_map = copy.deepcopy(state_map)
        obs_map[(obs_map == 5) & (fix_map == 2)] = 6
        return obs_map

    def close(self):
        if self.viewer is not None:
            self.viewer.close()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]
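# A minimal usage sketch (assumes ALGEnv's dependencies -- gym, numpy,
# SokobanEnv, the room_to_*_rgb renderers -- are importable from this module):
# roll out one generation episode with random actions. alg_version=0 needs no
# pretrained ub/lb agents.
if __name__ == '__main__':
    demo_env = ALGEnv(dim_room=(10, 10), num_boxes=4, alg_version=0, train_mode='cnn')
    obs = demo_env.reset()
    done = False
    while not done:
        # sample an (x, y, act) triple from the MultiDiscrete action space
        obs, reward, done, info = demo_env.step(demo_env.action_space.sample())
    print('episode reward:', demo_env.episode_reward, 'fail type:', info['fail_type'])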
        env = ProbsVisualizationWrapper(env)
        env = Monitor(env, f'videos/{experiment_name}')
        env = wrap_pytorch(
            wrap_deepmind(
                env,
                clip_rewards=True,
                frame_stack=True,
                scale=False,
            ))
        env.seed(seed)
        env.action_space.seed(seed)
        env.observation_space.seed(seed)
        return env

    return thunk


envs = VecPyTorch(
    DummyVecEnv([make_env(args.gym_id, args.seed + i, i) for i in range(args.num_envs)]),
    device)
# if args.prod_mode:
#     envs = VecPyTorch(
#         SubprocVecEnv([make_env(args.gym_id, args.seed+i, i) for i in range(args.num_envs)], "fork"),
#         device)
assert isinstance(envs.action_space, Discrete), "only discrete action space is supported"


# ALGO LOGIC: initialize agent here:
class Scale(nn.Module):
    def __init__(self, scale):
        super().__init__()
        self.scale = scale

    def forward(self, x):
        return x * self.scale
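# A minimal usage sketch: Scale would typically sit at the front of a
# pixel-observation CNN, mapping uint8 frames into [0, 1]. The layer sizes
# below are illustrative assumptions, not the network from this script.
example_net = nn.Sequential(
    Scale(1 / 255.0),                           # normalize pixel values
    nn.Conv2d(4, 32, kernel_size=8, stride=4),  # assumes 4 stacked frames
    nn.ReLU(),
)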
def test_save_load(tmp_path, model_class):
    """
    Test if 'save' and 'load' save and load the model correctly,
    and if 'get_parameters' and 'set_parameters' work correctly.

    Warning: does not test the loading of optimizer parameters.

    :param model_class: (BaseAlgorithm) A RL model
    """
    env = DummyVecEnv([lambda: select_env(model_class)])

    policy_kwargs = dict(net_arch=[16])
    if model_class in {QRDQN, TQC}:
        policy_kwargs.update(dict(n_quantiles=20))

    # create model
    model = model_class("MlpPolicy", env, verbose=1, policy_kwargs=policy_kwargs)
    model.learn(total_timesteps=300)

    env.reset()
    observations = np.concatenate(
        [env.step([env.action_space.sample()])[0] for _ in range(10)], axis=0)

    # Get parameters of different objects
    # deepcopy to avoid referencing to tensors we are about to modify
    original_params = deepcopy(model.get_parameters())

    # Test different error cases of set_parameters.
    # Test that invalid object names throw errors.
    invalid_object_params = deepcopy(original_params)
    invalid_object_params["I_should_not_be_a_valid_object"] = "and_I_am_an_invalid_tensor"
    with pytest.raises(ValueError):
        model.set_parameters(invalid_object_params, exact_match=True)
    with pytest.raises(ValueError):
        model.set_parameters(invalid_object_params, exact_match=False)

    # Test that exact_match catches when something was missed.
    missing_object_params = dict((k, v) for k, v in list(original_params.items())[:-1])
    with pytest.raises(ValueError):
        model.set_parameters(missing_object_params, exact_match=True)

    # Test that exact_match catches when something inside the state-dict
    # is missing but we have exact_match.
    missing_state_dict_tensor_params = {}
    for object_name in original_params:
        object_params = {}
        missing_state_dict_tensor_params[object_name] = object_params
        # Skip the last item in the state-dict
        for k, v in list(original_params[object_name].items())[:-1]:
            object_params[k] = v
    with pytest.raises(RuntimeError):
        # PyTorch load_state_dict throws a RuntimeError if strict
        # but the state-dict is invalid.
        model.set_parameters(missing_state_dict_tensor_params, exact_match=True)

    # Test that parameters do indeed change.
    random_params = {}
    for object_name, params in original_params.items():
        # Do not randomize optimizer parameters (custom layout)
        if "optim" in object_name:
            random_params[object_name] = params
        else:
            # Again, skip the last item in the state-dict
            random_params[object_name] = OrderedDict(
                (param_name, th.rand_like(param))
                for param_name, param in list(params.items())[:-1])

    # Update model parameters with the new random values
    model.set_parameters(random_params, exact_match=False)

    new_params = model.get_parameters()
    # Check that all params except the final item in each state-dict are different.
    for object_name in original_params:
        # Skip optimizers (no valid comparison with just th.allclose)
        if "optim" in object_name:
            continue
        # state-dicts use ordered dictionaries, so key order is guaranteed.
        last_key = list(original_params[object_name].keys())[-1]
        for k in original_params[object_name]:
            if k == last_key:
                # Should be the same as before
                assert th.allclose(
                    original_params[object_name][k], new_params[object_name][k]
                ), "Parameter changed despite not being included in the loaded parameters."
            else:
                # Should be different
                assert not th.allclose(
                    original_params[object_name][k], new_params[object_name][k]
                ), "Parameters did not change as expected."

    params = new_params

    # get selected actions
    selected_actions, _ = model.predict(observations, deterministic=True)

    # Check
    model.save(tmp_path / "test_save.zip")
    del model

    # Check if the model loads as expected for every possible choice of device:
    for device in ["auto", "cpu", "cuda"]:
        model = model_class.load(str(tmp_path / "test_save.zip"), env=env, device=device)

        # check if the model was loaded to the correct device
        assert model.device.type == get_device(device).type
        assert model.policy.device.type == get_device(device).type

        # check if params are still the same after load
        new_params = model.get_parameters()

        # Check that all params are the same as before the save/load procedure
        for object_name in new_params:
            # Skip optimizers (no valid comparison with just th.allclose)
            if "optim" in object_name:
                continue
            for key in params[object_name]:
                assert new_params[object_name][key].device.type == get_device(device).type
                assert th.allclose(
                    params[object_name][key].to("cpu"),
                    new_params[object_name][key].to("cpu")
                ), "Model parameters not the same after save and load."

        # check if the model still selects the same actions
        new_selected_actions, _ = model.predict(observations, deterministic=True)
        assert np.allclose(selected_actions, new_selected_actions, 1e-4)

        # check if learn still works
        model.learn(total_timesteps=300)

        del model

    # clear file from os
    os.remove(tmp_path / "test_save.zip")
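# A usage sketch: in the test suite this function would normally be collected by
# pytest with a parametrization over algorithm classes (the decorator and model
# list below are assumptions, not this file's):
#
#   @pytest.mark.parametrize("model_class", [PPO, A2C, QRDQN, TQC])
#   def test_save_load(tmp_path, model_class): ...
#
# It can also be invoked standalone with a temporary directory:
if __name__ == "__main__":
    import pathlib
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        test_save_load(pathlib.Path(tmp_dir), PPO)  # assumes PPO is imported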
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)

    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    render_env = factory.make_env()  # for rendering

    callback = CallbackList([])

    # Wrap the environment in a parallel-processing-friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs, norm_obs=True, clip_obs=np.inf,
                            norm_reward=False, clip_reward=np.inf)
    else:
        envs = VecNormalize.load(args.stats_path, envs)
    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    print("Do random explorations to build running averages")
    envs.reset()
    for _ in tqdm(range(1000)):
        random_action = np.stack([envs.action_space.sample() for _ in range(n_envs)])
        envs.step(random_action)
    envs.training = False  # freeze the running averages (what a terrible variable name...)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs, device=args.device)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.value_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)
        learner = PPO(MlpPolicy, envs,
                      n_steps=args.n_steps,
                      verbose=1,
                      policy_kwargs=policy_kwargs,
                      device=args.device,
                      target_kl=2e-2)
        if args.device == 'cpu':
            torch.cuda.empty_cache()
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
def run_ensemble_strategy(self, A2C_model_kwargs, PPO_model_kwargs,
                          DDPG_model_kwargs, timesteps_dict):
    """Ensemble strategy that combines PPO, A2C and DDPG."""
    print("============Start Ensemble Strategy============")
    # For the ensemble model, it is necessary to feed the last state
    # of the previous model to the current model as the initial state.
    last_state_ensemble = []

    ppo_sharpe_list = []
    ddpg_sharpe_list = []
    a2c_sharpe_list = []

    model_use = []
    validation_start_date_list = []
    validation_end_date_list = []
    iteration_list = []

    insample_turbulence = self.df[(self.df.date < self.train_period[1])
                                  & (self.df.date >= self.train_period[0])]
    insample_turbulence_threshold = np.quantile(insample_turbulence.turbulence.values, .90)

    start = time.time()
    for i in range(self.rebalance_window + self.validation_window,
                   len(self.unique_trade_date), self.rebalance_window):
        validation_start_date = self.unique_trade_date[
            i - self.rebalance_window - self.validation_window]
        validation_end_date = self.unique_trade_date[i - self.rebalance_window]

        validation_start_date_list.append(validation_start_date)
        validation_end_date_list.append(validation_end_date)
        iteration_list.append(i)

        print("============================================")
        # The initial state is empty on the first rebalance window
        if i - self.rebalance_window - self.validation_window == 0:
            # initial state
            initial = True
        else:
            # previous state
            initial = False

        # Tune the turbulence index based on historical data.
        # The turbulence lookback window is one quarter (63 trading days).
        end_date_index = self.df.index[
            self.df["date"] == self.unique_trade_date[
                i - self.rebalance_window - self.validation_window]].to_list()[-1]
        start_date_index = end_date_index - 63 + 1

        historical_turbulence = self.df.iloc[start_date_index:(end_date_index + 1), :]
        historical_turbulence = historical_turbulence.drop_duplicates(subset=['date'])
        historical_turbulence_mean = np.mean(historical_turbulence.turbulence.values)
        print(historical_turbulence_mean)

        if historical_turbulence_mean > insample_turbulence_threshold:
            # If the mean of the historical data is greater than the 90% quantile
            # of the in-sample turbulence data, we assume the current market is
            # volatile and set the 90% quantile of the in-sample turbulence data
            # as the turbulence threshold, meaning the current turbulence cannot
            # exceed that quantile.
            turbulence_threshold = insample_turbulence_threshold
        else:
            # If the mean of the historical data is less than the 90% quantile of
            # the in-sample turbulence data, we tune up the turbulence threshold,
            # meaning we lower the risk.
            turbulence_threshold = np.quantile(insample_turbulence.turbulence.values, 1)
        print("turbulence_threshold: ", turbulence_threshold)

        ############## Environment Setup starts ##############
        # training env
        train = data_split(
            self.df, start=self.train_period[0],
            end=self.unique_trade_date[i - self.rebalance_window - self.validation_window])
        self.train_env = DummyVecEnv([
            lambda: StockTradingEnv(train, self.stock_dim, self.hmax,
                                    self.initial_amount, self.buy_cost_pct,
                                    self.sell_cost_pct, self.reward_scaling,
                                    self.state_space, self.action_space,
                                    self.tech_indicator_list,
                                    print_verbosity=self.print_verbosity)
        ])

        validation = data_split(
            self.df,
            start=self.unique_trade_date[i - self.rebalance_window - self.validation_window],
            end=self.unique_trade_date[i - self.rebalance_window])
        ############## Environment Setup ends ##############

        ############## Training and Validation starts ##############
        print("======Model training from: ", self.train_period[0], "to ",
              self.unique_trade_date[i - self.rebalance_window - self.validation_window])
        # print("training: ", len(data_split(df, start=20090000, end=test.datadate.unique()[i-rebalance_window])))
        # print("==============Model Training===========")
        print("======A2C Training========")
        model_a2c = self.get_model("a2c", self.train_env, policy="MlpPolicy",
                                   model_kwargs=A2C_model_kwargs)
        model_a2c = self.train_model(model_a2c, "a2c",
                                     tb_log_name="a2c_{}".format(i),
                                     iter_num=i,
                                     total_timesteps=timesteps_dict['a2c'])  # 100_000

        print("======A2C Validation from: ", validation_start_date,
              "to ", validation_end_date)
        val_env_a2c = DummyVecEnv([
            lambda: StockTradingEnv(validation, self.stock_dim, self.hmax,
                                    self.initial_amount, self.buy_cost_pct,
                                    self.sell_cost_pct, self.reward_scaling,
                                    self.state_space, self.action_space,
                                    self.tech_indicator_list,
                                    turbulence_threshold=turbulence_threshold,
                                    iteration=i, model_name='A2C', mode='validation',
                                    print_verbosity=self.print_verbosity)
        ])
        val_obs_a2c = val_env_a2c.reset()
        self.DRL_validation(model=model_a2c, test_data=validation,
                            test_env=val_env_a2c, test_obs=val_obs_a2c)
        sharpe_a2c = self.get_validation_sharpe(i, model_name="A2C")
        print("A2C Sharpe Ratio: ", sharpe_a2c)

        print("======PPO Training========")
        model_ppo = self.get_model("ppo", self.train_env, policy="MlpPolicy",
                                   model_kwargs=PPO_model_kwargs)
        model_ppo = self.train_model(model_ppo, "ppo",
                                     tb_log_name="ppo_{}".format(i),
                                     iter_num=i,
                                     total_timesteps=timesteps_dict['ppo'])  # 100_000

        print("======PPO Validation from: ", validation_start_date,
              "to ", validation_end_date)
        val_env_ppo = DummyVecEnv([
            lambda: StockTradingEnv(validation, self.stock_dim, self.hmax,
                                    self.initial_amount, self.buy_cost_pct,
                                    self.sell_cost_pct, self.reward_scaling,
                                    self.state_space, self.action_space,
                                    self.tech_indicator_list,
                                    turbulence_threshold=turbulence_threshold,
                                    iteration=i, model_name='PPO', mode='validation',
                                    print_verbosity=self.print_verbosity)
        ])
        val_obs_ppo = val_env_ppo.reset()
        self.DRL_validation(model=model_ppo, test_data=validation,
                            test_env=val_env_ppo, test_obs=val_obs_ppo)
        sharpe_ppo = self.get_validation_sharpe(i, model_name="PPO")
        print("PPO Sharpe Ratio: ", sharpe_ppo)

        print("======DDPG Training========")
        model_ddpg = self.get_model("ddpg", self.train_env, policy="MlpPolicy",
                                    model_kwargs=DDPG_model_kwargs)
        model_ddpg = self.train_model(model_ddpg, "ddpg",
                                      tb_log_name="ddpg_{}".format(i),
                                      iter_num=i,
                                      total_timesteps=timesteps_dict['ddpg'])  # 50_000

        print("======DDPG Validation from: ", validation_start_date,
              "to ", validation_end_date)
        val_env_ddpg = DummyVecEnv([
            lambda: StockTradingEnv(validation, self.stock_dim, self.hmax,
                                    self.initial_amount, self.buy_cost_pct,
                                    self.sell_cost_pct, self.reward_scaling,
                                    self.state_space, self.action_space,
                                    self.tech_indicator_list,
                                    turbulence_threshold=turbulence_threshold,
                                    iteration=i, model_name='DDPG', mode='validation',
                                    print_verbosity=self.print_verbosity)
        ])
        val_obs_ddpg = val_env_ddpg.reset()
        self.DRL_validation(model=model_ddpg, test_data=validation,
                            test_env=val_env_ddpg, test_obs=val_obs_ddpg)
        sharpe_ddpg = self.get_validation_sharpe(i, model_name="DDPG")

        ppo_sharpe_list.append(sharpe_ppo)
        a2c_sharpe_list.append(sharpe_a2c)
        ddpg_sharpe_list.append(sharpe_ddpg)

        print("======Best Model Retraining from: ", self.train_period[0],
              "to ", self.unique_trade_date[i - self.rebalance_window])
        # Environment setup for model retraining up to the first trade date
        train_full = data_split(self.df, start=self.train_period[0],
                                end=self.unique_trade_date[i - self.rebalance_window])
        self.train_full_env = DummyVecEnv([
            lambda: StockTradingEnv(train_full, self.stock_dim, self.hmax,
                                    self.initial_amount, self.buy_cost_pct,
                                    self.sell_cost_pct, self.reward_scaling,
                                    self.state_space, self.action_space,
                                    self.tech_indicator_list,
                                    print_verbosity=self.print_verbosity)
        ])
        # Model selection based on the Sharpe ratio
        if (sharpe_ppo >= sharpe_a2c) & (sharpe_ppo >= sharpe_ddpg):
            model_use.append('PPO')
            model_ensemble = self.get_model("ppo", self.train_full_env, policy="MlpPolicy",
                                            model_kwargs=PPO_model_kwargs)
            model_ensemble = self.train_model(model_ensemble, "ensemble",
                                              tb_log_name="ensemble_{}".format(i),
                                              iter_num=i,
                                              total_timesteps=timesteps_dict['ppo'])  # 100_000
        elif (sharpe_a2c > sharpe_ppo) & (sharpe_a2c > sharpe_ddpg):
            model_use.append('A2C')
            model_ensemble = self.get_model("a2c", self.train_full_env, policy="MlpPolicy",
                                            model_kwargs=A2C_model_kwargs)
            model_ensemble = self.train_model(model_ensemble, "ensemble",
                                              tb_log_name="ensemble_{}".format(i),
                                              iter_num=i,
                                              total_timesteps=timesteps_dict['a2c'])  # 100_000
        else:
            model_use.append('DDPG')
            model_ensemble = self.get_model("ddpg", self.train_full_env, policy="MlpPolicy",
                                            model_kwargs=DDPG_model_kwargs)
            model_ensemble = self.train_model(model_ensemble, "ensemble",
                                              tb_log_name="ensemble_{}".format(i),
                                              iter_num=i,
                                              total_timesteps=timesteps_dict['ddpg'])  # 50_000
        ############## Training and Validation ends ##############

        ############## Trading starts ##############
        print("======Trading from: ", self.unique_trade_date[i - self.rebalance_window],
              "to ", self.unique_trade_date[i])
        # print("Used Model: ", model_ensemble)
        last_state_ensemble = self.DRL_prediction(
            model=model_ensemble, name="ensemble",
            last_state=last_state_ensemble, iter_num=i,
            turbulence_threshold=turbulence_threshold,
            initial=initial)
        ############## Trading ends ##############

    end = time.time()
    print("Ensemble Strategy took: ", (end - start) / 60, " minutes")

    df_summary = pd.DataFrame([
        iteration_list, validation_start_date_list, validation_end_date_list,
        model_use, a2c_sharpe_list, ppo_sharpe_list, ddpg_sharpe_list
    ]).T
    df_summary.columns = [
        'Iter', 'Val Start', 'Val End', 'Model Used',
        'A2C Sharpe', 'PPO Sharpe', 'DDPG Sharpe'
    ]

    return df_summary
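# A hypothetical driver for the method above (the agent object name and all
# hyperparameter values are assumptions for illustration, not library defaults):
#
#   df_summary = ensemble_agent.run_ensemble_strategy(
#       A2C_model_kwargs={"n_steps": 5, "ent_coef": 0.005, "learning_rate": 7e-4},
#       PPO_model_kwargs={"n_steps": 2048, "ent_coef": 0.01, "learning_rate": 2.5e-4},
#       DDPG_model_kwargs={"buffer_size": 10_000, "learning_rate": 5e-4},
#       timesteps_dict={"a2c": 100_000, "ppo": 100_000, "ddpg": 50_000},
#   )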
                    seed=args.seed,
                    tensorboard_log=args.tensorboard)
    # --------------------------------------------------------#
    # -------------------------ERROR?-------------------------#
    else:
        raise RuntimeError('Algorithm specified is not registered.')
    # --------------------------------------------------------#

    # Calculate n_timesteps_episode for training
    n_timesteps_episode = env.simulator._eplus_one_epi_len / \
        env.simulator._eplus_run_stepsize
    timesteps = args.episodes * n_timesteps_episode

    # For callbacks processing
    env_vec = DummyVecEnv([lambda: env])

    # Use callbacks during training
    callbacks = []

    # Set up evaluation and saving of the best model
    if args.evaluation:
        eval_callback = LoggerEvalCallback(
            env_vec,
            best_model_save_path='best_model/' + name + '/',
            log_path='best_model/' + name + '/',
            eval_freq=n_timesteps_episode * args.eval_freq,
            deterministic=True,
            render=False,
            n_eval_episodes=args.eval_length)
        callbacks.append(eval_callback)
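    # A sketch of how training would typically be launched once the callback
    # list is assembled (the exact call is not shown in this excerpt, so the
    # model variable and save path are assumptions):
    #
    #   model.learn(total_timesteps=timesteps, callback=CallbackList(callbacks))
    #   model.save(os.path.join('best_model', name, 'final_model'))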
def act(
    flags,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
    initial_agent_state_buffers,
):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = Timings()  # Keep track of how fast things are.

        gym_env = create_env(flags)
        seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        gym_env.seed(seed)
        env = Environment(gym_env)

        def make_env(flags):
            def thunk():
                env = create_env(flags)
                return env
            return thunk

        envs = DummyVecEnv([make_env(flags) for i in range(1)])

        env_output = env.initial()
        envs.reset()
        agent_state = model.initial_state(batch_size=1)
        agent_output, unused_state = model(env_output, agent_state)
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]
            for i, tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][i][...] = tensor

            # Do new rollout.
            for t in range(flags.unroll_length):
                timings.reset()

                with torch.no_grad():
                    agent_output, agent_state = model(env_output, agent_state)
                # timings.time("model")
                env_output = env.step(agent_output["action"])
                # envs.step((torch.randint(0, envs.action_space.n, (envs.num_envs,))).numpy())
                assert agent_output["action"] == env_output["last_action"]
                timings.time("step")

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]
                timings.time("write")
            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e
def test_sync_vec_normalize(make_env):
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True,
                       clip_obs=100.0, clip_reward=100.0)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    if not isinstance(env.observation_space, spaces.Dict):
        env = VecFrameStack(env, 1)
        assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True,
                            norm_reward=True, clip_obs=100.0, clip_reward=100.0)

    if not isinstance(env.observation_space, spaces.Dict):
        eval_env = VecFrameStack(eval_env, 1)

    env.seed(0)
    env.action_space.seed(0)

    env.reset()
    # Initialize running mean
    latest_reward = None
    for _ in range(100):
        _, latest_reward, _, _ = env.step([env.action_space.sample()])

    # Check that the unnormalized reward is the same as the original reward
    original_latest_reward = env.get_original_reward()
    assert np.allclose(original_latest_reward, env.unnormalize_reward(latest_reward))

    obs = env.reset()
    dummy_rewards = np.random.rand(10)
    original_obs = env.get_original_obs()
    # Check that unnormalization works
    assert np.allclose(original_obs, env.unnormalize_obs(obs))
    # Normalization must be different (between different environments)
    assert not np.allclose(obs, eval_env.normalize_obs(original_obs))

    # Test syncing of parameters
    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert np.allclose(obs, eval_env.normalize_obs(original_obs))
    assert np.allclose(env.normalize_reward(dummy_rewards),
                       eval_env.normalize_reward(dummy_rewards))
def test(seed, model_filename, vec_filename, train, test,
         test_as_class=0, render=False, save_file="default.yml"):
    global g_step, g_fMRI_data
    print("Testing:")
    total_rewards = []
    distance_xs = []
    if True:
        g_step = 0
        g_fMRI_data = np.zeros(shape=[args.test_steps, 256], dtype=np.float32)
        print(f"  Seed {seed}, model {model_filename} vec {vec_filename}")
        print(f"  Train on {train}, test on {test}, w/ bodyinfo {test_as_class}")
        if test_as_class >= 0:
            bodyinfo = test_as_class
        else:
            if args.with_bodyinfo:
                bodyinfo = test // 100
            else:
                bodyinfo = 0
        eval_env = utils.make_env(render=render, robot_body=test, body_info=bodyinfo)
        eval_env = DummyVecEnv([eval_env])
        if args.vec_normalize:
            eval_env = VecNormalize.load(vec_filename, eval_env)
            eval_env.norm_reward = False
        eval_env.seed(seed)
        model = PPO.load(model_filename)

        obs = eval_env.reset()
        if render:
            # eval_env.env_method("set_view")
            print("\n\nWait for a while, so I have the time to press Ctrl+F11 "
                  "to enter FullScreen Mode.\n\n")
            time.sleep(3)
        distance_x = 0
        # print(obs)
        total_reward = 0
        for step in tqdm(range(args.test_steps)):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = eval_env.step(action)
            if render:
                eval_env.envs[0].camera_adjust()
                (width, height, rgbPixels, _, _) = eval_env.envs[0].env.env._p.getCameraImage(
                    1920, 1080, renderer=pybullet.ER_BULLET_HARDWARE_OPENGL)
                image = rgbPixels[:, :, :3]
                image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
                cv2.imwrite(
                    f"{folder}/fMRI_videos/getCameraImage_b{test}_s{seed}_{step:05}.png",
                    image)
            if done:
                # it should not matter if the env resets. I guess...
                # break
                pass
            else:
                # the last observation will be after reset, so skip the last
                distance_x = eval_env.envs[0].robot.body_xyz[0]
            total_reward += reward[0]
            # if render:
            #     time.sleep(0.01)
        eval_env.close()
        print(f"train {train}, test {test}, test_as_class {test_as_class}, "
              f"step {step}, total_reward {total_reward}, distance_x {distance_x}")

        if args.save_fmri:
            base_fMRI_data = None
            sorted_data = g_fMRI_data.copy()
            if test != 0 or seed != 0:
                # if sorted_arg exists, use the existing one,
                # because we want to compare the patterns of two experiments
                sorted_arg = np.load(f"{folder}/sorted_arg.npy")
                base_fMRI_data = np.load(f"{folder}/base_fMRI_data.npy")
            else:
                sorted_arg = np.argsort(np.mean(sorted_data, axis=0))
                np.save(f"{folder}/sorted_arg.npy", sorted_arg)
                base_fMRI_data = g_fMRI_data.copy()
                np.save(f"{folder}/base_fMRI_data.npy", base_fMRI_data)
            sorted_data = sorted_data[:, sorted_arg]
            base_fMRI_data = base_fMRI_data[:, sorted_arg]
            for step in tqdm(range(args.test_steps)):
                plt.close()
                plt.figure(figsize=[10, 4])
                if test != 0 or seed != 0:
                    x = sorted_data[step]
                    plt.bar(np.arange(len(x)), x, color=[0.4, 0.7, 0.9, 0.5])
                x = base_fMRI_data[step]
                plt.bar(np.arange(len(x)), x, color=[0.3, 0.3, 0.3, 0.5])
                plt.savefig(f"{folder}/fMRI_videos/barchart_b{test}_s{seed}_{step:05}.png")
                plt.close()

        total_rewards.append(total_reward)
        distance_xs.append(distance_x)

    # avoid yaml turning float64 into a numpy array
    total_rewards = [float(x) for x in total_rewards]
    distance_xs = [float(x) for x in distance_xs]
    data = {
        "title": "test",
        "train": train,
        "test": test,
        "total_reward": total_rewards,
        "distance_x": distance_xs,
    }
    with open(f"{save_file}", "w") as f:
        yaml.dump(data, f)
class BaseRLModel(ABC): """ The base RL model :param policy: (Type[BasePolicy]) Policy object :param env: (Union[GymEnv, str]) The environment to learn from (if registered in Gym, can be str. Can be None for loading trained models) :param policy_base: (Type[BasePolicy]) The base policy used by this method :param learning_rate: (float or callable) learning rate for the optimizer, it can be a function of the current progress (from 1 to 0) :param policy_kwargs: (Dict[str, Any]) Additional arguments to be passed to the policy on creation :param verbose: (int) The verbosity level: 0 none, 1 training information, 2 debug :param device: (Union[th.device, str]) Device on which the code should run. By default, it will try to use a Cuda compatible device and fallback to cpu if it is not possible. :param support_multi_env: (bool) Whether the algorithm supports training with multiple environments (as in A2C) :param create_eval_env: (bool) Whether to create a second environment that will be used for evaluating the agent periodically. (Only available when passing string for the environment) :param monitor_wrapper: (bool) When creating an environment, whether to wrap it or not in a Monitor wrapper. :param seed: (Optional[int]) Seed for the pseudo random generators :param use_sde: (bool) Whether to use generalized State Dependent Exploration (gSDE) instead of action noise exploration (default: False) :param sde_sample_freq: (int) Sample a new noise matrix every n steps when using gSDE Default: -1 (only sample at the beginning of the rollout) """ def __init__(self, policy: Type[BasePolicy], env: Union[GymEnv, str], policy_base: Type[BasePolicy], learning_rate: Union[float, Callable], policy_kwargs: Dict[str, Any] = None, verbose: int = 0, device: Union[th.device, str] = 'auto', support_multi_env: bool = False, create_eval_env: bool = False, monitor_wrapper: bool = True, seed: Optional[int] = None, use_sde: bool = False, sde_sample_freq: int = -1): if isinstance(policy, str) and policy_base is not None: self.policy_class = get_policy_from_name(policy_base, policy) else: self.policy_class = policy self.device = get_device(device) if verbose > 0: print(f"Using {self.device} device") self.env = None # type: Optional[GymEnv] # get VecNormalize object if needed self._vec_normalize_env = unwrap_vec_normalize(env) self.verbose = verbose self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs self.observation_space = None # type: Optional[gym.spaces.Space] self.action_space = None # type: Optional[gym.spaces.Space] self.n_envs = None self.num_timesteps = 0 self.eval_env = None self.seed = seed self.action_noise = None # type: Optional[ActionNoise] self.start_time = None self.policy = None self.learning_rate = learning_rate self.lr_schedule = None # type: Optional[Callable] self._last_obs = None # type: Optional[np.ndarray] # When using VecNormalize: self._last_original_obs = None # type: Optional[np.ndarray] self._episode_num = 0 # Used for gSDE only self.use_sde = use_sde self.sde_sample_freq = sde_sample_freq # Track the training progress (from 1 to 0) # this is used to update the learning rate self._current_progress = 1 # Buffers for logging self.ep_info_buffer = None # type: Optional[deque] self.ep_success_buffer = None # type: Optional[deque] # For logging self._n_updates = 0 # type: int # Create and wrap the env if needed if env is not None: if isinstance(env, str): if create_eval_env: eval_env = gym.make(env) if monitor_wrapper: eval_env = Monitor(eval_env, filename=None) self.eval_env = 
DummyVecEnv([lambda: eval_env]) if self.verbose >= 1: print( "Creating environment from the given name, wrapped in a DummyVecEnv." ) env = gym.make(env) if monitor_wrapper: env = Monitor(env, filename=None) env = DummyVecEnv([lambda: env]) env = self._wrap_env(env) self.observation_space = env.observation_space self.action_space = env.action_space self.n_envs = env.num_envs self.env = env if not support_multi_env and self.n_envs > 1: raise ValueError( "Error: the model does not support multiple envs requires a single vectorized" " environment.") def _wrap_env(self, env: GymEnv) -> VecEnv: if not isinstance(env, VecEnv): if self.verbose >= 1: print("Wrapping the env in a DummyVecEnv.") env = DummyVecEnv([lambda: env]) if is_image_space(env.observation_space) and not isinstance( env, VecTransposeImage): if self.verbose >= 1: print("Wrapping the env in a VecTransposeImage.") env = VecTransposeImage(env) return env @abstractmethod def _setup_model(self) -> None: """ Create networks, buffer and optimizers """ raise NotImplementedError() def _get_eval_env(self, eval_env: Optional[GymEnv]) -> Optional[GymEnv]: """ Return the environment that will be used for evaluation. :param eval_env: (Optional[GymEnv])) :return: (Optional[GymEnv]) """ if eval_env is None: eval_env = self.eval_env if eval_env is not None: eval_env = self._wrap_env(eval_env) assert eval_env.num_envs == 1 return eval_env def _setup_lr_schedule(self) -> None: """Transform to callable if needed.""" self.lr_schedule = get_schedule_fn(self.learning_rate) def _update_current_progress(self, num_timesteps: int, total_timesteps: int) -> None: """ Compute current progress (from 1 to 0) :param num_timesteps: current number of timesteps :param total_timesteps: """ self._current_progress = 1.0 - float(num_timesteps) / float( total_timesteps) def _update_learning_rate( self, optimizers: Union[List[th.optim.Optimizer], th.optim.Optimizer]) -> None: """ Update the optimizers learning rate using the current learning rate schedule and the current progress (from 1 to 0). :param optimizers: (Union[List[th.optim.Optimizer], th.optim.Optimizer]) An optimizer or a list of optimizers. """ # Log the current learning rate logger.logkv("learning_rate", self.lr_schedule(self._current_progress)) if not isinstance(optimizers, list): optimizers = [optimizers] for optimizer in optimizers: update_learning_rate(optimizer, self.lr_schedule(self._current_progress)) @staticmethod def safe_mean(arr: Union[np.ndarray, list, deque]) -> np.ndarray: """ Compute the mean of an array if there is at least one element. For empty array, return NaN. It is used for logging only. :param arr: :return: """ return np.nan if len(arr) == 0 else np.mean(arr) def get_env(self) -> Optional[VecEnv]: """ Returns the current environment (can be None if not defined). :return: (Optional[VecEnv]) The current environment """ return self.env def get_vec_normalize_env(self) -> Optional[VecNormalize]: """ Return the ``VecNormalize`` wrapper of the training env if it exists. :return: Optional[VecNormalize] The ``VecNormalize`` env. """ return self._vec_normalize_env @staticmethod def check_env(env: GymEnv, observation_space: gym.spaces.Space, action_space: gym.spaces.Space): """ Checks the validity of the environment to load vs the one used for training. 
Checked parameters: - observation_space - action_space :param env: (GymEnv) :param observation_space: (gym.spaces.Space) :param action_space: (gym.spaces.Space) """ if (observation_space != env.observation_space # Special cases for images that need to be transposed and not (is_image_space(env.observation_space) and observation_space == VecTransposeImage.transpose_space( env.observation_space))): raise ValueError( f'Observation spaces do not match: {observation_space} != {env.observation_space}' ) if action_space != env.action_space: raise ValueError( f'Action spaces do not match: {action_space} != {env.action_space}' ) def set_env(self, env: GymEnv) -> None: """ Checks the validity of the environment, and if it is coherent, set it as the current environment. Furthermore wrap any non vectorized env into a vectorized checked parameters: - observation_space - action_space :param env: The environment for learning a policy """ self.check_env(env, self.observation_space, self.action_space) # it must be coherent now # if it is not a VecEnv, make it a VecEnv env = self._wrap_env(env) self.n_envs = env.num_envs self.env = env def get_torch_variables(self) -> Tuple[List[str], List[str]]: """ Get the name of the torch variable that will be saved. ``th.save`` and ``th.load`` will be used with the right device instead of the default pickling strategy. :return: (Tuple[List[str], List[str]]) name of the variables with state dicts to save, name of additional torch tensors, """ state_dicts = ["policy"] return state_dicts, [] @abstractmethod def learn(self, total_timesteps: int, callback: MaybeCallback = None, log_interval: int = 100, tb_log_name: str = "run", eval_env: Optional[GymEnv] = None, eval_freq: int = -1, n_eval_episodes: int = 5, eval_log_path: Optional[str] = None, reset_num_timesteps: bool = True) -> 'BaseRLModel': """ Return a trained model. :param total_timesteps: (int) The total number of samples to train on :param callback: (function (dict, dict)) -> boolean function called at every steps with state of the algorithm. It takes the local and global variables. If it returns False, training is aborted. :param log_interval: (int) The number of timesteps before logging. :param tb_log_name: (str) the name of the run for tensorboard log :param reset_num_timesteps: (bool) whether or not to reset the current timestep number (used in logging) :param eval_env: (gym.Env) Environment that will be used to evaluate the agent :param eval_freq: (int) Evaluate the agent every ``eval_freq`` timesteps (this may vary a little) :param n_eval_episodes: (int) Number of episode to evaluate the agent :param eval_log_path: (Optional[str]) Path to a folder where the evaluations will be saved :param reset_num_timesteps: (bool) :return: (BaseRLModel) the trained model """ raise NotImplementedError() def predict( self, observation: np.ndarray, state: Optional[np.ndarray] = None, mask: Optional[np.ndarray] = None, deterministic: bool = False ) -> Tuple[np.ndarray, Optional[np.ndarray]]: """ Get the model's action(s) from an observation :param observation: (np.ndarray) the input observation :param state: (Optional[np.ndarray]) The last states (can be None, used in recurrent policies) :param mask: (Optional[np.ndarray]) The last masks (can be None, used in recurrent policies) :param deterministic: (bool) Whether or not to return deterministic actions. 
:return: (Tuple[np.ndarray, Optional[np.ndarray]]) the model's action and the next state (used in recurrent policies) """ return self.policy.predict(observation, state, mask, deterministic) @classmethod def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs): """ Load the model from a zip-file :param load_path: the location of the saved data :param env: the new environment to run the loaded model on (can be None if you only need prediction from a trained model) has priority over any saved environment :param kwargs: extra arguments to change the model when loading """ data, params, tensors = cls._load_from_file(load_path) if 'policy_kwargs' in data: for arg_to_remove in ['device']: if arg_to_remove in data['policy_kwargs']: del data['policy_kwargs'][arg_to_remove] if 'policy_kwargs' in kwargs and kwargs['policy_kwargs'] != data[ 'policy_kwargs']: raise ValueError( f"The specified policy kwargs do not equal the stored policy kwargs." f"Stored kwargs: {data['policy_kwargs']}, specified kwargs: {kwargs['policy_kwargs']}" ) # check if observation space and action space are part of the saved parameters if ("observation_space" not in data or "action_space" not in data) and "env" not in data: raise ValueError( "The observation_space and action_space was not given, can't verify new environments" ) # check if given env is valid if env is not None: cls.check_env(env, data["observation_space"], data["action_space"]) # if no new env was given use stored env if possible if env is None and "env" in data: env = data["env"] # noinspection PyArgumentList model = cls(policy=data["policy_class"], env=env, device='auto', _init_setup_model=False) # load parameters model.__dict__.update(data) model.__dict__.update(kwargs) if not hasattr(model, "_setup_model") and len(params) > 0: raise NotImplementedError( f"{cls} has no ``_setup_model()`` method") model._setup_model() # put state_dicts back in place for name in params: attr = recursive_getattr(model, name) attr.load_state_dict(params[name]) # put tensors back in place if tensors is not None: for name in tensors: recursive_setattr(model, name, tensors[name]) return model @staticmethod def _load_from_file( load_path: str, load_data: bool = True ) -> (Tuple[Optional[Dict[str, Any]], Optional[TensorDict], Optional[TensorDict]]): """ Load model data from a .zip archive :param load_path: Where to load the model from :param load_data: Whether we should load and return data (class parameters). Mainly used by 'load_parameters' to only load model parameters (weights) :return: (dict),(dict),(dict) Class parameters, model state_dicts (dict of state_dict) and dict of extra tensors """ # Check if file exists if load_path is a string if isinstance(load_path, str): if not os.path.exists(load_path): if os.path.exists(load_path + ".zip"): load_path += ".zip" else: raise ValueError( f"Error: the file {load_path} could not be found") # set device to cpu if cuda is not available device = get_device() # Open the zip archive and load data try: with zipfile.ZipFile(load_path, "r") as archive: namelist = archive.namelist() # If data or parameters is not in the # zip archive, assume they were stored # as None (_save_to_file_zip allows this). 
data = None tensors = None params = {} if "data" in namelist and load_data: # Load class parameters and convert to string json_data = archive.read("data").decode() data = json_to_data(json_data) if "tensors.pth" in namelist and load_data: # Load extra tensors with archive.open('tensors.pth', mode="r") as tensor_file: # File has to be seekable, but opt_param_file is not, so load in BytesIO first # fixed in python >= 3.7 file_content = io.BytesIO() file_content.write(tensor_file.read()) # go to start of file file_content.seek(0) # load the parameters with the right ``map_location`` tensors = th.load(file_content, map_location=device) # check for all other .pth files other_files = [ file_name for file_name in namelist if os.path.splitext(file_name)[1] == ".pth" and file_name != "tensors.pth" ] # if there are any other files which end with .pth and aren't "params.pth" # assume that they each are optimizer parameters if len(other_files) > 0: for file_path in other_files: with archive.open(file_path, mode="r") as opt_param_file: # File has to be seekable, but opt_param_file is not, so load in BytesIO first # fixed in python >= 3.7 file_content = io.BytesIO() file_content.write(opt_param_file.read()) # go to start of file file_content.seek(0) # load the parameters with the right ``map_location`` params[os.path.splitext(file_path)[0]] = th.load( file_content, map_location=device) except zipfile.BadZipFile: # load_path wasn't a zip file raise ValueError(f"Error: the file {load_path} wasn't a zip-file") return data, params, tensors def set_random_seed(self, seed: Optional[int] = None) -> None: """ Set the seed of the pseudo-random generators (python, numpy, pytorch, gym, action_space) :param seed: (int) """ if seed is None: return set_random_seed(seed, using_cuda=self.device == th.device('cuda')) self.action_space.seed(seed) if self.env is not None: self.env.seed(seed) if self.eval_env is not None: self.eval_env.seed(seed) def _init_callback(self, callback: Union[None, Callable, List[BaseCallback], BaseCallback], eval_env: Optional[VecEnv] = None, eval_freq: int = 10000, n_eval_episodes: int = 5, log_path: Optional[str] = None) -> BaseCallback: """ :param callback: (Union[callable, [BaseCallback], BaseCallback, None]) :return: (BaseCallback) """ # Convert a list of callbacks into a callback if isinstance(callback, list): callback = CallbackList(callback) # Convert functional callback to object if not isinstance(callback, BaseCallback): callback = ConvertCallback(callback) # Create eval callback in charge of the evaluation if eval_env is not None: eval_callback = EvalCallback(eval_env, best_model_save_path=log_path, log_path=log_path, eval_freq=eval_freq, n_eval_episodes=n_eval_episodes) callback = CallbackList([callback, eval_callback]) callback.init_callback(self) return callback def _setup_learn( self, eval_env: Optional[GymEnv], callback: Union[None, Callable, List[BaseCallback], BaseCallback] = None, eval_freq: int = 10000, n_eval_episodes: int = 5, log_path: Optional[str] = None, reset_num_timesteps: bool = True, ) -> 'BaseCallback': """ Initialize different variables needed for training. 
:param eval_env: (Optional[GymEnv]) :param callback: (Union[None, BaseCallback, List[BaseCallback, Callable]]) :param eval_freq: (int) :param n_eval_episodes: (int) :param log_path (Optional[str]): Path to a log folder :param reset_num_timesteps: (bool) Whether to reset or not the ``num_timesteps`` attribute :return: (BaseCallback) """ self.start_time = time.time() self.ep_info_buffer = deque(maxlen=100) self.ep_success_buffer = deque(maxlen=100) if self.action_noise is not None: self.action_noise.reset() if reset_num_timesteps: self.num_timesteps = 0 self._episode_num = 0 # Avoid resetting the environment when calling ``.learn()`` consecutive times if reset_num_timesteps or self._last_obs is None: self._last_obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: self._last_original_obs = self._vec_normalize_env.get_original_obs( ) if eval_env is not None and self.seed is not None: eval_env.seed(self.seed) eval_env = self._get_eval_env(eval_env) # Create eval callback if needed callback = self._init_callback(callback, eval_env, eval_freq, n_eval_episodes, log_path) return callback def _update_info_buffer(self, infos: List[Dict[str, Any]], dones: Optional[np.ndarray] = None) -> None: """ Retrieve reward and episode length and update the buffer if using Monitor wrapper. :param infos: ([dict]) """ if dones is None: dones = np.array([False] * len(infos)) for idx, info in enumerate(infos): maybe_ep_info = info.get('episode') maybe_is_success = info.get('is_success') if maybe_ep_info is not None: self.ep_info_buffer.extend([maybe_ep_info]) if maybe_is_success is not None and dones[idx]: self.ep_success_buffer.append(maybe_is_success) @staticmethod def _save_to_file_zip(save_path: str, data: Dict[str, Any] = None, params: Dict[str, Any] = None, tensors: Dict[str, Any] = None) -> None: """ Save model to a zip archive. :param save_path: Where to store the model :param data: Class parameters being stored :param params: Model parameters being stored expected to contain an entry for every state_dict with its name and the state_dict :param tensors: Extra tensor variables expected to contain name and value of tensors """ # data/params can be None, so do not # try to serialize them blindly if data is not None: serialized_data = data_to_json(data) # Check postfix if save_path is a string if isinstance(save_path, str): _, ext = os.path.splitext(save_path) if ext == "": save_path += ".zip" # Create a zip-archive and write our objects # there. This works when save_path is either # str or a file-like with zipfile.ZipFile(save_path, "w") as archive: # Do not try to save "None" elements if data is not None: archive.writestr("data", serialized_data) if tensors is not None: with archive.open('tensors.pth', mode="w") as tensors_file: th.save(tensors, tensors_file) if params is not None: for file_name, dict_ in params.items(): with archive.open(file_name + '.pth', mode="w") as param_file: th.save(dict_, param_file) def excluded_save_params(self) -> List[str]: """ Returns the names of the parameters that should be excluded by default when saving the model. :return: ([str]) List of parameters that should be excluded from save """ return [ "policy", "device", "env", "eval_env", "replay_buffer", "rollout_buffer", "_vec_normalize_env" ] def save(self, path: str, exclude: Optional[List[str]] = None, include: Optional[List[str]] = None) -> None: """ Save all the attributes of the object and the model parameters in a zip-file. 
    :param path: path to the file where the rl agent should be saved
    :param exclude: names of parameters that should be excluded in addition to the default ones
    :param include: names of parameters that might be excluded but should be included anyway
    """
    # Copy the parameter dict so we don't mutate the original
    data = self.__dict__.copy()

    # Use standard list of excluded parameters if none given
    if exclude is None:
        exclude = self.excluded_save_params()
    else:
        # Append standard exclude params to the given params
        exclude.extend([param for param in self.excluded_save_params() if param not in exclude])

    # Do not exclude params if they are specifically included
    if include is not None:
        exclude = [param_name for param_name in exclude if param_name not in include]

    state_dicts_names, tensors_names = self.get_torch_variables()
    # Any variable stored as a torch state_dict or tensor must not
    # also be serialized as part of ``data``
    torch_variables = state_dicts_names + tensors_names
    for torch_var in torch_variables:
        # We only need the name of the top-most module, as that is what we remove
        var_name = torch_var.split('.')[0]
        exclude.append(var_name)

    # Remove parameter entries of parameters which are to be excluded
    for param_name in exclude:
        data.pop(param_name, None)

    # Build dict of tensor variables
    tensors = None
    if tensors_names is not None:
        tensors = {}
        for name in tensors_names:
            attr = recursive_getattr(self, name)
            tensors[name] = attr

    # Build dict of state_dicts
    params_to_save = {}
    for name in state_dicts_names:
        attr = recursive_getattr(self, name)
        # Retrieve state dict
        params_to_save[name] = attr.state_dict()

    self._save_to_file_zip(path, data=data, params=params_to_save, tensors=tensors)
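# For context, a hedged sketch of the user-facing round trip that exercises the
# ``save`` method and the zip loader above; "CartPole-v1" is only an example
# env, and ``peek_model_zip`` is our own helper name, not library API.
import io
import zipfile

import torch as th
from stable_baselines3 import PPO

model = PPO("MlpPolicy", "CartPole-v1", verbose=0)
model.learn(total_timesteps=1000)
model.save("ppo_cartpole")        # ".zip" is appended automatically
del model
model = PPO.load("ppo_cartpole")  # data, params and tensors are restored


def peek_model_zip(load_path, device="cpu"):
    """List the archive members and read back the extra tensors (if present),
    mirroring the loader above; BytesIO makes the raw bytes seekable."""
    with zipfile.ZipFile(load_path, "r") as archive:
        print("archive members:", archive.namelist())
        if "tensors.pth" in archive.namelist():
            return th.load(io.BytesIO(archive.read("tensors.pth")), map_location=device)
        return None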
if args.smp_bodies_aligned:
    common.args.custom_alignment = "::".join([cheetah_orders[x] for x in robot_bodies])

hyperparams = common.load_hyperparameters(conf_name=args.rl_hyperparameter)
common.args.model_filename = "output_data/tmp/aligned/sd0/best_model.zip"

# for test_body in robot_bodies + zero_shot_bodies:
for test_body in ["cheetah_3_balanced"]:
    if args.smp_bodies_aligned:
        common.args.custom_alignment = cheetah_orders[test_body]
    # if test_body == "cheetah_6_front":
    #     eval_venv = DummyVecEnv([make_env(test_body)])
    # else:
    eval_venv = DummyVecEnv([make_env(test_body, rank=0)])  # avoid visualization
    if args.vec_normalize:
        eval_venv = VecNormalize.load(
            common.get_vec_pkl_from_model_filename(args.model_filename), eval_venv)
    hyperparams = common.clean_hyperparams_before_run(hyperparams)
    model = PPO.load(common.args.model_filename, env=eval_venv, **hyperparams)
    obs = eval_venv.reset()
    while True:
        # predict returns an (action, hidden_state) tuple
        action, _ = model.predict(obs)
        obs, reward, done, _ = eval_venv.step(action)
        if done:
            obs = eval_venv.reset()
"total_timesteps": 200, "env_name": "CartPole-v1", } run = wandb.init( project="sb3", config=config, sync_tensorboard=True, # auto-upload sb3's tensorboard metrics save_code=True, # optional ) def make_env(): env = gym.make(config["env_name"]) env = Monitor(env) # record stats such as returns return env env = DummyVecEnv([make_env]) model = PPO(config["policy_type"], env, verbose=1, tensorboard_log=f"runs/{run.name}") model.learn( total_timesteps=config["total_timesteps"], callback=WandbCallback( gradient_save_freq=100, model_save_path=f"models/{run.name}", ), )
import gym
import time

from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback

# Create a DummyVecEnv for main airsim gym env
env = DummyVecEnv([
    lambda: Monitor(
        gym.make(
            "airgym:airsim-car-sample-v0",
            ip_address="127.0.0.1",
            image_shape=(84, 84, 1),
        ))
])

# Wrap env as VecTransposeImage so SB3 can handle frame observations
env = VecTransposeImage(env)

# Initialize RL algorithm type and parameters
model = DQN(
    "CnnPolicy",
    env,
    learning_rate=0.00025,
    verbose=1,
    batch_size=32,
    train_freq=4,
    target_update_interval=10000,
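# The DQN constructor above is cut off in the source; rather than guess the
# remaining arguments, here is a hedged sketch of how the (so far unused)
# evaluate_policy import would typically be applied once `model` exists:
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)
print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")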
if args.realign_method != "": default_wrapper.append(wrapper.ReAlignedWrapper) elif args.topology_wrapper == "diff": default_wrapper.append(wrapper_diff.get_wrapper_class()) elif args.topology_wrapper == "MutantWrapper": default_wrapper.append(wrapper_mut.MutantWrapper) elif args.topology_wrapper == "CustomAlignWrapper": default_wrapper.append(wrapper_custom_align.CustomAlignWrapper) else: pass # no need for wrapper for rank_idx, test_body in enumerate(args.test_bodies): eval_venv = DummyVecEnv([ gym_interface.make_env(rank=rank_idx, seed=common.seed, wrappers=default_wrapper, force_render=args.render, robot_body=test_body, dataset_folder="../input_data/bodies") ]) if args.vec_normalize: raise NotImplementedError # normalize_kwargs["gamma"] = hyperparams["gamma"] # eval_venv = VecNormalize(eval_venv, **normalize_kwargs) if args.stack_frames > 1: eval_venv = VecFrameStack(eval_venv, args.stack_frames) eval_venv.seed(common.seed) model = PPO.load(args.model_filename) obs = eval_venv.reset()
def run_ensemble_strategy(df, unique_trade_date, rebalance_window, validation_window) -> None:
    """Ensemble Strategy that combines PPO, A2C and DDPG."""
    print("============Start Ensemble Strategy============")
    # For the ensemble model it is necessary to feed the last state
    # of the previous model to the current model as the initial state
    last_state_ensemble = []

    ppo_sharpe_list = []
    ddpg_sharpe_list = []
    a2c_sharpe_list = []
    model_use = []

    # Based on the analysis of the in-sample data
    # turbulence_threshold = 140
    insample_turbulence = df[(df.datadate < 20151000) & (df.datadate >= 20090000)]
    insample_turbulence = insample_turbulence.drop_duplicates(subset=['datadate'])
    insample_turbulence_threshold = np.quantile(insample_turbulence.turbulence.values, .90)

    start = time.time()
    for i in range(rebalance_window + validation_window, len(unique_trade_date), rebalance_window):
        print("============================================")
        ## Initial state is empty on the first rebalance
        if i - rebalance_window - validation_window == 0:
            initial = True
        else:
            # previous state
            initial = False

        # Tune the turbulence threshold based on historical data.
        # Turbulence lookback window is one quarter.
        end_date_index = df.index[df["datadate"] == unique_trade_date[
            i - rebalance_window - validation_window]].to_list()[-1]
        start_date_index = end_date_index - validation_window * 30 + 1

        historical_turbulence = df.iloc[start_date_index:(end_date_index + 1), :]
        # historical_turbulence = df[(df.datadate < unique_trade_date[i - rebalance_window - validation_window]) & (df.datadate >= (unique_trade_date[i - rebalance_window - validation_window - 63]))]
        historical_turbulence = historical_turbulence.drop_duplicates(subset=['datadate'])
        historical_turbulence_mean = np.mean(historical_turbulence.turbulence.values)

        if historical_turbulence_mean > insample_turbulence_threshold:
            # If the mean of the historical data is greater than the 90% quantile
            # of the in-sample turbulence data, we assume the current market is
            # volatile and use that 90% quantile as the turbulence threshold,
            # i.e. the current turbulence may not exceed it.
            turbulence_threshold = insample_turbulence_threshold
        else:
            # Otherwise we raise the turbulence_threshold (here to the maximum of
            # the in-sample values), tolerating more turbulence before de-risking.
            turbulence_threshold = np.quantile(insample_turbulence.turbulence.values, 1)
        print("turbulence_threshold: ", turbulence_threshold)

        ############## Environment Setup starts ##############
        ## Training env
        train = data_split(df, start=20090000,
                           end=unique_trade_date[i - rebalance_window - validation_window])
        env_train = DummyVecEnv([lambda: StockEnvTrain(train)])

        ## Validation env
        validation = data_split(df,
                                start=unique_trade_date[i - rebalance_window - validation_window],
                                end=unique_trade_date[i - rebalance_window])
        env_val = DummyVecEnv([lambda: StockEnvValidation(validation,
                                                          turbulence_threshold=turbulence_threshold,
                                                          iteration=i)])
        obs_val = env_val.reset()
        ############## Environment Setup ends ##############

        ############## Training and Validation starts ##############
        print("======Model training from: ", 20090000, "to ",
              unique_trade_date[i - rebalance_window - validation_window])
        # print("training: ", len(data_split(df, start=20090000, end=test.datadate.unique()[i - rebalance_window])))
        # print("==============Model Training===========")
        print("======A2C Training========")
        model_a2c = train_A2C(env_train, model_name="A2C_30k_dow_{}".format(i),
                              timesteps=30000)
        print("======A2C Validation from: ",
              unique_trade_date[i - rebalance_window - validation_window], "to ",
              unique_trade_date[i - rebalance_window])
        DRL_validation(model=model_a2c, test_data=validation, test_env=env_val, test_obs=obs_val)
        sharpe_a2c = get_validation_sharpe(i)
        print("A2C Sharpe Ratio: ", sharpe_a2c)

        print("======PPO Training========")
        model_ppo = train_PPO(env_train, model_name="PPO_100k_dow_{}".format(i), timesteps=100000)
        print("======PPO Validation from: ",
              unique_trade_date[i - rebalance_window - validation_window], "to ",
              unique_trade_date[i - rebalance_window])
        DRL_validation(model=model_ppo, test_data=validation, test_env=env_val, test_obs=obs_val)
        sharpe_ppo = get_validation_sharpe(i)
        print("PPO Sharpe Ratio: ", sharpe_ppo)

        print("======DDPG Training========")
        model_ddpg = train_DDPG(env_train, model_name="DDPG_10k_dow_{}".format(i), timesteps=10000)
        # model_ddpg = train_TD3(env_train, model_name="DDPG_10k_dow_{}".format(i), timesteps=20000)
        print("======DDPG Validation from: ",
              unique_trade_date[i - rebalance_window - validation_window], "to ",
              unique_trade_date[i - rebalance_window])
        DRL_validation(model=model_ddpg, test_data=validation, test_env=env_val, test_obs=obs_val)
        sharpe_ddpg = get_validation_sharpe(i)

        ppo_sharpe_list.append(sharpe_ppo)
        a2c_sharpe_list.append(sharpe_a2c)
        ddpg_sharpe_list.append(sharpe_ddpg)

        # Model selection based on Sharpe ratio
        if (sharpe_ppo >= sharpe_a2c) & (sharpe_ppo >= sharpe_ddpg):
            model_ensemble = model_ppo
            model_use.append('PPO')
        elif (sharpe_a2c > sharpe_ppo) & (sharpe_a2c > sharpe_ddpg):
            model_ensemble = model_a2c
            model_use.append('A2C')
        else:
            model_ensemble = model_ddpg
            model_use.append('DDPG')
        ############## Training and Validation ends ##############

        ############## Trading starts ##############
        print("======Trading from: ", unique_trade_date[i - rebalance_window], "to ", unique_trade_date[i])
        # print("Used Model: ", model_ensemble)
        last_state_ensemble = DRL_prediction(df=df, model=model_ensemble, name="ensemble",
                                             last_state=last_state_ensemble, iter_num=i,
                                             unique_trade_date=unique_trade_date,
                                             rebalance_window=rebalance_window,
                                             turbulence_threshold=turbulence_threshold,
                                             initial=initial)
        # print("============Trading Done============")
        ############## Trading ends ##############

    end = time.time()
    print("Ensemble Strategy took: ", (end - start) / 60, " minutes")
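# Hypothetical invocation of run_ensemble_strategy (the date range and the
# 63-day windows are illustrative, not taken from the source; 63 trading days
# is roughly the one-quarter lookback the comments above refer to):
unique_trade_date = df[(df.datadate > 20151001) & (df.datadate <= 20200707)].datadate.unique()
run_ensemble_strategy(df=df,
                      unique_trade_date=unique_trade_date,
                      rebalance_window=63,
                      validation_window=63)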
        },
    })
    env.reset()
    return env


def test_env():
    env = train_env()
    env.configure({"policy_frequency": 15, "duration": 20 * 15})
    env.reset()
    return env


if __name__ == '__main__':
    # Train
    model = DQN('CnnPolicy', DummyVecEnv([train_env]),
                learning_rate=5e-4,
                buffer_size=15000,
                learning_starts=200,
                batch_size=32,
                gamma=0.8,
                train_freq=1,
                gradient_steps=1,
                target_update_interval=50,
                exploration_fraction=0.7,
                verbose=1,
                tensorboard_log="highway_cnn/")
    model.learn(total_timesteps=int(1e5))
    model.save("highway_cnn/model")

    # Record video
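    # The snippet stops at the video-recording step; the original recording
    # code is not shown, so as a hedged stand-in here is a minimal replay of
    # the saved model (one 20-second episode at 15 Hz, matching test_env):
    env = DummyVecEnv([test_env])
    model = DQN.load("highway_cnn/model", env=env)
    obs = env.reset()
    for _ in range(20 * 15):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)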
# if args.cnspns:
#     # hard-coded for now; could be automatically determined.
#     _w = wrapper_pns.make_same_dim_wrapper(obs_dim=28, action_dim=8)
#     default_wrapper.append(_w)

assert len(args.train_bodies) > 0, "No body to train."
if args.with_bodyinfo:
    default_wrapper.append(wrapper.BodyinfoWrapper)

print("Making train environments...")
venv = DummyVecEnv([
    gym_interface.make_env(rank=i,
                           seed=common.seed,
                           wrappers=default_wrapper,
                           render=args.render,
                           robot_body=args.train_bodies[i % len(args.train_bodies)],
                           dataset_folder=args.body_folder) for i in range(args.num_venvs)
])

normalize_kwargs = {}
if args.vec_normalize:
    normalize_kwargs["gamma"] = hyperparams["gamma"]
    if len(args.model_filename) > 0:
        venv = VecNormalize.load(
            common.get_vec_pkl_from_model_filename(args.model_filename), venv)
    else:
        venv = VecNormalize(venv, **normalize_kwargs)
def setup_test(self):
    env_fun = my_utils.import_env(env_config["env_name"])
    self.env = DummyVecEnv([lambda: env_fun(config)])
    self.policy = my_utils.make_par_policy(self.env, config)
    self.policy.load_state_dict(T.load(config["test_agent_path"]))
def make_vec_env(
    env_name: str,
    n_envs: int = 8,
    seed: int = 0,
    parallel: bool = False,
    log_dir: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
    post_wrappers: Optional[Sequence[Callable[[gym.Env, int], gym.Env]]] = None,
) -> VecEnv:
    """Returns a VecEnv initialized with `n_envs` Envs.

    Args:
        env_name: The Env's string id in Gym.
        n_envs: The number of duplicate environments.
        seed: The environment seed.
        parallel: If True, uses SubprocVecEnv; otherwise, DummyVecEnv.
        log_dir: If specified, saves Monitor output to this directory.
        max_episode_steps: If specified, wraps each env in a TimeLimit wrapper
            with this episode length. If not specified and `max_episode_steps`
            exists for this `env_name` in the Gym registry, uses the registry
            `max_episode_steps` for every TimeLimit wrapper (this automatic
            wrapper is the default behavior when calling `gym.make`). Otherwise
            the environments are passed into the VecEnv unwrapped.
        post_wrappers: If specified, iteratively wraps each environment with
            each of the wrappers specified in the sequence. The argument should
            be a Callable accepting two arguments, the Env to be wrapped and the
            environment index, and returning the wrapped Env.
    """
    # Resolve the spec outside of the subprocess first, so that it is available
    # to subprocesses running `make_env` via automatic pickling.
    # spec = gym.spec(env_name)

    def make_env(i, this_seed):
        # Previously, we directly called `gym.make(env_name)`, but running
        # `imitation.scripts.train_adversarial` within `imitation.scripts.parallel`
        # created a weird interaction between Gym and Ray -- `gym.make` would fail
        # inside this function for any of our custom environments unless those
        # environments were also `gym.register()`ed inside `make_env`. Even
        # registering the custom environment in the scope of `make_vec_env` didn't
        # work. For more discussion and hypotheses on this issue see PR #160:
        # https://github.com/HumanCompatibleAI/imitation/pull/160.
        # env = spec.make()
        target_machine_ip = '127.0.0.1'  # for a simulated robot environment
        env = gym.make(env_name, ip=target_machine_ip, gui=False)
        env = ExceptionHandling(env)

        # Seed each environment with a different, non-sequential seed for diversity
        # (even if the caller is passing us sequentially-assigned base seeds).
        # int() is necessary to work around a gym bug where it chokes on numpy int64s.
        env.seed(int(this_seed))

        if max_episode_steps is not None:
            env = TimeLimit(env, max_episode_steps)
        elif env.spec is not None and env.spec.max_episode_steps is not None:
            # The module-level `spec` lookup is commented out above, so fall
            # back to the spec attached to the instantiated env.
            env = TimeLimit(env, max_episode_steps=env.spec.max_episode_steps)

        # Use Monitor to record statistics needed for Baselines algorithms logging;
        # optionally, save them to disk.
        log_path = None
        if log_dir is not None:
            log_subdir = os.path.join(log_dir, "monitor")
            os.makedirs(log_subdir, exist_ok=True)
            log_path = os.path.join(log_subdir, f"mon{i:03d}")
        env = monitor.Monitor(env, log_path)
        env = wrappers.RolloutInfoWrapper(env)

        if post_wrappers:
            for wrapper in post_wrappers:
                env = wrapper(env, i)

        return env

    rng = np.random.RandomState(seed)
    env_seeds = rng.randint(0, (1 << 31) - 1, (n_envs,))
    env_fns = [functools.partial(make_env, i, s) for i, s in enumerate(env_seeds)]
    if parallel:
        # See GH hill-a/stable-baselines issue #217
        return SubprocVecEnv(env_fns, start_method="forkserver")
    else:
        return DummyVecEnv(env_fns)
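# Design note on the seed fan-out above, as a runnable sketch: drawing env
# seeds from a RandomState keyed on the base seed gives each env a distinct,
# non-sequential stream, even when callers pass sequential base seeds.
import numpy as np

rng = np.random.RandomState(0)
print(rng.randint(0, (1 << 31) - 1, (4,)))  # e.g. four independent env seeds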
def test_save_load_policy(tmp_path, model_class, policy_str):
    """
    Test saving and loading policy only.

    :param model_class: (BaseAlgorithm) A RL model
    :param policy_str: (str) Name of the policy.
    """
    kwargs = {}
    if policy_str == "MlpPolicy":
        env = select_env(model_class)
    else:
        if model_class in [SAC, TD3, DQN]:
            # Avoid memory error when using replay buffer
            # Reduce the size of the features
            kwargs = dict(buffer_size=250)
        env = FakeImageEnv(screen_height=40, screen_width=40, n_channels=2,
                           discrete=model_class == DQN)

    env = DummyVecEnv([lambda: env])

    # create model
    model = model_class(policy_str, env, policy_kwargs=dict(net_arch=[16]), verbose=1, **kwargs)
    model.learn(total_timesteps=500, eval_freq=250)

    env.reset()
    observations = np.concatenate(
        [env.step([env.action_space.sample()])[0] for _ in range(10)], axis=0)

    policy = model.policy
    policy_class = policy.__class__
    actor, actor_class = None, None
    if model_class in [SAC, TD3]:
        actor = policy.actor
        actor_class = actor.__class__

    # Get dictionary of current parameters
    params = deepcopy(policy.state_dict())

    # Modify all parameters to be random values
    random_params = dict((param_name, th.rand_like(param))
                         for param_name, param in params.items())

    # Update model parameters with the new random values
    policy.load_state_dict(random_params)

    new_params = policy.state_dict()
    # Check that all params are different now
    for k in params:
        assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected."

    params = new_params

    # get selected actions
    selected_actions, _ = policy.predict(observations, deterministic=True)
    # Should also work with the actor only
    if actor is not None:
        selected_actions_actor, _ = actor.predict(observations, deterministic=True)

    # Save and load policy
    policy.save(tmp_path / "policy.pkl")
    # Save and load actor
    if actor is not None:
        actor.save(tmp_path / "actor.pkl")

    del policy, actor

    policy = policy_class.load(tmp_path / "policy.pkl")
    if actor_class is not None:
        actor = actor_class.load(tmp_path / "actor.pkl")

    # check if params are still the same after load
    new_params = policy.state_dict()

    # Check that all params are the same as before the save/load procedure
    for key in params:
        assert th.allclose(params[key], new_params[key]), \
            "Policy parameters not the same after save and load."

    # check if model still selects the same actions
    new_selected_actions, _ = policy.predict(observations, deterministic=True)
    assert np.allclose(selected_actions, new_selected_actions, 1e-4)

    if actor_class is not None:
        new_selected_actions_actor, _ = actor.predict(observations, deterministic=True)
        assert np.allclose(selected_actions_actor, new_selected_actions_actor, 1e-4)
        assert np.allclose(selected_actions_actor, new_selected_actions, 1e-4)

    # clear file from os
    os.remove(tmp_path / "policy.pkl")
    if actor_class is not None:
        os.remove(tmp_path / "actor.pkl")
def get_sb_env(self):
    e = DummyVecEnv([lambda: self])
    obs = e.reset()
    return e, obs
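# Hedged usage note: get_sb_env is assumed to be a method on a gym.Env
# subclass (for instance the StockEnvTrain used by the ensemble snippet
# earlier), letting the env hand SB3 a vectorized copy of itself:
env_train, obs = StockEnvTrain(train).get_sb_env()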
def test_save_load(tmp_path, model_class):
    """
    Test if 'save' and 'load' save and load the model correctly, and if
    'load_parameters' and 'get_policy_parameters' work correctly.

    .. warning:: does not test the functionality of optimizer-parameter loading

    :param model_class: (BaseAlgorithm) A RL model
    """
    env = DummyVecEnv([lambda: select_env(model_class)])

    # create model
    model = model_class("MlpPolicy", env, policy_kwargs=dict(net_arch=[16]), verbose=1)
    model.learn(total_timesteps=500, eval_freq=250)

    env.reset()
    observations = np.concatenate(
        [env.step([env.action_space.sample()])[0] for _ in range(10)], axis=0)

    # Get dictionary of current parameters
    params = deepcopy(model.policy.state_dict())

    # Modify all parameters to be random values
    random_params = dict((param_name, th.rand_like(param))
                         for param_name, param in params.items())

    # Update model parameters with the new random values
    model.policy.load_state_dict(random_params)

    new_params = model.policy.state_dict()
    # Check that all params are different now
    for k in params:
        assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected."

    params = new_params

    # get selected actions
    selected_actions, _ = model.predict(observations, deterministic=True)

    # Check
    model.save(tmp_path / "test_save.zip")
    del model
    model = model_class.load(str(tmp_path / "test_save.zip"), env=env)

    # check if params are still the same after load
    new_params = model.policy.state_dict()

    # Check that all params are the same as before the save/load procedure
    for key in params:
        assert th.allclose(params[key], new_params[key]), \
            "Model parameters not the same after save and load."

    # check if model still selects the same actions
    new_selected_actions, _ = model.predict(observations, deterministic=True)
    assert np.allclose(selected_actions, new_selected_actions, 1e-4)

    # check if learn still works
    model.learn(total_timesteps=1000, eval_freq=500)

    # clear file from os
    os.remove(tmp_path / "test_save.zip")
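# The perturb-and-compare pattern used by the tests above, extracted into a
# small standalone helper (our own name, not part of the test suite; assumes
# floating-point state_dict entries, as in the MLP policies tested here):
import torch as th


def randomize_params(module: th.nn.Module) -> None:
    """Overwrite every parameter with random values of the same shape, so a
    later save/load round trip can be verified with th.allclose."""
    module.load_state_dict(
        {name: th.rand_like(param) for name, param in module.state_dict().items()}
    )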
        env = wrap_pytorch(
            wrap_deepmind(
                env,
                clip_rewards=True,
                frame_stack=True,
                scale=False,
            ))
        env.seed(seed)
        env.action_space.seed(seed)
        env.observation_space.seed(seed)
        return env

    return thunk


envs = DummyVecEnv([make_env(args.gym_id, args.seed + i, i) for i in range(args.num_envs)])
# if args.prod_mode:
#     envs = VecPyTorch(
#         SubprocVecEnv([make_env(args.gym_id, args.seed + i, i) for i in range(args.num_envs)], "fork"),
#         device
#     )
assert isinstance(envs.action_space, Discrete), "only discrete action space is supported"


# ALGO LOGIC: initialize agent here:
class Scale(nn.Module):
    def __init__(self, scale):
        super().__init__()
        self.scale = scale
def create_env(n_envs, eval_env=False, no_log=False):
    """
    Create the environment and wrap it if necessary.

    :param n_envs: (int)
    :param eval_env: (bool) Whether it is an environment used for evaluation or not
    :param no_log: (bool) Do not log training when doing hyperparameter optimization
        (issue with writing to the same file)
    :return: (Union[gym.Env, VecEnv])
    """
    global hyperparams
    global env_kwargs, eval_env_kwargs
    global normalize

    if eval_env:
        kwargs = eval_env_kwargs
    else:
        kwargs = env_kwargs

    # Do not log eval env (issue with writing to the same file)
    log_dir = None if eval_env or no_log else save_path

    if n_envs == 1:
        # Use rank=127 so the eval_env won't overlap with any training env.
        env = DummyVecEnv([
            make_env(env_id, 127, args.seed, wrapper_class=env_wrapper,
                     log_dir=log_dir, env_kwargs=kwargs[0])
        ])
    else:
        # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
        # On most envs, SubprocVecEnv does not help and is quite memory hungry
        env = DummyVecEnv([
            make_env(env_id, i, args.seed, log_dir=log_dir,
                     env_kwargs=kwargs[i], wrapper_class=env_wrapper)
            for i in range(n_envs)
        ])

    if normalize:
        # Copy to avoid changing default values by reference
        local_normalize_kwargs = normalize_kwargs.copy()
        # Do not normalize reward for the env used for evaluation
        if eval_env:
            if len(local_normalize_kwargs) > 0:
                local_normalize_kwargs["norm_reward"] = False
            else:
                local_normalize_kwargs = {"norm_reward": False}

        if args.verbose > 0:
            if len(local_normalize_kwargs) > 0:
                print(f"Normalization activated: {local_normalize_kwargs}")
            else:
                print("Normalizing input and reward")
        env = VecNormalize(env, **local_normalize_kwargs)

    return env
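# Hedged example of how create_env above is typically driven (the call sites
# are illustrative, not from the source): one vectorized training env plus a
# single-env eval environment whose reward normalization is disabled.
env = create_env(n_envs=4)
eval_env = create_env(n_envs=1, eval_env=True)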
args = common.args
args.model_filename = "output_data/tmp/best_model.zip"
args.test_bodies = [320]
args.stack_frames = 4
args.test_steps = 1000
args.render = True
print(args)

default_wrapper = [wrapper.WalkerWrapper]

assert len(args.train_bodies) == 0, "No training bodies expected in a test-only script."
if args.with_bodyinfo:
    default_wrapper += [wrapper.BodyinfoWrapper]

for test_body in args.test_bodies:
    eval_venv = DummyVecEnv([
        gym_interface.make_env(rank=0,
                               seed=common.seed,
                               wrappers=default_wrapper,
                               render=args.render,
                               robot_body=test_body)
    ])
    if args.vec_normalize:
        raise NotImplementedError
        # normalize_kwargs["gamma"] = hyperparams["gamma"]
        # eval_venv = VecNormalize(eval_venv, **normalize_kwargs)
    if args.stack_frames > 1:
        eval_venv = VecFrameStack(eval_venv, args.stack_frames)
    eval_venv.seed(common.seed)
    model = PPO.load(args.model_filename)
    obs = eval_venv.reset()
    g_obs_data = np.zeros(shape=[args.test_steps, obs.shape[1]], dtype=np.float32)
    if True:
    default_wrapper.append(wrapper_mut.MutantWrapper)
elif args.topology_wrapper == "CustomAlignWrapper":
    default_wrapper.append(wrapper_custom_align.CustomAlignWrapper)
else:
    pass  # no wrapper needed

assert len(args.robo_bodies) > 0, "No body to train."
if args.with_bodyinfo:
    default_wrapper.append(wrapper.BodyinfoWrapper)

print("Making train environments...")
venv = DummyVecEnv([
    gym_interface.make_pyrobotdesign_env(rank=i,
                                         seed=common.seed,
                                         wrappers=default_wrapper,
                                         render=args.render,
                                         dataset_folder=args.dataset_folder,
                                         robo_body=args.robo_bodies[i % len(args.robo_bodies)])
    for i in range(args.num_venvs)
])

normalize_kwargs = {}
if args.vec_normalize:
    normalize_kwargs["gamma"] = hyperparams["gamma"]
    venv = VecNormalize(venv, **normalize_kwargs)
if args.stack_frames > 1:
    venv = VecFrameStack(venv, args.stack_frames)

keys_remove = ["normalize", "n_envs", "n_timesteps", "policy"]
for key in keys_remove:
    del hyperparams[key]
def test_save_load_q_net(tmp_path, model_class, policy_str):
    """
    Test saving and loading the q-network/quantile net only.

    :param model_class: (BaseAlgorithm) A RL model
    :param policy_str: (str) Name of the policy.
    """
    kwargs = dict(policy_kwargs=dict(net_arch=[16]))
    if policy_str == "MlpPolicy":
        env = select_env(model_class)
    else:
        if model_class in [QRDQN]:
            # Avoid memory error when using replay buffer
            # Reduce the size of the features
            kwargs = dict(
                buffer_size=250,
                learning_starts=100,
                policy_kwargs=dict(features_extractor_kwargs=dict(features_dim=32)),
            )
        env = FakeImageEnv(screen_height=40, screen_width=40, n_channels=2,
                           discrete=model_class == QRDQN)

    # Reduce number of quantiles for faster tests
    if model_class in [QRDQN]:
        kwargs["policy_kwargs"].update(dict(n_quantiles=20))

    env = DummyVecEnv([lambda: env])

    # create model
    model = model_class(policy_str, env, verbose=1, **kwargs)
    model.learn(total_timesteps=300)

    env.reset()
    observations = np.concatenate(
        [env.step([env.action_space.sample()])[0] for _ in range(10)], axis=0)

    q_net = model.quantile_net
    q_net_class = q_net.__class__

    # Get dictionary of current parameters
    params = deepcopy(q_net.state_dict())

    # Modify all parameters to be random values
    random_params = dict((param_name, th.rand_like(param))
                         for param_name, param in params.items())

    # Update model parameters with the new random values
    q_net.load_state_dict(random_params)

    new_params = q_net.state_dict()
    # Check that all params are different now
    for k in params:
        assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected."

    params = new_params

    # get selected actions
    selected_actions, _ = q_net.predict(observations, deterministic=True)

    # Save and load q_net
    q_net.save(tmp_path / "q_net.pkl")

    del q_net

    q_net = q_net_class.load(tmp_path / "q_net.pkl")

    # check if params are still the same after load
    new_params = q_net.state_dict()

    # Check that all params are the same as before the save/load procedure
    for key in params:
        assert th.allclose(params[key], new_params[key]), \
            "Quantile-net parameters not the same after save and load."

    # check if model still selects the same actions
    new_selected_actions, _ = q_net.predict(observations, deterministic=True)
    assert np.allclose(selected_actions, new_selected_actions, 1e-4)

    # clear file from os
    os.remove(tmp_path / "q_net.pkl")
training_bodies = [int(x) for x in args.train_bodies.split(",")]
str_ids = "-".join(str(x) for x in training_bodies)

if args.test_bodies == "":
    test_bodies = []
else:
    test_bodies = [int(x) for x in args.test_bodies.split(",")]

# default_wrapper = wrapper.BodyinfoWrapper
# if args.disable_wrapper:
#     default_wrapper = None
default_wrapper = wrapper.WalkerWrapper
# default_wrapper = None

if with_bodyinfo:
    env = DummyVecEnv([
        utils.make_env(template=utils.template(training_bodies[i % len(training_bodies)]),
                       rank=i,
                       seed=utils.seed,
                       wrapper=default_wrapper,
                       render=args.render,
                       robot_body=training_bodies[i % len(training_bodies)],
                       body_info=training_bodies[i % len(training_bodies)] // 100)
        for i in range(train_num_envs)
    ])
    save_filename = f"model-ant-{str_ids}-with-bodyinfo"
else:
    env = DummyVecEnv([
        utils.make_env(template=utils.template(training_bodies[i % len(training_bodies)]),
                       rank=i,
                       seed=utils.seed,
                       wrapper=default_wrapper,
                       render=args.render,
                       robot_body=training_bodies[i % len(training_bodies)],
                       body_info=0)
        for i in range(train_num_envs)
    ])
    save_filename = f"model-ant-{str_ids}"

if args.vec_normalize:
    env = VecNormalize(env, **normalize_kwargs)
if args.stack_frames > 1:
    env = VecFrameStack(env, args.stack_frames)

keys_remove = ["normalize", "n_envs", "n_timesteps", "policy"]
for key in keys_remove:
    del hyperparams[key]
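# Hedged continuation (a standard SB3 training call; the original script's
# exact arguments are not shown, and "MlpPolicy" plus the timestep budget
# are assumptions, not taken from the source):
model = PPO("MlpPolicy", env=env, **hyperparams)
model.learn(total_timesteps=int(2e6))
model.save(save_filename)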