def run(config: Dict[str, Any], logdir: pathlib.PosixPath):
    env = make_env(config)
    if config["mode"] == "evaluate":
        print("Start evaluation.")
        model = PPO.load(logdir / "model.zip")
    elif config["mode"] == "train" and args.logdir:
        print("Start training from existing model.")
        model = PPO.load(logdir / "model.zip")
        model.set_env(env)
        model.learn(total_timesteps=config["train_steps"])
    else:
        print("Start training.")
        model = PPO(
            "CnnPolicy",
            env,
            verbose=1,
            tensorboard_log=logdir / "tensorboard",
            use_sde=True,
        )
        model.learn(total_timesteps=config["train_steps"])

    mean_reward, std_reward = evaluate_policy(
        model, env, n_eval_episodes=config["eval_eps"], deterministic=True)
    print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

    if config["mode"] == "train":
        model.save(logdir / "model")
    env.close()
def load_new_opp(self, idx, opp_fp, opp_elo):
    if idx < len(self.opponents):
        self.opponents[idx] = (PPO.load(opp_fp), opp_elo, opp_fp)
        self.curr_opp = idx
    else:
        self.opponents.append((PPO.load(opp_fp), opp_elo, opp_fp))
        self.curr_opp = len(self.opponents) - 1
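# Hedged usage sketch for load_new_opp above; `env` stands in for whatever object defines the
# method, and the checkpoint path and Elo value are placeholders, not part of the original code.
# env.load_new_opp(idx=0, opp_fp="checkpoints/opponent_latest.zip", opp_elo=1200)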
def train():
    """Trains a PPO policy."""
    env_args = env_parser.parse_known_args()[0]
    policy_args = policy_parser.parse_known_args()[0]
    opt_args = opt_parser.parse_known_args()[0]
    os.makedirs(opt_args.save_path, exist_ok=True)

    # create environment
    # train_env = GFootballEnv(env_args)  # for evaluation
    train_env = DummyVecEnv([
        make_env(env_args, opt_args.save_path, rank=i)
        for i in range(opt_args.num_envs)
    ])
    eval_env = GFootballEnv(env_args)  # for evaluation
    check_env(env=eval_env, warn=True)

    # define rl policy/value network
    policy = getattr(sys.modules[__name__], policy_args.policy)

    # initialize ppo
    tb_dir = os.path.join(opt_args.save_path, "tensorboard")
    os.makedirs(tb_dir, exist_ok=True)
    verbose = 1
    ppo = PPO(policy,
              train_env,
              learning_rate=opt_args.lr,
              n_steps=opt_args.n_steps,
              n_epochs=opt_args.n_epochs,
              gamma=opt_args.gamma,
              gae_lambda=0.95,
              clip_range=opt_args.clip_range,
              clip_range_vf=None,
              ent_coef=opt_args.ent_coef,
              vf_coef=opt_args.vf_coef,
              max_grad_norm=opt_args.max_grad_norm,
              tensorboard_log=tb_dir,
              verbose=verbose,
              seed=opt_args.seed)

    # load initial checkpoint; PPO.load is a classmethod that returns a new model,
    # so load the saved parameters into the already-configured instance instead
    if opt_args.load_path:
        ppo.set_parameters(os.path.join(opt_args.load_path, "ppo_gfootball.pt"))

    # start training ppo
    eval_dir = os.path.join(opt_args.save_path, "eval")
    os.makedirs(eval_dir, exist_ok=True)
    ppo.learn(opt_args.num_timesteps,
              log_interval=1,
              eval_env=eval_env,
              eval_freq=opt_args.save_interval,
              n_eval_episodes=10,
              eval_log_path=eval_dir)

    # save final checkpoint
    ppo.save(os.path.join(opt_args.save_path, "ppo_gfootball"))
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)

    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    # Separate environment used for rendering during evaluation
    render_env = factory.make_env()

    callback = CallbackList([])

    # Wrap the environment around parallel processing friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs, norm_obs=True, clip_obs=np.inf,
                            norm_reward=False, clip_reward=np.inf)
    else:
        envs = VecNormalize.load(args.stats_path, envs)

    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    print("Do random explorations to build running averages")
    envs.reset()
    for _ in tqdm(range(1000)):
        random_action = np.stack(
            [envs.action_space.sample() for _ in range(n_envs)])
        envs.step(random_action)
    envs.training = False  # freeze the running averages (what a terrible variable name...)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs, device=args.device)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.value_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)
        learner = PPO(MlpPolicy, envs, n_steps=args.n_steps, verbose=1,
                      policy_kwargs=policy_kwargs, device=args.device, target_kl=2e-2)
        if args.device == 'cpu':
            torch.cuda.empty_cache()
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
def pybullet_example():
    # PyBullet: Normalizing input features
    import pybullet_envs

    env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")])
    # Automatically normalize the input features and reward.
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)

    model = PPO("MlpPolicy", env)
    model.learn(total_timesteps=2000)

    # Don't forget to save the VecNormalize statistics when saving the agent.
    log_dir = "/tmp/"
    model.save(log_dir + "ppo_halfcheetah")
    stats_path = os.path.join(log_dir, "vec_normalize.pkl")
    env.save(stats_path)

    # To demonstrate loading.
    del model, env

    # Load the saved statistics.
    env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")])
    env = VecNormalize.load(stats_path, env)
    # Do not update them at test time.
    env.training = False
    # Reward normalization is not needed at test time.
    env.norm_reward = False

    # Load the agent.
    model = PPO.load(log_dir + "ppo_halfcheetah", env=env)
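# A minimal follow-up sketch (not part of the original snippet): once the agent and the
# VecNormalize statistics above have been re-loaded, the model can be evaluated with
# stable-baselines3's evaluate_policy helper; env is assumed to be the frozen,
# VecNormalize-wrapped environment created at the end of pybullet_example().
def pybullet_example_evaluate(model, env):
    from stable_baselines3.common.evaluation import evaluate_policy

    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10,
                                              deterministic=True)
    print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")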
def main():
    env = Pinokio2()
    # Optional: PPO2 requires a vectorized environment to run
    # the env is now wrapped automatically when passing it to the constructor
    # env = DummyVecEnv([lambda: env])

    if os.path.exists(save_file):
        model = PPO.load(save_file, env=DummyVecEnv([lambda: env]))
    else:
        model = PPO(MlpPolicy, env, verbose=1)

    while True:
        # model.learn(total_timesteps=10000)
        model.learn(total_timesteps=100000)
        model.save(save_file)

        obs = env.reset()
        for i in range(10):
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                print("resetting because " + str(done))
                obs = env.reset()
def simulate_and_save(args):
    config = gym_rl_mpc.SCENARIOS[args.env]['config'].copy()
    if args.psf:
        config['use_psf'] = True
        print("Using PSF corrected actions")
    config['wind_mean'] = args.wind_mean
    if not hasattr(args, 'save_sim_data'):
        args.save_sim_data = True

    env = gym.make(args.env, env_config=config)
    env_id = env.unwrapped.spec.id

    agent_path = args.agent
    agent = PPO.load(agent_path)
    sim_df = simulate_episode(env=env, agent=agent, max_time=args.time)

    if args.save_sim_data:
        agent_path_list = agent_path.split("\\")
        simdata_dir = os.path.join("logs", agent_path_list[-4],
                                   agent_path_list[-3], "sim_data")
        os.makedirs(simdata_dir, exist_ok=True)

        # Save file to logs\env_id\<EXPERIMENT_ID>\sim_data\<agent_file_name>_simdata.csv
        i = 0
        while os.path.exists(
                os.path.join(
                    simdata_dir,
                    env_id + "_" + agent_path_list[-1][0:-4] + f"_simdata_{i}.csv")):
            i += 1
        sim_df.to_csv(
            os.path.join(
                simdata_dir,
                env_id + "_" + agent_path_list[-1][0:-4] + f"_simdata_{i}.csv"))

    return sim_df, env
def main(): tensorboard_log = "./log" env = Pinokio3() # Optional: PPO2 requires a vectorized environment to run # the env is now wrapped automatically when passing it to the constructor # env = DummyVecEnv([lambda: env]) if os.path.exists( save_file ): model = PPO.load( save_file, env=DummyVecEnv([lambda:env]),tensorboard_log=tensorboard_log ) else: policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=net_arch) model = PPO(MlpPolicy, DummyVecEnv([lambda:env]), verbose=1,tensorboard_log=tensorboard_log) #https://stable-baselines3.readthedocs.io/en/master/guide/callbacks.html checkpoint_callback = CheckpointCallback(save_freq=10000, save_path='./checkpoints/', name_prefix='pinokio3') while True: model.learn(total_timesteps=15000000, callback=checkpoint_callback, tb_log_name=tb_log_name ) model.save( save_file ) print( "saved" ) obs = env.reset() for i in range(20): action, _states = model.predict(obs) obs, reward, done, info = env.step(action) print( "action {} -> reward {}".format( env.decode_action(action), reward ) ) env.render() if done: print( "resetting because " + str(done) ) env.reset()
def create_stable_baselines3_agent(agent_path, agent_type):
    """
    Load and return a stable-baselines3 agent.

    The agent has a function `get_action` that takes in an observation
    and returns an appropriate action.

    `agent_type` is the algorithm name (only PPO-SB3 supported).
    """
    from stable_baselines3 import PPO
    import torch

    agent = None
    if agent_type == "SB3-PPO":
        if "bc_models" in agent_path:
            # Only stores policy parameters.
            # Force on CPU (codebase-level heuristic that everything runs on CPU)
            agent = torch.load(agent_path, map_location="cpu")
            agent.get_action = lambda obs: agent.predict(obs)[0]
        else:
            # GAIL: stores the whole agent
            agent = PPO.load(agent_path)
            agent.get_action = lambda obs: agent.predict(obs)[0]
    else:
        raise RuntimeError("Unknown agent type for SB3: {}".format(agent_type))
    return agent
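# Hedged usage sketch for create_stable_baselines3_agent above; the environment id and
# checkpoint path are placeholders, and only the get_action interface defined above is assumed.
def example_rollout_with_sb3_agent():
    import gym

    env = gym.make("CartPole-v1")  # placeholder environment
    agent = create_stable_baselines3_agent("path/to/agent.zip", "SB3-PPO")  # placeholder path

    obs = env.reset()
    done = False
    while not done:
        action = agent.get_action(obs)
        obs, reward, done, info = env.step(action)
    env.close()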
def main(): tensorboard_log = "./log" env = Pinokio5() # Optional: PPO2 requires a vectorized environment to run # the env is now wrapped automatically when passing it to the constructor # env = DummyVecEnv([lambda: env]) if os.path.exists(save_file): model = PPO.load(save_file, env=DummyVecEnv([lambda: env]), tensorboard_log=tensorboard_log) else: model = PPO(MlpPolicy, env, verbose=1, tensorboard_log=tensorboard_log) try: while True: #model.learn(total_timesteps=10000) model.learn(total_timesteps=8000000, tb_log_name=tb_log_name) model.save(save_file) obs = env.reset() for i in range(100): action, _states = model.predict(obs) obs, reward, done, info = env.step(action) env.render() if done: print("resetting because " + str(done)) env.reset() except KeyboardInterrupt: print("Saving before exiting...") model.save(save_file) print("k bye")
def create_video(env, savepoint="random", out_filename="video.mp4", video_size=(1230, 900)):
    if savepoint not in ["random", "argmax", None]:
        model = PPO.load(savepoint)

    # Video settings
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(out_filename, fourcc, 10, video_size)

    observation = env.reset()
    progress = tqdm.tqdm(total=env.horizon)
    done = False
    while not done:
        progress.update(env.action_frequency)

        # Get action
        if savepoint == "random":
            action = env.action_space.sample()
        elif savepoint == "argmax":
            action = np.argmax(observation)
        else:
            action, _ = model.predict(observation)

        observation, reward, done, info = env.step(action)
        img = env.render("rgb_array")
        resized_img = cv2.resize(img, video_size, interpolation=cv2.INTER_NEAREST)
        out.write(np.asarray(resized_img * 255, dtype=np.uint8))

    out.release()
    progress.close()
    return out
def test(seed, model_filename, vec_filename, train, test, body_info=0, render=False):
    print("Testing:")
    print(f"  Seed {seed}, model {model_filename} vec {vec_filename}")
    print(f"  Train on {train}, test on {test}, w/ bodyinfo {body_info}")
    eval_env = utils.make_env(render=render, robot_body=test, body_info=body_info)
    eval_env = DummyVecEnv([eval_env])
    eval_env = VecNormalize.load(vec_filename, eval_env)
    eval_env.norm_reward = False
    eval_env.seed(seed)
    model = PPO.load(model_filename)

    obs = eval_env.reset()
    if render:
        eval_env.env_method("set_view")
    distance_x = 0
    # print(obs)
    total_reward = 0
    for step in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)
        if done:
            break
        else:
            # the last observation will be after reset, so skip the last
            distance_x = eval_env.envs[0].robot.body_xyz[0]
            total_reward += reward[0]
        if render:
            time.sleep(0.01)

    eval_env.close()
    print(f"train {train}, test {test}, body_info {body_info}, step {step}, "
          f"total_reward {total_reward}, distance_x {distance_x}")
    return total_reward, distance_x
def train_policy_ppo(path='policy_ppo', org_path='prob_ppo'):
    """
    Train against an environment driven by a previously trained policy.

    Arguments:
        path        file path under which the newly trained model is saved
        org_path    file path of the trained model loaded by the opponent policy
    """
    print(f'train ppo with prob_player path={path}, org_path={org_path}')

    # Load the trained model file
    model = PPO.load(org_path)

    # Build the rock-paper-scissors environment
    env = RockPaperScissorsEnv(AIPlayer(model))
    env = Monitor(env, LOGDIR, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # Attach the environment to the model
    model.set_env(env)

    # Run training
    elapsed = time.time()
    model.learn(total_timesteps=1000000)
    print(f'elapsed time: {time.time() - elapsed}sec')

    # Save the trained model
    model.save(path)

    # Close the rock-paper-scissors environment
    env.close()
def trained_agent(episodes=256, continuous=True, load=None, save_name="test",
                  ent_coef=0.00001, total_timesteps=25000, learning_rate=lr()):
    env = gym.make("bilboquet-v0", continuous=continuous, amplitude=10)
    env.reset((300, 300))

    if load is None:
        model = PPO('MlpPolicy', env, verbose=1, ent_coef=ent_coef,
                    learning_rate=learning_rate,
                    tensorboard_log="./ppo_bilboquet_tensorboard/")
        model.learn(total_timesteps=total_timesteps, tb_log_name=save_name)
        model.save(save_name + '.zip')
        print('DONE')
        obs = env.reset()
    else:
        model = PPO.load(load)
        obs = env.reset()

    for i in range(episodes):
        action, _states = model.predict(obs, deterministic=True)
        # print(action)
        obs, reward, done, info = env.step(action)
        # print(reward)
        env.render()
        if done:
            obs = env.reset()
def main(args):
    expert = None
    expert_state_dim = 0
    if args.policy_path is not None:
        policy_path = args.policy_path
        expert = PPO.load(policy_path)
        expert_state_dim = expert.observation_space.shape[0]

    factory = EnvFactory(args.env)
    env = DummyVecEnv([factory.make_env])
    if args.stats_path is not None:
        env = VecNormalize.load(args.stats_path, env)
        env.training = False
    else:
        env = VecNormalize(env, training=False)

    obs = env.reset()
    env.render()
    total_reward = 0
    while True:
        if expert is None:
            action = env.action_space.sample()
            action = np.zeros_like(action)
        else:
            good_obs = obs[:, :expert_state_dim]
            action, _ = expert.predict(good_obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        reward = env.get_original_reward()
        total_reward += reward[0]
        if done:
            print("Total reward: {:.3f}".format(total_reward))
            obs = env.reset()
            total_reward = 0
def main():
    test_or_train = TEST_OR_TRAIN
    assert test_or_train in ["train", "test"]

    gym_config = SimulationParameters(time_step=TIME_STEP)
    robot_class = QuadrupedRobot
    robot_params = MiniCheetahParams(
        on_rack=False,
        enable_self_collision=True,
        motor_control_mode=MotorControlMode.HYBRID_COMPUTED_POS_TROT)
    task = TestTask(train_or_test=TEST_OR_TRAIN)

    env = LocomotionGymEnv(gym_config, robot_class, robot_params, task)

    policy_save_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   'data/policies')
    if not os.path.exists(policy_save_dir):
        os.makedirs(policy_save_dir)
    policy_save_filename = 'ppo_' + str(COUNT) + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    policy_save_path = os.path.join(policy_save_dir, policy_save_filename)

    if TEST_OR_TRAIN == "train":
        model = PPO('MlpPolicy', env, verbose=1)
        model.learn(total_timesteps=100000000)
        model.save(policy_save_path)
    else:
        model = PPO.load(POLICY_SAVE_PATH)
        obs = env.reset()
        while True:
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                obs = env.reset()
def main():
    # multiprocess environment
    # n_cpu = 8
    # env = SubprocVecEnv([lambda: gym.make('DYROSTocabi-v1') for i in range(n_cpu)])
    # env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    n_cpu = 1
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    model = PPO('MlpPolicy', env, verbose=1, n_steps=int(4096 / n_cpu), wandb_use=False)
    model.learn(total_timesteps=40000000)

    file_name = "ppo2_DYROSTocabi_" + str(datetime.datetime.now())
    model.save(file_name)
    env.save(file_name + "_env.pkl")

    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)

    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    del model  # remove to demonstrate saving and loading
    del env

    # file_name = "ppo2_DYROSTocabi_2021-01-08 07:18:00.267089"
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(file_name + "_env.pkl", env)
    env.training = False

    model = PPO.load(file_name, env=env, wandb_use=False)

    # Enjoy trained agent
    obs = np.copy(env.reset())
    epi_reward = 0
    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
        epi_reward += rewards
        if dones:
            print("Episode Reward: ", epi_reward)
            epi_reward = 0
def train_stable_baselines(submodule, flags):
    """Train policies using the PPO algorithm in stable-baselines."""
    from stable_baselines3.common.vec_env import DummyVecEnv

    flow_params = submodule.flow_params
    # Path to the saved files
    exp_tag = flow_params['exp_tag']
    result_name = '{}/{}'.format(exp_tag, strftime("%Y-%m-%d-%H:%M:%S"))

    # Perform training.
    start_time = timeit.default_timer()
    # print experiment.json information
    print("=========================================")
    print('Beginning training.')
    print('Algorithm :', flags.algorithm)
    model = run_model_stablebaseline(flow_params, flags.num_cpus,
                                     flags.rollout_size, flags.num_steps,
                                     flags.algorithm, flags.exp_config)
    stop_time = timeit.default_timer()
    run_time = stop_time - start_time
    print("Training is Finished")
    print("total runtime: ", run_time)

    # Save the model to a desired folder and then delete it to demonstrate loading.
    print('Saving the trained model!')
    path = os.path.realpath(os.path.expanduser('~/baseline_results'))
    ensure_dir(path)
    save_path = os.path.join(path, result_name)
    model.save(save_path)

    # dump the flow params
    with open(os.path.join(path, result_name) + '.json', 'w') as outfile:
        json.dump(flow_params, outfile, cls=FlowParamsEncoder, sort_keys=True, indent=4)

    # Replay the result by loading the model
    print('Loading the trained model and testing it out!')
    if flags.exp_config.lower() == "ppo":
        from stable_baselines3 import PPO
        model = PPO.load(save_path)
    elif flags.exp_config.lower() == "ddpg":
        from stable_baselines3 import DDPG
        model = DDPG.load(save_path)
    flow_params = get_flow_params(os.path.join(path, result_name) + '.json')
    flow_params['sim'].render = True
    env = env_constructor(params=flow_params, version=0)()

    # The algorithms require a vectorized environment to run
    eval_env = DummyVecEnv([lambda: env])
    obs = eval_env.reset()
    reward = 0
    for _ in range(flow_params['env'].horizon):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        reward += rewards
    print('the final reward is {}'.format(reward))
def __init__(self, algorithm: str, checkpoint_path: str):
    if algorithm == 'ppo':
        policy = PPO.load(checkpoint_path)
    elif algorithm == 'sac':
        policy = SAC.load(checkpoint_path)
    else:
        raise NotImplementedError
    self._model = policy
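# Hedged sketch (not in the original class): one way this wrapper might expose the loaded
# policy; the method name `act` is an assumption.
def act(self, observation, deterministic: bool = True):
    action, _ = self._model.predict(observation, deterministic=deterministic)
    return action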
def __init__(self):
    config.loads('config.json')
    self.asset = 10000
    self.backtest = BackTest()

    # data = Market.kline('sh600519', '1d')
    # print(data)
    ltdxhq = LTdxHq()
    # df = ltdxhq.get_k_data_daily('603636', start='2021-09-01')  # 000032 300142 603636 600519
    df = ltdxhq.get_k_data_1min('000032', start='2021-08-31')  # 000032 300142 603636 600519
    df = StockDataFrame(df)
    ltdxhq.close()
    # print(df.head())

    self.kline = []
    self.buy_signal = []
    self.sell_signal = []

    # Example row (2005-08-11 15:00):
    #   open      46.01
    #   close     47.37
    #   high      47.40
    #   low       46.01
    #   vol       1359360.00
    #   amount    63589532.00
    data = []
    for index, row in df.iterrows():
        data.append([index[:10], row.open, row.high, row.low, row.close, row.vol])

    self.model = PPO.load('ppo_stock')

    for current_step in range(240, df.shape[0]):
        obs = np.array([
            df.iloc[current_step - NEXT_OBSERVATION_SIZE: current_step]['open'].values / MAX_SHARE_PRICE,
            df.iloc[current_step - NEXT_OBSERVATION_SIZE: current_step]['high'].values / MAX_SHARE_PRICE,
            df.iloc[current_step - NEXT_OBSERVATION_SIZE: current_step]['low'].values / MAX_SHARE_PRICE,
            df.iloc[current_step - NEXT_OBSERVATION_SIZE: current_step]['close'].values / MAX_SHARE_PRICE,
            df.iloc[current_step - NEXT_OBSERVATION_SIZE: current_step]['vol'].values / MAX_NUM_SHARES,
            # df['close'].pct_change().fillna(0)[current_step: current_step + NEXT_OBSERVATION_SIZE],
            df['macd'][current_step - NEXT_OBSERVATION_SIZE: current_step].values,
            df['macdh'][current_step - NEXT_OBSERVATION_SIZE: current_step].values,
            df['macds'][current_step - NEXT_OBSERVATION_SIZE: current_step].values,
            df['kdjk'][current_step - NEXT_OBSERVATION_SIZE: current_step].values,
            df['kdjd'][current_step - NEXT_OBSERVATION_SIZE: current_step].values,
            df['kdjj'][current_step - NEXT_OBSERVATION_SIZE: current_step].values,
            df['rsi_6'][current_step - NEXT_OBSERVATION_SIZE: current_step].fillna(0).values,
            df['rsi_12'][current_step - NEXT_OBSERVATION_SIZE: current_step].fillna(0).values,
        ])
        # df.index.values[current_step][:10]
        self.kline.append([df.index.get_level_values(level=1)[current_step],
                           df.iloc[current_step].open,
                           df.iloc[current_step].high,
                           df.iloc[current_step].low,
                           df.iloc[current_step].close,
                           df.iloc[current_step].vol])
        self.backtest.initialize(self.kline, data)
        self.begin(obs)

    # print(self.buy_signal)
    # print(self.sell_signal)
    plot_asset()
def load(self, name: str, env, replace_parameters=None):
    self.log_dir = "ppo_cnn/" + str(datetime.datetime.now()).replace(":", "-")
    os.makedirs(self.log_dir, exist_ok=True)
    monitor_env = Monitor(env, self.log_dir, allow_early_resets=True)
    vec_env = DummyVecEnv([lambda: monitor_env])
    self.model = PPO.load(name, env=vec_env, custom_objects=replace_parameters)
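# Hedged usage sketch for the load() method above; `agent` stands in for an instance of the
# owning class and the checkpoint name is a placeholder. replace_parameters is passed through
# to stable-baselines3's custom_objects, which overrides stored attributes (for example,
# schedules saved by a different library version) when the checkpoint is deserialized.
# agent.load(
#     "ppo_cnn_model.zip",        # placeholder checkpoint
#     env,
#     replace_parameters={
#         "learning_rate": 3e-4,  # overrides the stored learning-rate schedule
#         "clip_range": 0.2,      # overrides the stored clip-range schedule
#     },
# )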
def main():
    # Create the callback: check every 1000 steps
    log_dir = 'log'
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
    num_cpu = 16
    model_stats_path = os.path.join(log_dir, "sac_" + env_name)
    env_stats_path = os.path.join(log_dir, 'sac_LR001.pkl')
    tb_log = 'tb_log'
    videoName = '5M_timesteps_sac'
    tb_log_name = videoName

    if StartFresh:
        # env = make_vec_env(env_name, n_envs=4)
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()

        policy_kwargs = {
            'net_arch': [128, 64, 32],
        }
        model = PPO('MlpPolicy',
                    env,
                    learning_rate=0.001,
                    n_steps=500,
                    # batch_size=0,
                    # n_epochs=1,
                    gamma=0.9,
                    policy_kwargs=policy_kwargs,
                    verbose=1,
                    tensorboard_log=tb_log,
                    device="auto")
    else:
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize.load(env_stats_path, env)
        env.reset()

        model = PPO.load(model_stats_path, tensorboard_log=tb_log)
        model.set_env(env)

    if DoTraining:
        eval_env = make_vec_env(env_name, n_envs=1)
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()

        # model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log)
        model.learn(total_timesteps=25000000,
                    tb_log_name=tb_log_name,
                    reset_num_timesteps=False)  # , callback=callback

        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        # mean_reward, std_reward = evaluate_policy(model, eval_env)
        # print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
        record_video(env_name, model, video_length=2000, prefix='ppo_' + env_name + videoName)
def run_model(args):
    env = TankEnv(args.game_path,
                  opp_fp_and_elo=[(args.opp, 1000)],
                  game_port=args.base_port,
                  my_port=args.my_port,
                  image_based=args.image_based,
                  level_path=args.level_path,
                  rand_opp=args.rand_opp,
                  p=args.env_p,
                  opp_p=args.opp_env_p)

    model = None
    if args.p1:
        model = PPO.load(args.p1)
    elif args.p1same:
        model = PPO.load(args.opp)

    score = [0, 0, 0]
    print("Score: [Player1 Wins, Player2 Wins, Ties]")

    obs = env.reset()
    if args.image_based and (args.ai_view or args.rev_ai_view):
        fig = plt.gcf()
        fig.show()
        fig.canvas.draw()

    while True:
        if args.image_based and (args.ai_view or args.rev_ai_view):
            if not args.rev_ai_view:
                plt.imshow(obs, origin="lower")
            else:
                plt.imshow(env.opp_state, origin="lower")
            fig.canvas.draw()

        if model:
            action, _ = model.predict(obs)
        elif args.rand_p1:
            action = np.random.rand(5) * 2 - 1
        else:
            action = np.zeros(5, dtype=np.float32)

        obs, reward, done, info = env.step(action)
        if done:
            score[info["winner"]] += 1
            print("Score:", score)
            obs = env.reset()
def __init__(self):
    _path = pathlib.Path(__file__).parent.resolve()
    custom_objects = {
        "lr_schedule": 0.00001,
        "clip_range": .02,
        "n_envs": 1,
        "device": "cpu",
    }
    sys.path.append(_path)
    self.actor = PPO.load(str(_path) + '/monkey_mdl.zip', custom_objects=custom_objects)
def load_model(run_name: str, model_file: str) -> Tuple[VecEnv, PPO]:
    run_dir = get_run_dir(run_name)
    cfg = load(run_dir)['preprocess']
    env = make_env(seed=123,
                   n_envs=1,
                   run_dir=run_dir,
                   frame_skip=cfg['frame_skip'],
                   frame_stack=cfg['frame_stack'],
                   is_eval=True)
    model = PPO.load(os.path.join(run_dir, model_file))
    return env, model
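# Hedged usage sketch for load_model() above; the run name and model file name are
# placeholders, and the rollout loop assumes the single-environment VecEnv returned above.
def example_episode(run_name: str = "my_run", model_file: str = "best_model.zip"):
    env, model = load_model(run_name, model_file)
    obs = env.reset()
    done = False
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, rewards, dones, infos = env.step(action)
        done = dones[0]  # single-environment VecEnv
    env.close()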
def change_model(self):
    path = self.save_path
    try:
        files = [join(path, f) for f in listdir(path) if isfile(join(path, f))]
        # files = sorted(files, key=getmtime, reverse=True)
        # model_name = files[random.randrange(min(len(files), 5))]
        model_name = max(files, key=getmtime)
        self.past_models[self.change_index] = PPO.load(model_name)
        self.change_index = (self.change_index + 1) % len(self.past_models)
    except Exception as e:
        print(e)
def __init__(self):
    _path = pathlib.Path(__file__).parent.resolve()
    custom_objects = {
        "lr_schedule": 0.000001,
        "clip_range": .02,
        "n_envs": 1,
    }
    self.actor = PPO.load(str(_path) + '/example_mdl', device='cpu',
                          custom_objects=custom_objects)
    self.parser = DiscreteAction()
def test(MODEL_TEST):
    log_dir = "model_save/" + MODEL_PATH + "/" + MODEL_PATH + MODEL_TEST
    env = ENV(util='test', par=PARAM, dt=DT)
    env.render = True
    env = Monitor(env, log_dir)

    if PARAM['algo'] == 'td3':
        model = TD3.load(log_dir)
    elif PARAM['algo'] == 'ddpg':
        model = DDPG.load(log_dir)
    elif PARAM['algo'] == 'ppo':
        model = PPO.load(log_dir)
    # plot_results(f"model_save/")

    trade_dt = pd.DataFrame([])   # trade_dt: trading data for all stocks
    result_dt = pd.DataFrame([])  # result_dt: one-year test results for all stocks

    for i in range(TEST_STOCK_NUM):
        state = env.reset()
        stock_bh_id = 'stock_bh_' + str(i)          # buy-and-hold value for this stock
        stock_port_id = 'stock_port_' + str(i)      # portfolio value for this stock
        stock_action_id = 'stock_action_' + str(i)  # actions taken for this stock
        flow_L_id = 'stock_flow_' + str(i)          # cash flow for this stock
        stock_bh_dt, stock_port_dt, action_policy_dt, flow_L_dt = [], [], [], []
        day = 0
        while True:
            action = model.predict(state)
            next_state, reward, done, info = env.step(action[0])
            state = next_state
            # print("trying:", day, "reward:", reward, "now profit:", env.profit)

            # Record the trading policy at each step
            stock_bh_dt.append(env.buy_hold)
            stock_port_dt.append(env.Portfolio_unit)
            action_policy_dt.append(action[0][0])  # record the policy
            flow_L_dt.append(env.flow)
            day += 1
            if done:
                print('stock: {}, total profit: {:.2f}%, buy hold: {:.2f}%, sp: {:.4f}, mdd: {:.2f}%, romad: {:.4f}'
                      .format(i, env.profit * 100, env.buy_hold * 100, env.sp, env.mdd * 100, env.romad))
                # After the episode, record: stock ID, profit (%), buy-and-hold (%),
                # Sharpe ratio, maximum drawdown (%), and RoMaD
                result = pd.DataFrame([[i, env.profit * 100, env.buy_hold * 100,
                                        env.sp, env.mdd * 100, env.romad]])
                break

        trade_dt_stock = pd.DataFrame({stock_port_id: stock_port_dt,
                                       stock_bh_id: stock_bh_dt,
                                       stock_action_id: action_policy_dt,
                                       flow_L_id: flow_L_dt})  # trading data for one stock
        trade_dt = pd.concat([trade_dt, trade_dt_stock], axis=1)  # merge trading data across stocks (column-wise)
        result_dt = pd.concat([result_dt, result], axis=0)        # merge result data across stocks (row-wise)

    result_dt.columns = ['stock_id', 'profit(100%)', 'buy_hold(100%)', 'sp', 'mdd(100%)', 'romad']
    trade_dt.to_csv('out_dt/trade_' + MODEL_PATH + '.csv', index=False)
    result_dt.to_csv('out_dt/result_' + MODEL_PATH + '.csv', index=False)
def ai_eval():
    env = Snake_Env(server=False)
    model = PPO.load("./positivereward", env=env)
    obs = env.reset()
    for i in range(1000):
        action, _state = model.predict(obs, deterministic=True)
        # action = env.action_space.sample()
        # print(action)
        obs, reward, done, info = env.step(action)
        env.render()
        if done:
            obs = env.reset()
def play(env_name, load_file, total_timesteps):
    env = DummyVecEnv([lambda: gym.make(env_name)])
    model = PPO.load(load_file, verbose=1)

    obs = env.reset()
    for i in range(total_timesteps):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # env.render()  # dummy
        if done:
            print(info[0]['episode'])

    del model
    env.close()
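# Hedged usage sketch for play() above; the environment id, checkpoint path, and step budget
# are placeholders, not values taken from the original script.
if __name__ == "__main__":
    play("CartPole-v1", "ppo_cartpole", 1000)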