def main():
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    if not USE_LOADED_MODEL:
        model = ACKTR('MlpPolicy', env, verbose=1)

        # Multiprocessed RL training
        start_time = time.time()
        model.learn(total_timesteps=n_timesteps, log_interval=10)
        total_time_multi = time.time() - start_time
        model.save("cartpole_v1_acktr")

    loaded_model = ACKTR.load("cartpole_v1_acktr")
    loaded_model.set_env(env)

    # Single-process RL training
    single_process_model = ACKTR('MlpPolicy', env_id, verbose=1)
    start_time = time.time()
    single_process_model.learn(n_timesteps)
    total_time_single = time.time() - start_time

    # NOTE: total_time_multi is only defined when USE_LOADED_MODEL is False
    print("Single-process: {0}s, Multi-process: {1}s".format(
        total_time_single, total_time_multi))

    # Create a separate, clean environment for evaluation
    eval_env = gym.make(env_id)
    mean_reward, std_reward = evaluate_policy(loaded_model, eval_env,
                                              n_eval_episodes=10)
    print(f'Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')
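# The snippet above (and the SubprocVecEnv snippets below) calls a `make_env`
# helper that is not shown. A minimal sketch following the standard
# stable-baselines multiprocessing example; the body is an assumption, and
# env_id, num_cpu, n_timesteps and USE_LOADED_MODEL are assumed to be
# module-level settings:
def make_env(env_id, rank, seed=0):
    """Return a thunk that creates a seeded copy of the environment."""
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    return _init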
def train(env_id, num_timesteps, seed, num_cpu):
    """
    Train an ACKTR model on Atari.

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param num_cpu: (int) The number of cpus to train on
    """
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    model = ACKTR(CnnPolicy, env, nprocs=num_cpu)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
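# Hypothetical invocation of the Atari helper above; the env id and the
# values are illustrative, not from the original script.
if __name__ == '__main__':
    train('BreakoutNoFrameskip-v4', num_timesteps=int(1e6), seed=0, num_cpu=8)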
def run():
    torch.multiprocessing.freeze_support()
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    model = ACKTR(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=25000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
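# SubprocVecEnv spawns worker processes (hence the freeze_support() call), so
# the script entry point needs the usual guard; a minimal sketch of the
# assumed usage.
if __name__ == '__main__':
    run()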
def acktr(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = ACKTR(MlpPolicy, env, verbose=0)

    # Train the agent
    print("Beginning training episodes with ACKTR.")
    model.learn(total_timesteps=timesteps)
    env.close()
def optimize_agent(trial):
    agent = PPO2
    policy = MlpLstmPolicy
    train_env, test_env = optimize_envs(trial)

    if agent == ACKTR:
        params = optimize_acktr(trial)
        model = ACKTR(policy, train_env, verbose=1,
                      tensorboard_log="./tensorboard", **params)
    elif agent == PPO2:
        params = optimize_ppo2(trial)
        model = PPO2(policy, train_env, verbose=1, nminibatches=1,
                     tensorboard_log="./tensorboard", **params)

    model.test_env = test_env
    model.trial = trial

    try:
        model.learn(n_timesteps, callback=learn_callback)
        model.env.close()
        test_env.close()
    except AssertionError:
        # Sometimes, random hyperparams can generate NaN
        model.env.close()
        model.test_env.close()
        raise

    is_pruned = False
    cost = np.inf

    if hasattr(model, 'is_pruned'):
        is_pruned = model.is_pruned  # pylint: disable=no-member
        cost = -1 * model.last_mean_test_reward  # pylint: disable=no-member

    del model.env, model.test_env
    del model

    if is_pruned:
        raise optuna.structs.TrialPruned()

    return cost
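# `optimize_acktr` / `optimize_ppo2` are not shown. A minimal sketch of what
# such an Optuna search space might look like; the parameter names match the
# ACKTR constructor, but the ranges are assumptions:
def optimize_acktr(trial):
    return {
        'gamma': trial.suggest_loguniform('gamma', 0.9, 0.9999),
        'n_steps': trial.suggest_categorical('n_steps', [16, 32, 64, 128]),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1.0),
        'ent_coef': trial.suggest_loguniform('ent_coef', 1e-8, 0.1),
    }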
def optimize_agent(trial): """ Train the model and optimise Optuna maximises the negative log likelihood, so we need to negate the reward here """ model_params = optimize_acktr(trial) seed = trial.suggest_int('numpyseed', 1, 429496729) np.random.seed(seed) original_env = gym.make('rustyblocks-v0') original_env.max_invalid_tries = 3 env = DummyVecEnv([lambda: original_env]) model = ACKTR("MlpPolicy", env, nprocs=1, verbose=0, **model_params) print("DOING LEARING acer") original_env.force_progression = False model.learn(int(2e4), seed=seed) print("DONE LEARING acer") original_env.max_invalid_tries = -1 rewards = [] n_episodes, reward_sum = 0, 0.0 obs = env.reset() original_env.force_progression = True original_env.invalid_try_limit = 5000 while n_episodes < 4: action, _ = model.predict(obs) obs, reward, done, _ = env.step(action) reward_sum += reward if done: rewards.append(reward_sum) reward_sum = 0.0 n_episodes += 1 obs = env.reset() last_reward = np.mean(rewards) trial.report(last_reward) return last_reward
def train_acktr(seed):
    """
    Test ACKTR on the uav_env (cartesian, discrete).

    Reference signature:
    ACKTR(policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01,
          vf_coef=0.25, vf_fisher_coef=1.0, learning_rate=0.25,
          max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear', verbose=0,
          tensorboard_log=None, _init_setup_model=True,
          async_eigen_decomp=False)
    """
    algo = 'ACKTR'
    num_timesteps = 3000000

    env = set_up_env(seed)

    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0

    model = ACKTR(policy=MlpPolicy, env=env, gamma=0.99, nprocs=1, n_steps=20,
                  ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0,
                  learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001,
                  lr_schedule='linear', verbose=0,
                  tensorboard_log="./logs/{}/tensorboard/{}/".format(
                      EXPERIMENT_NATURE, algo),
                  _init_setup_model=True)  # , async_eigen_decomp=False

    model.learn(total_timesteps=num_timesteps, callback=callback, seed=seed,
                log_interval=500, tb_log_name="seed_{}".format(seed))

    model = ACKTR.load(log_dir + 'best_model.pkl')

    evaluation = evaluate_model(env, model, 100)

    os.makedirs('./logs/{}/csv/{}/'.format(EXPERIMENT_NATURE, algo),
                exist_ok=True)
    os.rename('/tmp/gym/monitor.csv',
              "./logs/{}/csv/{}/seed_{}.csv".format(EXPERIMENT_NATURE, algo, seed))

    env.close()
    del model, env
    gc.collect()

    return evaluation
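# The `callback` passed to learn() above is not shown. A minimal sketch that
# fits the globals it uses (best_mean_reward, n_steps) and the best_model.pkl
# it is expected to write, following the usual stable-baselines Monitor
# pattern; the check interval is an assumption:
from stable_baselines.results_plotter import load_results, ts2xy

def callback(_locals, _globals):
    global best_mean_reward, n_steps
    if (n_steps + 1) % 1000 == 0:
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            # Mean reward over the last 100 episodes
            mean_reward = np.mean(y[-100:])
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                _locals['self'].save(log_dir + 'best_model.pkl')
    n_steps += 1
    # Returning False would stop training early
    return True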
def stable_baseline_test(env_origin):
    env = make_vec_env(lambda: env_origin, n_envs=1)
    model = ACKTR('CnnPolicy', env_origin, verbose=1)
    model.learn(total_timesteps=2000000)

    print("Stable_baseline evaluation starts.....\n")
    # NOTE: evaluate_policy needs a vec_env
    reward_mean, reward_std = evaluate_policy(model, env, n_eval_episodes=20,
                                              deterministic=False)
    print("mean reward:" + str(reward_mean) + '\n')
    print("reward std:" + str(reward_std) + '\n')

    print("custom evaluation begins\n")
    env = env_origin
    obs = env.reset()
    reward_list_total = []
    epilen_list = []
    reward_list = []
    last_end = 0
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        reward_list.append(rewards)
        if dones:
            obs = env.reset()
            epilen_list.append(i - last_end)
            last_end = i
            reward_list_total.append(np.sum(reward_list))
            reward_list = []
            if i > 900:
                break
    print("mean reward:{}\n".format(np.mean(reward_list_total)))
    print("mean epilen:{}\n".format(np.mean(epilen_list)))
# Stable Baselines provides the make_vec_env() helper,
# which does exactly the previous steps for you:
# env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

scenario = str(
    f'{inputfile_s}_t{test}_lr{LR_s}_gamma{gamma_s}_batch{batch_size}')

# TimeLimit here is presumably a custom callback, not the gym wrapper
callbacklist = CallbackList([
    TimeLimit(episodetimesteps),
    EvalCallback(eval_env, log_path=scenario, n_eval_episodes=5)
])

model = ACKTR(MlpPolicy, env, gamma=gamma, n_steps=batch_size,
              learning_rate=LR, verbose=1)  # , tensorboard_log=scenario

# Effectively unbounded; training is stopped via the callbacks
model.learn(total_timesteps=episodetimesteps**99, callback=callbacklist)

filename = './%s/evaluations.npz' % scenario
data = np.load(filename)
results = data['results']  # per-evaluation episode rewards from EvalCallback
y = np.average(results, axis=1)
# y = results[:, 0]  # alternative: first evaluation episode only
timesteps = data['timesteps']

plt.plot(timesteps, y)
plt.xlabel('Timesteps')
plt.ylabel('Score')
# plt.show()
savepath = './%s/fig_%s' % (scenario, scenario)
plt.savefig(savepath)
    n_steps += 1
    # Returning False will stop training early
    return True


env = Monitor(env, log_dir, allow_early_resets=True)
env = DummyVecEnv([lambda: env])

if os.path.isfile(model_file):
    model = ACKTR.load(model_file, env=env)
else:
    # Add tensorboard_log="./test/" and run:
    #   tensorboard --logdir /Users/constantin/Documents/bn/rl/test/PPO2_1
    model = ACKTR(
        MlpLnLstmPolicy,
        env,
        tensorboard_log=f"./test{base_test_file}/",
        verbose=0
    )

model.learn(total_timesteps=10**5, callback=callback)

# def evaluate(model, num_steps=1000):
#     obs = env.reset()
#     for i in range(num_steps):
#         # _states are only useful when using LSTM policies
#         action, _states = model.predict(obs)
#         obs, reward, done, info = env.step(action)
#         env.render()
#
# model = PPO2.load("/home/constantin/Desktop/projects/disertation/rl_logs_1_1-20200120T201830Z-001/rl_logs_1_1/1_best_model399.pkl")
# evaluate(model, 30)
    if (n_steps + 1) % 100000 == 0:
        print("Saving checkpoint model")
        _locals['self'].save(model_dir + 'model_{}_steps.pkl'.format(n_steps + 1))
    n_steps += 1
    return True


print('Starting Training')
# Reference signature:
# ACKTR(policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01,
#       vf_coef=0.25, vf_fisher_coef=1.0, learning_rate=0.25,
#       max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear', verbose=0,
#       tensorboard_log=None, _init_setup_model=True,
#       async_eigen_decomp=False, policy_kwargs=None,
#       full_tensorboard_log=False)
model = ACKTR(policy=MlpPolicy, env=env, gamma=0.99, nprocs=1, n_steps=20,
              ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0,
              learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001,
              lr_schedule='linear', verbose=0, tensorboard_log=None,
              _init_setup_model=True)

model.learn(total_timesteps=num_timesteps, callback=custom_callback,
            seed=seed, log_interval=100)

print('Starting evaluation')
env = setup_env_cart_discrete(seed, log_dir)
model.set_env(env)
get_trajectories(model, trajectory_dir, n_trajectories=100)
os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: BaseEnv(10, 10)])
model = ACKTR(get_policy(policy), env, verbose=0,
              tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=10000000, tb_log_name='ACKTR_A2C' + model_tag)
model.save(model_folder + "ACKTR_A2C" + model_tag)

del model
model = ACKTR.load(model_folder + "ACKTR_A2C" + model_tag)

done = False
states = None
obs = env.reset()
while not done:
    action, states = model.predict(obs, states)
    obs, _, done, info = env.step(action)
    env.render()
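# `get_policy` is not shown in this script (nor in the similar ones below).
# A minimal sketch that maps the command-line tag to a stable-baselines
# policy class; the tag names and the fallback are assumptions:
from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, CnnPolicy

def get_policy(name):
    return {
        'mlp': MlpPolicy,
        'lstm': MlpLstmPolicy,
        'cnn': CnnPolicy,
    }.get(name, MlpPolicy)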
import gym
gym.logger.set_level(40)

import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

from env import GoLeftEnv

from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.vec_env import VecVideoRecorder, DummyVecEnv
from stable_baselines.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

env = GoLeftEnv(grid_size=10)
env = make_vec_env(lambda: env, n_envs=1)

# Stop training once the mean reward reaches the threshold
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=0.9, verbose=1)
eval_callback = EvalCallback(env, callback_on_new_best=callback_on_best, verbose=1)

model = ACKTR('MlpPolicy', env, verbose=1)
model.learn(int(1e10), callback=eval_callback)

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

model.save('models/best')
env.close()
def evaluate(model, num_episodes=100):
    # This function will only work for a single Environment
    env = model.get_env()
    all_episode_rewards = []
    for i in range(num_episodes):
        episode_rewards = []
        done = False
        obs = env.reset()
        while not done:
            # _states are only useful when using LSTM policies
            action, _states = model.predict(obs)
            # here, action, rewards and dones are arrays
            # because we are using vectorized env
            obs, reward, done, info = env.step(action)
            episode_rewards.append(reward)

        print("Episode", i, "Reward:", sum(episode_rewards))
        all_episode_rewards.append(sum(episode_rewards))

    mean_episode_reward = np.mean(all_episode_rewards)
    min_episode_reward = np.min(all_episode_rewards)
    print("Mean reward:", mean_episode_reward,
          "Min reward:", min_episode_reward,
          "Num episodes:", num_episodes)

    return mean_episode_reward


# Test the trained agent
evaluate(model, num_episodes=100)
model.learn(500)
evaluate(model, num_episodes=100)
os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: NegativeRewardEnv(map_name='map1')])
model = ACKTR(get_policy(policy), env, verbose=0,
              tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=2500000, tb_log_name='ACKTR_A2C_map1' + model_tag)
model.save(model_folder + "ACKTR_A2C_map1" + model_tag)

del model
model = ACKTR.load(model_folder + "ACKTR_A2C_map1" + model_tag)

done = False
states = None
obs = env.reset()
while not done:
    action, states = model.predict(obs, states)
    obs, _, done, info = env.step(action)
    env.render()
    # model.cliprange = stable_baselines.common.schedules.LinearSchedule(1.0, 0.2, initial_p=0).value
    model.learn(total_timesteps=1000000, reset_num_timesteps=False,
                callback=callback)
    model.save(log_dir + 'model_PPO_' + str(id + 1))

if args.algo == "acktr":
    id = balboa.utils.tensorboard_latest_directory_number(log_dir, 'ACKTR_')
    print('Using acktr')
    if args.load_id is None:
        # tensorboard_log=log_dir
        model = ACKTR("MlpPolicy", env, policy_kwargs=policy_kwargs,
                      ent_coef=0.0, verbose=1)
        # verbose=1, n_steps=48, learning_rate=0.1, lr_schedule='constant',
    else:
        print("Loading model: " + str(args.load_id))
        model = ACKTR.load(log_dir + 'ACKTR_' + str(args.load_id) + ".zip",
                           env=env)
        model.tensorboard_log = log_dir
        # model.learning_rate = stable_baselines.common.schedules.LinearSchedule(1.0, 0.06, initial_p=0.06).value
        # model.cliprange = stable_baselines.common.schedules.LinearSchedule(1.0, 0.2, initial_p=0).value
    model.learn(total_timesteps=3000000, reset_num_timesteps=False,
                callback=callback)
    print("Saving to: " + log_dir + 'ACKTR_' + str(id + 1))
    model.save(log_dir + 'model_ACKTR_' + str(id + 1))
# (The opening of this branch was cut off; reconstructed from the PPO2 branch
# below, so the exact keyword arguments are an assumption.)
if config['algorithm'] == 'ACKTR':
    model = ACKTR(
        config['policy_network'],
        env,
        learning_rate=config['learning_rate'],
        gamma=config['gamma'],
        policy_kwargs=config['policy_kwargs'],
        verbose=1,
        tensorboard_log=save_path)
elif config['algorithm'] == 'PPO2':
    env = make_vec_env(lambda: env, n_envs=1)
    model = PPO2(
        config['policy_network'],
        env,
        learning_rate=config['learning_rate'],
        gamma=config['gamma'],
        policy_kwargs=config['policy_kwargs'],
        verbose=1,
        tensorboard_log=save_path)
elif config['algorithm'] == 'DQN':
    model = DQN(
        config['policy_network'],
        env,
        learning_rate=config['learning_rate'],
        buffer_size=config['buffer_size'],
        target_network_update_freq=64,
        gamma=config['gamma'],
        # policy_kwargs=config['policy_kwargs'],
        verbose=1,
        tensorboard_log=save_path)

model.learn(config['total_steps'], callback=callback)
model.save(os.path.join(save_path, 'model'))
env.close()
os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: ActionMaskEnv()])
model = ACKTR(get_policy(policy), env, verbose=0, gae_lambda=0.95,
              tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=2500000, tb_log_name='ACKTR_PPO2' + model_tag)
model.save(model_folder + "ACKTR_PPO2" + model_tag)

del model
model = ACKTR.load(model_folder + "ACKTR_PPO2" + model_tag)

done = False
states = None
action_masks = []
obs = env.reset()
while not done:
    action, states = model.predict(obs, states, action_mask=action_masks)
    obs, _, done, infos = env.step(action)
    env.render()
    action_masks.clear()
def train(environment, algorithm, timesteps):
    from envs import cpa, mountain_car
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.bench import Monitor
    from stable_baselines import PPO2, ACKTR, DQN, A2C

    now = datetime.now()
    current_time = now.strftime("%Y-%m-%d-%H-%M-%S")

    training_info_dir = "training_info" + os.path.sep
    current_training_info = "{}-{}-{}".format(current_time, algorithm, environment)
    current_training_info_dir = training_info_dir + current_training_info + os.path.sep
    model_file_path = current_training_info_dir + "model"
    log_file_path = current_training_info_dir + "monitor.csv"
    tensorboard_dir = training_info_dir + TENSORBOARD_DIR_NAME + os.path.sep

    dirs_to_create = [model_file_path, tensorboard_dir]
    for directory in dirs_to_create:
        create_dir(directory)

    env = None
    if environment == 'cpa_sparse':
        env = cpa.CPAEnvSparse()
    elif environment == 'cpa_dense':
        env = cpa.CPAEnvDense()
    elif environment == 'mc_sparse':
        env = mountain_car.MountainCarSparseEnv()
    elif environment == 'mc_dense':
        env = mountain_car.MountainCarDenseEnv()
    else:
        raise Exception("Environment '{}' is unknown.".format(environment))

    # Optional: PPO2 requires a vectorized environment to run
    # the env is now wrapped automatically when passing it to the constructor
    env = Monitor(env, filename=log_file_path, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = None
    if algorithm == 'acktr':
        model = ACKTR('MlpPolicy', env, verbose=1, tensorboard_log=tensorboard_dir)
    elif algorithm == 'ppo':
        model = PPO2('MlpPolicy', env, verbose=1, tensorboard_log=tensorboard_dir)
    elif algorithm == 'a2c':
        model = A2C('MlpPolicy', env, verbose=1, tensorboard_log=tensorboard_dir)
    elif algorithm == 'dqn':
        model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=tensorboard_dir)
    else:
        raise Exception("Algorithm '{}' is unknown.".format(algorithm))

    # Train the agent
    model.learn(total_timesteps=timesteps, tb_log_name=current_training_info)
    model.save(model_file_path)
    print("Finished training model: {}. Saved training info in: {}".format(
        model, current_training_info_dir))
def test_action_mask_learn_acktr(vec_env, policy, env_class):
    env = vec_env([env_class] * 2)

    model = ACKTR(policy, env, verbose=0)
    model.learn(total_timesteps=500)
    env.close()
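# Hypothetical invocation of the test helper above; in the original suite the
# arguments are presumably supplied by pytest fixtures or parametrization.
# ActionMaskEnv is borrowed from the earlier action-mask snippet as a stand-in.
from stable_baselines.common.vec_env import DummyVecEnv

test_action_mask_learn_acktr(DummyVecEnv, 'MlpPolicy', ActionMaskEnv)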
class ACKTR_Agent:
    def __init__(self, params: Params):
        self.params: Params = params
        policy_name = self.params.agent_config['policy']
        self.policy = eval(policy_name)

    def create_model(self, n_envs=1):
        """ Create env and agent model """
        env_cls = SprEnv
        self.env = make_vec_env(env_cls, n_envs=n_envs,
                                env_kwargs={"params": self.params},
                                seed=self.params.seed)
        self.model = ACKTR(
            self.policy,
            self.env,
            gamma=self.params.agent_config['gamma'],
            n_steps=self.params.agent_config['n_steps'],
            ent_coef=self.params.agent_config['ent_coef'],
            vf_coef=self.params.agent_config['vf_coef'],
            vf_fisher_coef=self.params.agent_config['vf_fisher_coef'],
            max_grad_norm=self.params.agent_config['max_grad_norm'],
            learning_rate=self.params.agent_config['learning_rate'],
            gae_lambda=self.params.agent_config['gae_lambda'],
            lr_schedule=self.params.agent_config['lr_schedule'],
            kfac_clip=self.params.agent_config['kfac_clip'],
            kfac_update=self.params.agent_config['kfac_update'],
            async_eigen_decomp=self.params.agent_config['async_eigen_decomp'],
            verbose=self.params.agent_config['verbose'],
            tensorboard_log="./tb/acktr/",
            seed=self.params.seed,
            policy_kwargs={"params": self.params})

    def train(self):
        with ProgressBarManager(self.params.training_duration) as callback:
            self.model.learn(total_timesteps=self.params.training_duration,
                             tb_log_name=self.params.tb_log_name,
                             callback=callback)

    def test(self):
        self.params.test_mode = True
        obs = self.env.reset()
        self.setup_writer()
        episode = 1
        step = 0
        episode_reward = [0.0]
        done = False
        # Test for 1 episode
        while not done:
            action, _states = self.model.predict(obs)
            obs, reward, dones, info = self.env.step(action)
            episode_reward[episode - 1] += reward[0]
            if info[0]['sim_time'] >= self.params.testing_duration:
                done = True
                self.write_reward(episode, episode_reward[episode - 1])
                episode += 1
            sys.stdout.write(
                "\rTesting: "
                f"Current Simulator Time: {info[0]['sim_time']}. "
                f"Testing duration: {self.params.testing_duration}")
            sys.stdout.flush()
            step += 1
        print("")

    def save_model(self):
        """ Save the model to a zip archive """
        self.model.save(self.params.model_path)

    def load_model(self, path=None):
        """ Load the model from a zip archive """
        if path is not None:
            self.model = ACKTR.load(path)
        else:
            self.model = ACKTR.load(self.params.model_path)
        # Copy the model to the new directory
        self.model.save(self.params.model_path)

    def setup_writer(self):
        episode_reward_filename = f"{self.params.result_dir}/episode_reward.csv"
        episode_reward_header = ['episode', 'reward']
        self.episode_reward_stream = open(episode_reward_filename, 'a+',
                                          newline='')
        self.episode_reward_writer = csv.writer(self.episode_reward_stream)
        self.episode_reward_writer.writerow(episode_reward_header)

    def write_reward(self, episode, reward):
        self.episode_reward_writer.writerow([episode, reward])
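# Hypothetical end-to-end use of the agent class above; constructing `params`
# is project-specific and not shown in the original.
agent = ACKTR_Agent(params)
agent.create_model(n_envs=1)
agent.train()
agent.save_model()
agent.test()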
        n_episodes += len(episode_rewards[i])

    # Compute mean reward
    mean_reward = round(np.mean(mean_rewards), 1)
    print("Mean reward:", mean_reward, "Num episodes:", n_episodes)

    return mean_reward


# Random Agent, before training
mean_reward_before_train = evaluate(model, num_steps=1000)

n_timesteps = 25000

# Multiprocessed RL Training
start_time = time.time()
model.learn(n_timesteps)
total_time_multi = time.time() - start_time

print("Took {:.2f}s for multiprocessed version - {:.2f} FPS".format(
    total_time_multi, n_timesteps / total_time_multi))

# Evaluate the trained agent
mean_reward = evaluate(model, num_steps=10000)

# Single Process RL Training
single_process_model = ACKTR(MlpPolicy,
                             DummyVecEnv([lambda: gym.make(env_id)]),
                             verbose=0)

start_time = time.time()
single_process_model.learn(n_timesteps)
import gym

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import ACKTR

# multiprocess environment
n_cpu = 4
env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])

model = ACKTR(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("acktr_cartpole")

del model  # remove to demonstrate saving and loading

model = ACKTR.load("acktr_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
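# Optional follow-up: score the reloaded model on a fresh single env instead
# of rendering indefinitely; a minimal sketch using the evaluate_policy helper.
from stable_baselines.common.evaluation import evaluate_policy

eval_env = gym.make('CartPole-v1')
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
print("mean_reward: {:.2f} +/- {:.2f}".format(mean_reward, std_reward))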
# env = CustomEnv(3, 6, "tcp://*:5556")

# Stable Baselines provides you with make_vec_env() helper
# which does exactly the previous steps for you:
# env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

# Create log dir
log_dir = "Logs/Custom_env/"
os.makedirs(log_dir, exist_ok=True)

# Create the callback: check every 500 steps
callback = SaveOnBestTrainingRewardCallback(check_freq=500, log_dir=log_dir)

# env = Monitor(env, log_dir)
model = ACKTR(MlpPolicy, env, verbose=2)
# model.load("DQN_agent")
model.learn(total_timesteps=20000, callback=callback)
model.save("temp_agent")

a = input("Training completed")

obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    probs = model.action_probability(obs)
    obs, rewards, dones, info = env.step(action)
    print("Observation:", obs, rewards, probs)

results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS,
                             "Lane Manager")
plt.show()
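# The SaveOnBestTrainingRewardCallback used above is not defined in this
# snippet; a sketch following the custom-callback example in the
# stable-baselines docs (abridged). Note it reads Monitor logs, so the
# commented-out Monitor wrapper above would need to be enabled.
import os
import numpy as np
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy

class SaveOnBestTrainingRewardCallback(BaseCallback):
    def __init__(self, check_freq, log_dir, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                # Mean training reward over the last 100 episodes
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)
        return True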
def evaluate(model, num_episodes=100):
    # (Function header reconstructed from the identical evaluate() helper
    # earlier in this collection; this variant also renders each step.)
    env = model.get_env()
    all_episode_rewards = []
    for i in range(num_episodes):
        episode_rewards = []
        done = False
        obs = env.reset()
        env.render()
        while not done:
            # _states are only useful when using LSTM policies
            action, _states = model.predict(obs)
            # here, action, rewards and dones are arrays
            # because we are using vectorized env
            obs, reward, done, info = env.step(action)
            print(reward)
            env.render()
            episode_rewards.append(reward)

        print("Episode", i, "Reward:", sum(episode_rewards))
        all_episode_rewards.append(sum(episode_rewards))

    mean_episode_reward = np.mean(all_episode_rewards)
    min_episode_reward = np.min(all_episode_rewards)
    print("Mean reward:", mean_episode_reward,
          "Min reward:", min_episode_reward,
          "Num episodes:", num_episodes)

    return mean_episode_reward


# Test the trained agent
# evaluate(model, num_episodes=100)
# evaluate(model, num_episodes=5)
model.learn(100000)
# evaluate(model, num_episodes=5)
#     index = np.argmin(best_mean_reward)
#     if mean_reward > best_mean_reward[index]:
#         best_mean_reward[index] = mean_reward
#         print('best_mean_reward', best_mean_reward)
#         _locals['self'].save(log_dir + 'best_model_{}.pkl'.format(str(mean_reward)))
#     n_steps += 1
#     return False

# log_dir = 'LiveStream_1229/ACKTRCust3_deletem8_zhongwang_diff_delay/'
log_dir = 'ACKTRtest/'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

os.environ['CUDA_VISIBLE_DEVICES'] = '1'

tstart = time.time()
num_cpu = 2
env = SubprocVecEnv([make_env(i, log_dir) for i in range(num_cpu)])

model = ACKTR(
    env=env,
    policy=LstmCust3Policy,
    verbose=1,
)
model.learn(total_timesteps=int(5e6), callback=callback)
model.save(log_dir + "last_model")
print('Time taken: {:.2f}'.format(time.time() - tstart))