def attention_render(model_name, env_name, num_cpu, log_dir):
    if not os.path.exists(log_dir):
        # raising a bare string is a TypeError in Python 3; raise a real exception
        raise FileNotFoundError('log_dir does not exist: {}'.format(log_dir))
    env_id = env_name + 'NoFrameskip-v4'
    env = SubprocVecEnv([make_env(env_id, i, log_dir) for i in range(num_cpu)])
    # env = Monitor(env, log_dir, allow_early_resets=True)
    if model_name == 'A2C_Attention':
        model = A2C(AttentionPolicy, env, verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C_Attention2':
        model = A2C(Attention2Policy, env, verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C':
        model = A2C(LstmPolicy, env, verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    else:
        # previously fell through with model = None and crashed on model.load
        raise ValueError('Unknown model name: {}'.format(model_name))
    model = model.load(log_dir + model_name + '_' + env_name, env=env)
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        attentions = model.get_attention(obs, _states, done)[0]
        attentions_img = []
        for i, attention in enumerate(attentions):
            # Reshape the flat attention vector to a (H/10, W/10) grid,
            # upsample it 10x along both axes, and scale to [0, 255].
            attention = np.array(attention)
            attention = np.reshape(attention, [
                env.observation_space.shape[0] // 10,
                env.observation_space.shape[1] // 10, 1
            ])
            attention = np.repeat(attention, 10, axis=0)
            attention = np.repeat(attention, 10, axis=1)
            attention = attention * 255
            attentions_img.append(attention)
        attentions = tile_images(attentions_img)
        cv2.imshow('attention', attentions)
        cv2.waitKey(1)
        # break
        env.render()
    return model
def build_model(self):
    if self.is_stack:
        if self.game_type == "box":
            self.env = DummyVecEnv([lambda: self.env])
            self.model = A2C(MlpPolicy, self.env, verbose=0,
                             gamma=self.gamma, learning_rate=self.actor_lr,
                             ent_coef=self.c2, vf_coef=self.critic_lr)
        if self.game_type == "atari":
            self.model = A2C(CnnPolicy, self.env, verbose=0,
                             gamma=self.gamma, learning_rate=self.actor_lr,
                             ent_coef=self.c2, vf_coef=self.critic_lr)
    else:
        if self.game_type == "box":
            self.env = DummyVecEnv([lambda: self.env])
            self.model = A2C(MlpPolicy, self.env, verbose=0,
                             gamma=self.gamma, learning_rate=self.actor_lr,
                             ent_coef=self.c2, vf_coef=self.critic_lr)
        if self.game_type == "atari":
            self.model = A2C(CnnLstmPolicy, self.env, verbose=0,
                             gamma=self.gamma, learning_rate=self.actor_lr,
                             ent_coef=self.c2, vf_coef=self.critic_lr)
def get_model(model_name, env, log_dir):
    if model_name == "A2C_DualAttention":
        model = A2C(DualAttentionLstmPolicy, env, verbose=1)
    elif model_name == "A2C_SelfAttention_Cin":
        model = A2C(SelfAttentionCinLstmPolicy, env, verbose=1)
    elif model_name == "A2C_SelfAttention":
        model = A2C(SelfAttentionLstmPolicy, env, verbose=1)
    elif model_name == 'A2C_Attention':
        model = A2C(AttentionPolicy, env, verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C_Attention2':
        model = A2C(Attention2Policy, env, verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C_Attention3':
        model = A2C(Attention3Policy, env, verbose=1)
    elif model_name == 'A2C_Attention4':
        model = A2C(Attention4Policy, env, verbose=1)
    elif model_name == 'A2C':
        model = A2C(CnnLstmPolicy, env, verbose=1)
    else:
        # raising a bare string is a TypeError in Python 3
        raise ValueError('{} does not exist'.format(model_name))
    return model
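# Usage sketch for get_model (illustrative only: 'A2C' is the one name above
# backed by a stock stable-baselines policy; make_env and the custom attention
# policies are assumed to be defined elsewhere in this repo):
def _example_get_model(log_dir='./logs/'):
    env = SubprocVecEnv([make_env('BreakoutNoFrameskip-v4', i, log_dir)
                         for i in range(4)])
    model = get_model('A2C', env, log_dir)
    model.learn(total_timesteps=10000)
    return model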
def a2c(env, seed):
    return A2C('MlpPolicy', env, learning_rate=0.001, verbose=1,
               tensorboard_log="./data/runs", seed=seed)
def get_a2c(vec_env=None,
            policy='CnnPolicy',
            learning_rate=7e-4,
            momentum=0.0,
            alpha=0.99,
            epsilon=1e-5,
            max_grad_norm=0.5,
            lr_schedule='constant') -> A2C:
    """Parameters' default values are taken from stable_baselines.a2c.a2c.py."""
    if vec_env is None:
        vec_env = create_training_env(1)
    return A2C(policy=policy,
               env=vec_env,
               gamma=0.99,
               n_steps=5,
               vf_coef=0.25,
               ent_coef=0.01,
               max_grad_norm=max_grad_norm,
               learning_rate=learning_rate,
               alpha=alpha,
               momentum=momentum,
               epsilon=epsilon,
               lr_schedule=lr_schedule,
               verbose=2)
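# Minimal training sketch built on get_a2c; 'CartPole-v1' with MlpPolicy
# stands in for the project's create_training_env() output, purely for
# illustration:
def _example_get_a2c():
    import gym
    from stable_baselines.common.vec_env import DummyVecEnv
    vec_env = DummyVecEnv([lambda: gym.make('CartPole-v1')])
    model = get_a2c(vec_env=vec_env, policy='MlpPolicy')
    model.learn(total_timesteps=5000)
    model.save('a2c_cartpole')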
def model_training_learning(env_train, model_name, timesteps=100000):
    # train model
    os.chdir("./model_saved/" + model_name)
    start = time.time()
    print("Train ", model_name, " Model with MlpPolicy: ")
    if model_name == "A2C_Model":
        model = A2C('MlpPolicy', env_train, verbose=0)
    elif model_name == "PPO_Model":
        model = PPO2('MlpPolicy', env_train, verbose=0)
    elif model_name == "TD3_Model":
        model = TD3('MlpPolicy', env_train, verbose=0)
    elif model_name == "SAC_Model":
        model = SAC('MlpPolicy', env_train, verbose=0)
    else:
        # previously fell through and crashed on an undefined `model`
        raise ValueError("Unknown model name: {}".format(model_name))
    print("Learning ", model_name, " time steps: ", timesteps)
    model.learn(total_timesteps=timesteps)
    print(model_name, " learning completed")
    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = (model_name + timestamp)
    model.save(model_file_name)
    print("- ", model_name, " save finished")
    print("Training time ", model_name, " : ", (end - start) / 60, " minutes")
    os.chdir("../..")
    return model
def train(game, num_timesteps, num_envs, dir_name, model_name, prev_model_name):
    dir_name = get_valid_filename(dir_name)
    model_name = get_valid_filename(model_name)
    log_dir = f"logs/{dir_name}/{model_name}-training"
    model_dir = f"models/{dir_name}"
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)
    env = make_vec_envs(game, False, num_envs)
    prev_model_path = f"{model_dir}/{prev_model_name}.zip"
    if prev_model_name is not None and os.path.exists(prev_model_path):
        model = A2C.load(prev_model_path, env=env)
        model.tensorboard_log = log_dir
    else:
        model = A2C(policy="MlpPolicy",
                    env=env,
                    gamma=0.8,
                    n_steps=64,
                    learning_rate=0.00025,
                    verbose=1,
                    tensorboard_log=log_dir)
    model.learn(num_timesteps)
    model.save(f"{model_dir}/{model_name}.zip")
    env.close()
def _train(env_id, agent, model_params, total_steps, is_evaluation=False):
    if is_evaluation:
        # evaluate_policy() must only take one environment
        envs = SubprocVecEnv([make_env(env_id)])
    else:
        envs = SubprocVecEnv([make_env(env_id) for _ in range(NUM_CPU)])
    # normalize the envs during training and evaluation
    envs = VecNormalize(envs)
    # Load a pretrained model during training; pass the env explicitly,
    # since load() alone attaches no environment and learn() would fail.
    if not is_evaluation and os.path.exists(agent + '_' + env_id):
        if agent == 'ppo2':
            model = PPO2.load(agent + '_' + env_id, env=envs)
        elif agent == 'a2c':
            model = A2C.load(agent + '_' + env_id, env=envs)
    else:
        if agent == 'ppo2':
            model = PPO2(MlpLstmPolicy, envs, nminibatches=1, verbose=1,
                         **model_params)
        elif agent == 'a2c':
            model = A2C(MlpLstmPolicy, envs, verbose=1, **model_params)
    model.learn(total_timesteps=total_steps)
    return envs, model
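# Evaluation sketch for _train: VecNormalize keeps updating its running
# statistics by default, so freeze them before scoring the policy
# (the attribute names are stable-baselines 2.x API; argument values are
# illustrative):
def _example_evaluate(env_id, agent, model_params):
    from stable_baselines.common.evaluation import evaluate_policy
    envs, model = _train(env_id, agent, model_params,
                         total_steps=10000, is_evaluation=True)
    envs.training = False      # stop updating obs/return running averages
    envs.norm_reward = False   # report unnormalized rewards
    return evaluate_policy(model, envs, n_eval_episodes=10)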
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env):
    """
    Train A2C model for atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update
        ('linear', 'constant', 'double_linear_con', 'middle_drop' or
        'double_middle_drop')
    :param num_env: (int) The number of environments
    """
    policy_fn = None
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    elif policy == 'lnlstm':
        policy_fn = CnnLnLstmPolicy
    if policy_fn is None:
        raise ValueError("Error: policy {} not implemented".format(policy))
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    model = A2C(policy_fn, env, lr_schedule=lr_schedule, seed=seed)
    model.learn(total_timesteps=int(num_timesteps * 1.1))
    env.close()
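# Example invocation of the atari train() above for a quick smoke test;
# all argument values are illustrative:
def _example_train_atari():
    train('BreakoutNoFrameskip-v4', num_timesteps=int(1e5), seed=0,
          policy='cnn', lr_schedule='constant', num_env=8)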
def define_model(env, log_dir):
    if DEFAULT:
        policy_kwargs = dict()
    else:
        policy_kwargs = dict(act_fun=ACT_FUN, net_arch=NET_ARCH)
    if ALGORITHM == 'ppo2':
        model = PPO2(policy=MlpPolicy, env=env, policy_kwargs=policy_kwargs,
                     verbose=0, tensorboard_log=log_dir)
    elif ALGORITHM == 'a2c':
        model = A2C(policy=MlpPolicy, env=env, policy_kwargs=policy_kwargs,
                    verbose=0, tensorboard_log=log_dir)
    else:
        raise ValueError("Specify a proper algorithm: 'ppo2' or 'a2c'")
    model_arch = model.get_parameter_list()
    print('\n--------------- Summary of archs ---------------')
    for model_param in model_arch:
        print(model_param)
    print('\n')
    return model
def test_monitor():
    # Monitor wraps a fresh Pendulum instance (the original code also built a
    # bare env first and immediately discarded it); the wrapped env is then
    # action-normalized and vectorized.
    env = Monitor(gym.make('Pendulum-v0'), filename=None, allow_early_resets=True)
    normalized_env = NormalizeActionWrapper(env)
    normalized_env = DummyVecEnv([lambda: normalized_env])
    # model
    model_2 = A2C('MlpPolicy', normalized_env, verbose=1).learn(1000)
def load(config, agent, epoch, from_disk=True):
    config = config['ai']
    if not config['enabled']:
        logging.info("ai disabled")
        return False

    logging.info("[ai] bootstrapping dependencies ...")

    from stable_baselines import A2C
    from stable_baselines.common.policies import MlpLstmPolicy
    from stable_baselines.common.vec_env import DummyVecEnv

    import pwnagotchi.ai.gym as wrappers

    env = wrappers.Environment(agent, epoch)
    env = DummyVecEnv([lambda: env])

    logging.info("[ai] bootstrapping model ...")

    a2c = A2C(MlpLstmPolicy, env, **config['params'])
    if from_disk and os.path.exists(config['path']):
        logging.info("[ai] loading %s ..." % config['path'])
        # A2C.load is a classmethod that returns a new model; rebind the
        # result, otherwise the weights on disk are silently discarded.
        a2c = A2C.load(config['path'], env)
    else:
        logging.info("[ai] model created:")
        for key, value in config['params'].items():
            logging.info("  %s: %s" % (key, value))

    return a2c
def train():
    """Trains an A2C policy."""
    env = create_env()
    model = A2C(policy=CnnPolicy,
                env=env,
                gamma=0.99,
                n_steps=5,
                vf_coef=0.25,
                ent_coef=0.01,
                max_grad_norm=0.5,
                learning_rate=7e-4,
                alpha=0.99,
                epsilon=1e-05,
                lr_schedule='constant',
                verbose=1,
                tensorboard_log="./tb")
    model.learn(total_timesteps=int(1e7),
                callback=callback,
                tb_log_name="a2c")
    model.save("models/pacman_a2c.pkl")
def train_agent_with_a2c(load=False):
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common.vec_env import SubprocVecEnv
    from stable_baselines import A2C

    # Single environment. The multiprocess SubprocVecEnv variant is left
    # commented out: the original code built it and then immediately
    # overwrote it with the bare env below, leaking four worker processes.
    # n_cpu = 4
    # env = SubprocVecEnv([lambda: gym.make('F16GCAS-v0') for i in range(n_cpu)])
    env = gym.make("F16GCAS-v0")

    class CustomPolicy(MlpPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs, layers=[128, 128])

    if not load:
        model = A2C(env=env, verbose=1, policy=CustomPolicy)
        # model.learn(total_timesteps=1000000)
        ExpData = ExpertDataset("./lqr_export.npz")
        model.pretrain(ExpData, n_epochs=100)
    else:
        model = A2C.load(ROOT + "/trained_models/TDRL/f16/a2c/128_128", env=env)
        with model.graph.as_default():
            for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/pi/'):
                print(i)
    return model
def train_a2c(seed):
    """
    test A2C on the uav_env (cartesian, discrete)

    :param seed: (int) random seed for A2C

    A2C(policy, env, gamma=0.99, n_steps=5, vf_coef=0.25, ent_coef=0.01,
        max_grad_norm=0.5, learning_rate=0.0007, alpha=0.99, epsilon=1e-05,
        lr_schedule='linear', verbose=0, tensorboard_log=None,
        _init_setup_model=True)
    """
    algo = 'A2C'
    num_timesteps = 3000000
    env = set_up_env(seed)

    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0

    model = A2C(policy=MlpPolicy, env=env, gamma=0.99, n_steps=5, vf_coef=0.25,
                ent_coef=0.01, max_grad_norm=0.5, learning_rate=0.0007,
                alpha=0.99, epsilon=1e-05, lr_schedule='linear', verbose=0,
                tensorboard_log="./logs/{}/tensorboard/{}/".format(
                    EXPERIMENT_NATURE, algo))
    model.learn(total_timesteps=num_timesteps, callback=callback, seed=seed,
                log_interval=500, tb_log_name="seed_{}".format(seed))

    model = A2C.load(log_dir + 'best_model.pkl')
    evaluation = evaluate_model(env, model, 100)
    os.makedirs('./logs/{}/csv/{}/'.format(EXPERIMENT_NATURE, algo), exist_ok=True)
    os.rename('/tmp/gym/monitor.csv',
              "./logs/{}/csv/{}/seed_{}.csv".format(EXPERIMENT_NATURE, algo, seed))
    env.close()
    del model, env
    gc.collect()
    return evaluation
def __init__(self, method, K=5, P=0.95):
    self.method = method
    self.K = K
    self.state_size = self.K + 1
    self.action_size = self.K + 1
    self.reward = []
    env_name = 'ErdosAttack-v0'
    self.log_dir = "/tmp/gym_attack/"
    os.makedirs(self.log_dir, exist_ok=True)
    env = gym.make(env_name)
    env.init_params(K, P)
    env = Monitor(env, self.log_dir, allow_early_resets=True)
    self.envs = DummyVecEnv([lambda: env])
    if method == 'PPO':
        self.model = PPO2(MLP_PPO, self.envs, verbose=0)
    elif method == 'DQN':
        self.model = DQN(MLP_DQN, self.envs, verbose=0)
    elif method == 'A2C':
        self.model = A2C(MLP_A2C, self.envs, verbose=0)
    else:
        raise ValueError("Error! method must be 'PPO', 'DQN' or 'A2C'")
    print("Model initialized!")
    self.best_mean_reward, self.n_steps = -np.inf, 0
def test_evaluate_policy():
    model = A2C('MlpPolicy', 'Pendulum-v0', seed=0)
    n_steps_per_episode, n_eval_episodes = 200, 2
    model.n_callback_calls = 0

    def dummy_callback(locals_, _globals):
        locals_['model'].n_callback_calls += 1

    _, episode_lengths = evaluate_policy(model, model.get_env(),
                                         n_eval_episodes, deterministic=True,
                                         render=False, callback=dummy_callback,
                                         reward_threshold=None,
                                         return_episode_rewards=True)
    n_steps = sum(episode_lengths)
    assert n_steps == n_steps_per_episode * n_eval_episodes
    assert n_steps == model.n_callback_calls

    # Reaching a mean reward of zero is impossible with the Pendulum env
    with pytest.raises(AssertionError):
        evaluate_policy(model, model.get_env(), n_eval_episodes,
                        reward_threshold=0.0)

    episode_rewards, _ = evaluate_policy(model, model.get_env(),
                                         n_eval_episodes,
                                         return_episode_rewards=True)
    assert len(episode_rewards) == n_eval_episodes
def run_agent(envs, parameters):
    '''Train an agent.'''
    alg = parameters['alg']
    learning_rate = parameters['learning_rate']
    gamma = parameters['gamma']
    model_path = parameters['model_path']
    set_global_seeds(parameters.get('seed'))
    dummy_env = OptVecEnv(envs)
    if alg == 'PPO':
        model = PPO2(MlpPolicy, dummy_env, gamma=gamma,
                     learning_rate=learning_rate, verbose=1,
                     nminibatches=dummy_env.num_envs)
    elif alg == 'A2C':
        model = A2C(MlpPolicy, dummy_env, gamma=gamma,
                    learning_rate=learning_rate, verbose=1)
    else:
        model = DDPG(ddpg.MlpPolicy, dummy_env, gamma=gamma, verbose=1,
                     actor_lr=learning_rate / 10, critic_lr=learning_rate)
    try:
        model.learn(total_timesteps=parameters.get('total_timesteps', 10**6))
    except tf.errors.InvalidArgumentError:
        LOGGER.error('Possible NaN, %s', str((alg, learning_rate, gamma)))
    finally:
        dummy_env.close()
        model.save(str(model_path))
def run():
    sum_rewards = []
    good_reward = False
    # env = SubprocVecEnv([lambda: simulation.atc_gym.AtcGym() for i in range(n_cpu)])
    # env = DummyVecEnv([lambda: simulation.atc_gym.AtcGym()])
    env = DummyVecEnv([lambda: simulation.simulator.AirplaneSimulator()])
    # model = PPO2(MlpPolicy, env, verbose=1)
    model = A2C(MlpPolicy, env, verbose=1, tensorboard_log='./')
    model.learn(total_timesteps=num_train_steps)
    obs = env.reset()
    for i in range(num_eval_steps):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        sum_rewards.append(rewards[0])
        if done:
            obs = env.reset()
        # print("Reward: ", rewards, "Done: ", done)
        if rewards[0] != -100:
            print("Reward: ", rewards, "Done: ", done)
            good_reward = True
        # env.render()
    if not good_reward:
        print("Boo")
    print("Average reward across {0} steps: {1}".format(
        num_eval_steps, sum(sum_rewards) / len(sum_rewards)))
def optimize_agent(trial):
    """ Train the model and optimise
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    model_params = optimize_a2c(trial)
    seed = trial.suggest_int('numpyseed', 1, 2147483647)
    np.random.seed(seed)
    original_env = gym.make('rustyblocks-v0', use_cnn=use_cnn,
                            simple_reward=simple_reward)
    env = DummyVecEnv([lambda: original_env])
    policy = "CnnPolicy" if use_cnn else "MlpPolicy"
    policy_kwargs = dict(net_arch=[dict(pi=[128, 128, 128], vf=[128, 128, 128])])
    model = A2C(policy, env, verbose=0, policy_kwargs=policy_kwargs,
                **model_params)
    print("DOING LEARNING a2c")
    original_env.force_progression = False
    has_nan = False

    def learn_callback(a, b):
        # Without `nonlocal`, the assignment would create a new local
        # variable and the outer `has_nan` flag would never be set.
        nonlocal has_nan
        has_nan = np.isnan(a["actions"]).any()
        return not has_nan

    model.learn(int(2e4 * 5), seed=seed, callback=learn_callback)
    print("DONE LEARNING a2c, wins gotten:", original_env.wins)
    if has_nan:
        trial.report(-15.0)
        print("ERRORED WITH NAN")
        return -15.0
    rewards = []
    n_episodes, reward_sum = 0, 0.0
    obs = env.reset()
    original_env.wins = 0
    start = time.time()
    while n_episodes < 1000:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()
    end = time.time()
    last_reward = np.mean(rewards)
    trial.report(last_reward)
    print("done testing parameters; average reward, wins and time elapsed are:",
          last_reward, original_env.wins, end - start)
    return last_reward
def main(experiment_params):
    # define experiment params
    with open(experiment_params, "rb") as f:
        params = json.load(f)

    # global variables for callback fn
    global eval_increment, rh_kwargs, rh_schedule, model, eval_env
    experiment = params['experiment_name']
    env_name = params['env_name']
    rh_kwargs = params['rh_kwargs']
    rh_schedule = {int(k): v for k, v in params['rh_schedule'].items()}
    verbose_env = params['verbose_env']
    total_steps = params['total_steps']
    train_subset = params['train_subset']
    eval_increment = params['eval_increment']

    # setup env
    register(
        id=env_name,
        entry_point=f'gym_summarizer.envs:{env_name.split("-")[0]}',
    )
    reward_helper = RewardHelper(**rh_schedule[0], **rh_kwargs)
    env = gym.make(env_name, reward_helper=reward_helper, verbose=verbose_env)
    env.data_loader.early_stop = train_subset
    env = DummyVecEnv([lambda: env])

    # define callback function and evaluation env
    eval_dataloader = BatchCNNDMLoader('data/finished_files/test/')
    eval_env = gym.make(env_name, data_loader=eval_dataloader,
                        reward_helper=RewardHelper('average', 'f',
                                                   is_terminal=True),
                        verbose=False)

    # define model and learn
    model = A2C(MlpPolicy, env, tensorboard_log=f"experiments/{experiment}/",
                verbose=0, n_steps=2)
    model.learn(total_timesteps=total_steps, callback=reward_schedule_callback)

    # save model and callbacks output
    model.save(f"{experiment}.model")
    with open(f"experiments/{experiment}_returns.pkl", 'wb') as f:
        pickle.dump(returns, f)
    with open(f"experiments/{experiment}_eval.pkl", 'wb') as f:
        pickle.dump(eval_scores, f)

    # plot returns
    df = pd.DataFrame(returns)
    plt.plot(df[0], 'lightblue', df[0].rolling(1000).mean(), 'blue')
    plt.title(f'Training: {experiment}')
    plt.xlabel('Num Episodes')
    plt.ylabel('Episode Reward')
    plt.legend(['Raw', 'Smoothed'])
    plt.show()
def load(config, agent, epoch, from_disk=True):
    config = config['ai']
    if not config['enabled']:
        logging.info("ai disabled")
        return False

    try:
        begin = time.time()
        logging.info("[ai] bootstrapping dependencies ...")

        start = time.time()
        from stable_baselines import A2C
        logging.debug("[ai] A2C imported in %.2fs" % (time.time() - start))

        start = time.time()
        from stable_baselines.common.policies import MlpLstmPolicy
        logging.debug("[ai] MlpLstmPolicy imported in %.2fs" % (time.time() - start))

        start = time.time()
        from stable_baselines.common.vec_env import DummyVecEnv
        logging.debug("[ai] DummyVecEnv imported in %.2fs" % (time.time() - start))

        start = time.time()
        import pwnagotchi.ai.gym as wrappers
        logging.debug("[ai] gym wrapper imported in %.2fs" % (time.time() - start))

        env = wrappers.Environment(agent, epoch)
        env = DummyVecEnv([lambda: env])

        logging.info("[ai] creating model ...")
        start = time.time()
        a2c = A2C(MlpLstmPolicy, env, **config['params'])
        logging.debug("[ai] A2C created in %.2fs" % (time.time() - start))

        if from_disk and os.path.exists(config['path']):
            logging.info("[ai] loading %s ..." % config['path'])
            start = time.time()
            # A2C.load is a classmethod returning a new model; rebind the
            # result or the weights on disk are never actually used.
            a2c = A2C.load(config['path'], env)
            logging.debug("[ai] A2C loaded in %.2fs" % (time.time() - start))
        else:
            logging.info("[ai] model created:")
            for key, value in config['params'].items():
                logging.info("  %s: %s" % (key, value))

        logging.debug("[ai] total loading time is %.2fs" % (time.time() - begin))
        return a2c
    except Exception:
        logging.exception("error while starting AI")
        logging.warning("[ai] AI not loaded!")
        return False
def load(agent, from_disk=True):
    try:
        env = wrappers.Environment(agent)
        env = DummyVecEnv([lambda: env])
        a2c = A2C(MlpLstmPolicy, env)
        return a2c
    except Exception:
        logging.exception("error while starting AI")
        return False
def construct_model(seed: int) -> A2C:
    # Note: `env` is expected to be defined at module scope.
    return A2C(MlpPolicy,
               env,
               verbose=0,
               tensorboard_log="/tmp/a2c/",
               ent_coef=0.0,
               gamma=0.95,
               n_cpu_tf_sess=1,
               seed=seed)
def train_a2c(env_name='fb-v0', e=0):
    # multiprocess environment
    n_cpu = 4
    # Bind `i` as a default argument: a plain `lambda: ... seed=i` captures
    # `i` late, so every worker would end up with the same (last) seed.
    env = SubprocVecEnv([
        lambda i=i: BookmakerEnv(data_path='csv_files/train-12122019.json',
                                 seed=i)
        for i in range(n_cpu)
    ])
    model = A2C(MlpPolicy, env, verbose=2, tensorboard_log='tensorboard')
    model.learn(total_timesteps=100000000, seed=1)
    model.save(f"saved/{A2C.__name__}_{env_name}_{e}")
def launch_training(nb_cpu, name_agent, name_env, total_timesteps, text):
    env_name = name_env
    n_cpu = nb_cpu
    policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[512, 512])
    print('TB available at := ', tensorboard_log_dir, file=sys.stderr)

    def make_monitored_env():
        # Build a fresh env per worker; the original code wrapped a single
        # instance in a lambda and handed it to every SubprocVecEnv worker.
        return Monitor(FluidMechanicsEnv(), console_log_dir,
                       allow_early_resets=True)

    if name_agent == 'A2C':
        env = SubprocVecEnv([make_monitored_env for i in range(n_cpu)])
        model = A2C(MlpPolicy, env, n_steps=20, gamma=0.9, verbose=1,
                    tensorboard_log=tensorboard_log_dir,
                    policy_kwargs=policy_kwargs)
        # model = A2C.load("first_test")
        model_name = "A2C_default_Mlp" + text
    elif name_agent == 'PPO2':
        env = SubprocVecEnv([make_monitored_env for i in range(n_cpu)])
        model = PPO2(MlpPolicy, env, n_steps=80, gamma=0.97, verbose=1,
                     tensorboard_log=tensorboard_log_dir,
                     policy_kwargs=policy_kwargs)
        model_name = "PPO2_default_Mlp" + text
    elif name_agent == 'TRPO':
        env = DummyVecEnv([make_monitored_env for i in range(n_cpu)])
        model = TRPO(MlpPolicy, env, gamma=0.1, verbose=1,
                     tensorboard_log=tensorboard_log_dir,
                     policy_kwargs=policy_kwargs)
        model_name = "TRPO_default_Mlp" + text

    time = datetime.now().strftime('%Y-%m-%d_%H_%M_%S')
    log_name = f"_model={model_name}_time={time}"
    print('with the following line := ', 'tensorboard --logdir ',
          tensorboard_log_dir + log_name)
    training_log = open(f"{console_log_dir}/{log_name}.log", "a")
    sys.stdout = training_log
    logging.basicConfig(level=logging.INFO,
                        filename=f"{console_log_dir}/{log_name}.log",
                        datefmt='%H:%M:%S',
                        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s')
    model_file_name = f"{models_log_dir}{log_name}_best.pkl"

    start = datetime.now()
    print("Learning model", file=sys.stderr)
    model.learn(total_timesteps=int(total_timesteps), tb_log_name=log_name,
                callback=callback)
    training_time = datetime.now() - start
    print(f"Training time: {training_time}", file=sys.stderr)
    print("Saving final model", file=sys.stderr)
    model.save(f"{models_log_dir}{log_name}_final.pkl")
def eval_model(model, test_env_id):
    global eval_step, THRESHOLD
    test_success, curriculum_success = True, True
    performance_data[eval_step] = {}
    for env_id in env_ids:
        write_out("[MODEL EVAL]\tTesting learner on env: {}".format(env_id))
        env, eval_env, eval_callback = init_env(env_id)
        fresh_model = A2C(CnnPolicy, env, verbose=verbose)
        fresh_model.learn(total_timesteps=max_steps, callback=eval_callback)
        fresh_mean, fresh_std = evaluate_policy(fresh_model, eval_env,
                                                n_eval_episodes=100)
        model_mean, model_std = evaluate_policy(model, eval_env,
                                                n_eval_episodes=100)
        performance_data[eval_step][env_id] = {
            'baseline_mean': fresh_mean,
            'baseline_std': fresh_std,
            'model_mean': model_mean,
            'model_std': model_std,
            'baseline_training_steps': max_steps,
            'eval_episodes': 100
        }
        write_out("[MODEL EVAL: LEARNER] \t env_id: {}, Mean Reward: {}, std_dev: {}"
                  .format(env_id, model_mean, model_std))
        write_out("[MODEL EVAL: BASELINE]\t env_id: {}, Mean Reward: {}, std_dev: {}"
                  .format(env_id, fresh_mean, fresh_std))
        pass_test = round(model_mean - model_std, 3) >= round(
            fresh_mean - fresh_std, 3)
        diff = abs(round(model_mean - model_std, 3) -
                   round(fresh_mean - fresh_std, 3))
        if pass_test:
            write_out("[TEST RESULT]\tmodel out-performs fresh model for env: {}, diff: {}"
                      .format(env_id, diff))
        else:
            write_out("[TEST RESULT]\tmodel DID NOT out-perform fresh model for env: {}, diff: {}"
                      .format(env_id, diff))
            if env_id == test_env_id:
                test_success = False
    curriculum_success = sum([
        performance_data[eval_step][env_id]['baseline_mean'] > THRESHOLD
        for env_id in env_ids
    ]) == len(env_ids)
    eval_step += 1
    return test_success, curriculum_success
def make_new_model(model_type, policy, env, tensorboard_log=None):
    if model_type.lower() == 'dqn':
        model = DQN(policy, env, tensorboard_log=tensorboard_log)
    elif model_type.lower() == 'ppo2':
        model = PPO2(policy, env, tensorboard_log=tensorboard_log)
    elif model_type.lower() == 'a2c':
        model = A2C(policy, env, tensorboard_log=tensorboard_log)
    elif model_type.lower() == 'acktr':
        model = ACKTR(policy, env, tensorboard_log=tensorboard_log)
    else:
        # avoid returning an undefined name for unknown model types
        raise ValueError('Unknown model type: {}'.format(model_type))
    return model
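# Usage sketch for make_new_model; passing the policy as the string
# 'MlpPolicy' lets stable-baselines resolve it, so no policy import is
# needed here (env choice is illustrative):
def _example_make_new_model():
    import gym
    from stable_baselines.common.vec_env import DummyVecEnv
    env = DummyVecEnv([lambda: gym.make('CartPole-v1')])
    model = make_new_model('a2c', 'MlpPolicy', env, tensorboard_log='./tb/')
    model.learn(total_timesteps=1000)
    return model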
def make_model(config, env, action_noise_fun):
    model = None
    if config["algo_name"] == "A2C" and config["policy_name"] == "MlpPolicy":
        model = A2C(config["policy_name"],
                    env=env,
                    gamma=config["gamma"],
                    n_steps=config["n_steps"],
                    vf_coef=config["vf_coef"],
                    ent_coef=config["ent_coef"],
                    max_grad_norm=config["max_grad_norm"],
                    learning_rate=config["sb2_learning_rate"],
                    alpha=config["alpha"],
                    epsilon=config["epsilon"],
                    lr_schedule=config["lr_schedule"],
                    verbose=config["verbose"],
                    tensorboard_log="./tb/{}/".format(config["session_ID"]),
                    full_tensorboard_log=config["full_tensorboard_log"],
                    policy_kwargs=dict(net_arch=[int(config["policy_hid_dim"]),
                                                 int(config["policy_hid_dim"])]))
    if config["algo_name"] == "A2C" and config["policy_name"] == "MlpLstmPolicy":
        model = A2C(CustomLSTMPolicy,
                    env=env,
                    gamma=config["gamma"],
                    n_steps=config["n_steps"],
                    vf_coef=config["vf_coef"],
                    ent_coef=config["ent_coef"],
                    max_grad_norm=config["max_grad_norm"],
                    learning_rate=config["sb2_learning_rate"],
                    alpha=config["alpha"],
                    epsilon=config["epsilon"],
                    lr_schedule=config["lr_schedule"],
                    verbose=config["verbose"],
                    tensorboard_log="./tb/{}/".format(config["session_ID"]),
                    full_tensorboard_log=config["full_tensorboard_log"])
    assert model is not None, "Algo name not found, exiting."
    return model
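# An illustrative config dict that satisfies make_model's A2C/MlpPolicy
# branch; the values are placeholders, only the key names mirror the reads
# above:
EXAMPLE_A2C_CONFIG = {
    "algo_name": "A2C", "policy_name": "MlpPolicy",
    "gamma": 0.99, "n_steps": 5, "vf_coef": 0.25, "ent_coef": 0.01,
    "max_grad_norm": 0.5, "sb2_learning_rate": 7e-4, "alpha": 0.99,
    "epsilon": 1e-5, "lr_schedule": "constant", "verbose": 1,
    "session_ID": "demo", "full_tensorboard_log": False,
    "policy_hid_dim": 64,
}
# model = make_model(EXAMPLE_A2C_CONFIG, env, action_noise_fun=None)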
def train_A2C(env_train, model_name, timesteps=50000):
    """A2C model"""
    start = time.time()
    model = A2C('MlpPolicy', env_train, verbose=0)
    model.learn(total_timesteps=timesteps)
    end = time.time()
    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (A2C): ', (end - start) / 60, ' minutes')
    return model
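# Usage sketch for train_A2C; a DummyVecEnv-wrapped Gym env stands in for
# the project's env_train, and config.TRAINED_MODEL_DIR must already exist:
def _example_train_A2C():
    import gym
    from stable_baselines.common.vec_env import DummyVecEnv
    env_train = DummyVecEnv([lambda: gym.make('CartPole-v1')])
    return train_A2C(env_train, model_name='a2c_cartpole_demo', timesteps=10000)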