def test(test_data, model_location):
    # Use a separate environment to test the model
    env_test = SubprocVecEnv(
        [lambda: ExchangeEnv.ExchangeEnv(test_data, 10000, 0)])
    model = PPO2.load(model_location)

    obs = env_test.reset()
    done = False
    price_history = []
    portfolio_value = []
    while not done:
        action, _states = model.predict(obs)
        obs, rewards, done, _ = env_test.step(action)
        # Append the current time step's highest bid
        price_history.append(obs[0][0][0])
        # Append the current portfolio value
        portfolio_value.append(rewards[0])

    with open("price_history.txt", "w") as f:
        writer = csv.writer(f)
        writer.writerow(price_history)
    with open("portfolio_value.txt", "w") as f:
        writer = csv.writer(f)
        writer.writerow(portfolio_value)
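
# Illustrative sketch (not part of the original project): read back the two
# files written by test() above and plot them. Assumes matplotlib is
# available; the file names match those used in test().
def plot_test_run():
    import csv
    import matplotlib.pyplot as plt

    with open("price_history.txt") as f:
        prices = [float(x) for x in next(csv.reader(f))]
    with open("portfolio_value.txt") as f:
        values = [float(x) for x in next(csv.reader(f))]

    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
    ax1.plot(prices)
    ax1.set_ylabel("highest bid")
    ax2.plot(values)
    ax2.set_ylabel("portfolio value")
    ax2.set_xlabel("time step")
    plt.show()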
def environment(self, environment: 'BitmexEnvironment'):
    envs = [lambda: environment for _ in range(self._n_env)]
    if self._n_env == 1:
        self._environment = DummyVecEnv(envs)
    else:
        self._environment = SubprocVecEnv(envs)
def train(self, num_e=1, n_timesteps=1000000, save='saves/agent4'):
    env_id = "default"
    num_e = 1  # Number of processes to use
    # Create the vectorized environment
    #env = DummyVecEnv([lambda: env])
    self.env = SubprocVecEnv(
        [self.make_env(env_id, i) for i in range(num_e)])
    self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
    #self.model = PPO2(policy=CnnPolicy,
    #                  env=SubprocVecEnv(self.env_fns),
    #                  n_steps=8192,
    #                  nminibatches=8,
    #                  lam=0.95,
    #                  gamma=0.99,
    #                  noptepochs=4,
    #                  ent_coef=0.001,
    #                  learning_rate=lambda _: 2e-5,
    #                  cliprange=lambda _: 0.2,
    #                  verbose=1,
    #                  tensorboard_log="./breakorbust")
    self.model = PPO2(CustomPolicy,
                      env=self.env,
                      verbose=0,
                      learning_rate=1e-5,
                      tensorboard_log=save)
    for i in range(10):
        self.model.learn(n_timesteps)
        self.model.save(save)
def main():
    env = SubprocVecEnv([(lambda i=i: SwocGym(
        i + 1, GameServicePath, i, actionRepeat=4, oneTarget=True))
        for i in range(4)])
    try:
        model = PPO2("MlpPolicy",
                     env,
                     verbose=1,
                     policy_kwargs={
                         'net_arch': [256, 256, 256, 128, 128, 128],
                         'act_fun': tf.nn.relu
                     },
                     n_steps=256,
                     ent_coef=0.0,
                     learning_rate=1e-5)
        if SaveFile.exists():
            print('loading...')
            model.load_parameters(SaveFile)
        else:
            print('Warning: No save file loaded')

        print('evaluating...', end='')
        totalRewards = evaluate(env, model)
        print(f'mean reward: {np.mean(totalRewards)}')
    except KeyboardInterrupt:
        print('closing...')
    finally:
        env.close()
        print('closed')
def main(mode="train"):
    n_cpu = 2
    env = SubprocVecEnv(
        [lambda: gym.make('balancebot-continuum-v0') for i in range(n_cpu)])

    if mode == "train":
        model = PPO2(policy=MlpPolicy,
                     env=env,
                     learning_rate=1e-3,
                     verbose=0,
                     full_tensorboard_log=False,
                     tensorboard_log="./ppo2_balancebot_tensorboard")
        model.learn(total_timesteps=100000, callback=callback)
        print("Saving model to ppo2_balance_continuum.pkl")
        model.save("ppo2_balance_continuum.pkl")
        del model  # remove to demonstrate saving and loading

    if mode == "test":
        model = PPO2.load("ppo2_balance_continuum.pkl")
        obs = env.reset()
        done = [False, False]
        # env.set_done(5000)
        while not all(done):
            action, _states = model.predict(obs)
            obs, rewards, done, info = env.step(action)
            # env.render()
            print(obs)
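
# main() above passes a `callback` to model.learn() that is not defined in
# this snippet. In stable-baselines 2 a plain function taking
# (locals_, globals_) and returning a bool can serve as a callback; returning
# False stops training early. This is only a hypothetical sketch of what that
# callback might look like, not the original project's implementation.
def callback(locals_, globals_):
    """Hypothetical training callback: report progress and keep training."""
    model_ = locals_["self"]  # the model currently being trained
    print("timesteps so far:", model_.num_timesteps)
    return True  # returning False would abort training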
def _train(env_id, agent, model_params, total_steps, is_evaluation=False):
    if is_evaluation:
        # evaluate_policy() must only take one environment
        envs = SubprocVecEnv([make_env(env_id)])
    else:
        envs = SubprocVecEnv([make_env(env_id) for _ in range(NUM_CPU)])
    # normalize the envs during training and evaluation
    envs = VecNormalize(envs)

    # Load pretrained model during training.
    if not is_evaluation and os.path.exists(agent + '_' + env_id):
        if agent == 'ppo2':
            model = PPO2.load(agent + '_' + env_id)
        elif agent == 'a2c':
            model = A2C.load(agent + '_' + env_id)
    else:
        if agent == 'ppo2':
            model = PPO2(MlpLstmPolicy, envs, nminibatches=1, verbose=1,
                         **model_params)
        elif agent == 'a2c':
            model = A2C(MlpLstmPolicy, envs, verbose=1, **model_params)

    model.learn(total_timesteps=total_steps)
    return envs, model
def create_envs(self, game_name, state_name, num_env):
    for i in range(num_env):
        self.env_fns.append(
            partial(make_env, game=game_name, state=state_name))
        self.env_names.append(game_name + '-' + state_name)
    self.env = SubprocVecEnv(self.env_fns)
def test(model_name, env_name, num_cpu, log_dir):
    env = SubprocVecEnv([
        make_football_env(env_name, i, log_dir, useMonitor=False)
        for i in range(num_cpu)
    ])
    # env = Monitor(env, log_dir, allow_early_resets=True)
    model = get_model(model_name, env, log_dir)
    model = model.load(log_dir + model_name + '_' + env_name, env=env)
    obs = env.reset()

    from matplotlib import pyplot as plt
    show_num = 1
    while True:
        action, _states = model.predict(obs)
        # obs, rewards, done, info = env.step([int(input('action:'))]*num_cpu)
        obs, rewards, done, info = env.step(action)
        img = obs[show_num, :, :, :]
        fig = plt.figure(0)
        plt.clf()
        plt.imshow(img / 255)
        fig.canvas.draw()
        # env.render()
        plt.pause(0.000001)
def generate(parameter_distribution, num_episodes, env_update_fn,
             filepath=None, n_cpu=6):
    env_name = 'CartPole-v1'
    model_dir = os.path.join(os.getcwd(), 'models')
    model_path = os.path.join(model_dir, 'ppo2_' + env_name + '.pkl')
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    def make_env(env_name):
        env = gym.make(env_name)
        return env

    env = SubprocVecEnv([lambda: make_env(env_name) for i in range(n_cpu)])
    try:
        model = PPO2.load(model_path)
    except Exception as e:
        trainer = CartPoleTrainer(env)
        model = trainer.train(model_path)

    obs = env.reset()
    # Switch to a single (non-vectorized) environment for data collection
    env = make_env(env_name)
    states, actions, next_states, parameters, steps = [], [], [], [], []
    for ep in range(num_episodes):
        obs = env.reset()
        params = parameter_distribution()
        env_update_fn(env.unwrapped, params)
        done = False
        step = 0
        while not done:
            action, _states = model.predict(obs)
            states.append(obs)
            actions.append([action])
            obs, reward, done, info = env.step(action)
            next_states.append(obs)
            parameters.append(params)
            steps.append(step)
            step += 1

    data = {
        'states': np.array(states),
        'actions': np.array(actions),
        'next_states': np.array(next_states),
        'parameters': np.array(parameters),
        'steps': np.array(steps)
    }
    if filepath:
        print('filepath: ', filepath)
        with open(filepath, 'wb') as f:
            np.save(f, data)
    return data
def launch_training(nb_cpu, name_agent, name_env, total_timesteps, text):
    env_name = name_env
    #n_cpu = 8
    n_cpu = nb_cpu
    policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[512, 512])
    print('TB available at := ', tensorboard_log_dir, file=sys.stderr)

    if name_agent == 'A2C':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir, allow_early_resets=True)
        env = SubprocVecEnv([lambda: env_ for i in range(n_cpu)])
        model = A2C(MlpPolicy, env, n_steps=20, gamma=0.9, verbose=1,
                    tensorboard_log=tensorboard_log_dir,
                    policy_kwargs=policy_kwargs)
        #model = A2C.load("first_test")
        model_name = "A2C_default_Mlp" + text
    elif name_agent == 'PPO2':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir, allow_early_resets=True)
        env = SubprocVecEnv([lambda: env_ for i in range(n_cpu)])
        model = PPO2(MlpPolicy, env, n_steps=80, gamma=0.97, verbose=1,
                     tensorboard_log=tensorboard_log_dir,
                     policy_kwargs=policy_kwargs)
        #model = A2C.load("first_test")
        model_name = "PPO2_default_Mlp" + text
    elif name_agent == 'TRPO':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir, allow_early_resets=True)
        env = DummyVecEnv([lambda: env_ for i in range(n_cpu)])
        model = TRPO(MlpPolicy, env, gamma=0.1, verbose=1,
                     tensorboard_log=tensorboard_log_dir,
                     policy_kwargs=policy_kwargs)
        #model = A2C.load("first_test")
        model_name = "TRPO_default_Mlp" + text

    time = datetime.now().strftime('%Y-%m-%d_%H_%M_%S')
    log_name = f"_model={model_name}_time={time}"
    print('with the following line := ', 'tensorboard --logdir ',
          tensorboard_log_dir + log_name)

    training_log = open(f"{console_log_dir}/{log_name}.log", "a")
    sys.stdout = training_log
    logging.basicConfig(
        level=logging.INFO,
        filename=f"{console_log_dir}/{log_name}.log",
        datefmt='%H:%M:%S',
        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s')

    model_file_name = f"{models_log_dir}{log_name}_best.pkl"
    start = datetime.now()
    print("Learning model", file=sys.stderr)
    model.learn(total_timesteps=int(total_timesteps),
                tb_log_name=log_name,
                callback=callback)
    training_time = datetime.now() - start
    print(f"Training time: {training_time}", file=sys.stderr)
    print("Saving final model", file=sys.stderr)
    model.save(f"{models_log_dir}{log_name}_final.pkl")
def run_experiment(args):
    hyperparam_file = os.path.join(HYPERPARAM_DIR, args.agent + ".yml")
    hyperparams = yaml.safe_load(open(hyperparam_file))
    hyperparams = hyperparams[args.env]

    n_envs = hyperparams.pop("n_envs", 1)
    n_timesteps = int(hyperparams.pop("n_timesteps"))
    policy = hyperparams.pop("policy")
    normalize = hyperparams.pop("normalize", None)

    vecEnv = []
    for i in range(n_envs):
        # Bit of trickery here to avoid referencing
        # to the same "i"
        vecEnv.append((lambda idx: lambda: create_env(args, idx))(i))

    if args.subprocenv:
        vecEnv = SubprocVecEnv(vecEnv)
    else:
        vecEnv = DummyVecEnv(vecEnv)

    # Handle learning rates
    # Taken from rl-zoo/train.py
    for key in ['learning_rate', 'cliprange', 'cliprange_vf']:
        if key not in hyperparams or args.agent == "dqn":
            continue
        if key == 'learning_rate' and args.agent == "a2c":
            continue
        if isinstance(hyperparams[key], str):
            schedule, initial_value = hyperparams[key].split('_')
            initial_value = float(initial_value)
            hyperparams[key] = linear_schedule(initial_value)
        elif isinstance(hyperparams[key], (float, int)):
            # Negative value: ignore (ex: for clipping)
            if hyperparams[key] < 0:
                continue
            hyperparams[key] = constfn(float(hyperparams[key]))

    if args.forced_cliprange is not None:
        hyperparams["cliprange"] = args.forced_cliprange

    agent_class = AVAILABLE_ALGORITHMS[args.agent]
    agent = agent_class(policy, vecEnv, verbose=1, **hyperparams)

    # Prepare callback
    checkpoint_dir = os.path.join(args.output, CHECKPOINT_DIR)
    os.makedirs(checkpoint_dir)
    # Note that save_freq is counted in number of agent step-calls,
    # not env step-calls.
    save_freq = n_timesteps // (args.num_snapshots * n_envs)
    checkpoint_callback = CheckpointCallback(save_freq, checkpoint_dir)

    agent.learn(total_timesteps=n_timesteps, callback=checkpoint_callback)

    vecEnv.close()
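
# run_experiment() above relies on linear_schedule() and constfn(), which are
# not shown in this snippet. The sketches below follow the rl-zoo convention
# (an assumption about the original helpers): stable-baselines 2 schedules
# receive the remaining training progress, which goes from 1 at the start of
# training to 0 at the end.
def linear_schedule(initial_value):
    """Return a schedule that decays linearly with remaining progress."""
    def func(progress):
        return progress * initial_value
    return func


def constfn(val):
    """Wrap a constant into a callable, as the schedule interface expects."""
    def func(_):
        return val
    return func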
def __init__(self, version, envs, hours=0, verbose=False, weights=None):
    self.version = version
    self.name = "football-ppo{}".format(version) + "-e{}"
    self.path = "models/football-ppo-{}/".format(version)

    self.defaults = {
        "env_name": "",
        "representation": "simple115",
        "rewards": "scoring",
        "render": False,
        "write_video": False,
        "dump_frequency": 1,
        "extra_players": None,
        "number_of_left_players_agent_controls": 1,
        "number_of_right_players_agent_controls": 0,
        "enable_sides_swap": False,
        "parallel": 1
    }

    # Merge each requested env config with the defaults for any missing keys
    self.configs = list(
        map(
            lambda b: dict(
                map(lambda a: (a[0], a[1] if a[0] not in b.keys() else b[a[0]]),
                    self.defaults.items())), envs))

    # Build one environment constructor per config (repeated "parallel"
    # times) and flatten them into a single list for SubprocVecEnv
    self.training = SubprocVecEnv(
        reduce(
            lambda a, b: a + b,
            list(
                map(
                    lambda config: [
                        lambda: football.create_environment(
                            env_name=config["env_name"],
                            representation=config["representation"],
                            rewards=config["rewards"],
                            render=config["render"],
                            write_video=config["write_video"],
                            dump_frequency=config["dump_frequency"],
                            extra_players=config["extra_players"],
                            number_of_left_players_agent_controls=config["number_of_left_players_agent_controls"],
                            number_of_right_players_agent_controls=config["number_of_right_players_agent_controls"],
                            enable_sides_swap=config["enable_sides_swap"])
                        for _ in range(config["parallel"])
                    ], self.configs)), []))

    self.inputs = self.training.get_attr("observation_space")[0].shape[0]
    self.outputs = self.training.get_attr("action_space")[0].n

    self.verbose = verbose
    if not verbose:
        os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
        deprecation._PRINT_DEPRECATION_WARNINGS = False
        logger = logging.getLogger()
        logger.setLevel(logging.ERROR)

    if weights is None:
        self.model = PPO2(policy=MlpPolicy,
                          env=self.training,
                          verbose=int(self.verbose))
    else:
        self.model = PPO2.load(weights,
                               env=self.training,
                               learning_rate=0.002)

    self.experience = hours * 60
def train():
    n_cpu = os.cpu_count()
    env = SubprocVecEnv([lambda: DemoEnv() for i in range(n_cpu)])
    model = PPO2(MlpPolicy,
                 env,
                 verbose=1,
                 policy_kwargs={'net_arch': [dict(vf=[4], pi=[4])]})
    model.learn(total_timesteps=int(1e6))
    model.save("ppo2_DemoEnv")
    env.close()
    del model
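
# A minimal sketch (not part of the original script) of loading the model
# saved above and running it in a single DemoEnv instance. Assumes DemoEnv
# and PPO2 are importable as in train(); DummyVecEnv avoids spawning
# subprocesses for a single environment.
def enjoy(n_steps=1000):
    from stable_baselines.common.vec_env import DummyVecEnv

    env = DummyVecEnv([lambda: DemoEnv()])
    model = PPO2.load("ppo2_DemoEnv")
    obs = env.reset()
    for _ in range(n_steps):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
    env.close()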
def get_rewards(self, skills=[], train_total_timesteps=5000000,
                eval_times=100, eval_max_steps=10000, model_save_name=None,
                add_info={}):
    # def get_rewards(self, skills=[], train_total_timesteps=10, eval_times=10, eval_max_steps=10, model_save_name=None, add_info={}):
    """
    :param skills: (list) the available action sequences for the agent,
        e.g. [[0,2,2],[0,1,1]]
    :param train_total_timesteps: (int) total_timesteps to train
    :param eval_times: (int) number of evaluation episodes, e.g.
        eval_times=100 evaluates the policy by averaging the reward over
        100 episodes
    :param eval_max_steps: (int) maximum timesteps per episode when evaluating
    :param model_save_name: (str) name of the saved model (should not repeat)
    :param add_info: (dict) other information to log in log.txt
    """
    # env = SkillWrapper(self.env, skills=skills)
    if self.num_cpu > 1:
        env = SubprocVecEnv([
            self.make_env(self.env_creator, i, skills)
            for i in range(self.num_cpu)
        ])
    else:
        env = DummyVecEnv([lambda: self.env_creator()])

    model = self.model(self.policy, env, verbose=self.verbose)
    self.strat_time = time.time()
    print("start to train agent...")
    model.learn(total_timesteps=train_total_timesteps,
                reset_num_timesteps=self.reset_num_timesteps)
    print("Finished training agent")

    if self.save_path is not None:
        if self.preserve_model > 0:
            self.save_model(model, model_save_name, skills=skills)

    # evaluate
    info = self.evaluate(env, model, eval_times, eval_max_steps)
    env.close()

    # log result
    info.update(add_info)
    self.log(info)
    self._serial_num = self._serial_num + 1
    return info["ave_score"], info["ave_action_reward"]
def run_training(config: Dict):
    """Runs training based on config passed in"""
    print("Run configuration:")
    print(config)
    seed(config['seed'])

    # read config
    hyperparameters = read_hyperparameters(config)
    graphs = graphs_from_args(config['graphs'])
    policy, policy_kwargs = policy_from_args(config, graphs)
    demands = demands_from_args(config, graphs)
    env_kwargs = env_kwargs_from_args(config)
    env_name = config['env_name']
    timesteps = config['timesteps']
    parallelism = config['parallelism']
    log_name = config['log_name']
    model_name = config['model_name']
    tensorboard_log = config['tensorboard_log']
    oblivious_routings = None

    # make env
    env = lambda: gym.make(env_name,
                           dm_sequence=demands,
                           graphs=graphs,
                           oblivious_routings=oblivious_routings,
                           **env_kwargs)
    vec_env = SubprocVecEnv([env for _ in range(parallelism)],
                            start_method="spawn")

    # make model
    model = PPO2(policy,
                 vec_env,
                 cliprange_vf=-1,
                 verbose=1,
                 policy_kwargs=policy_kwargs,
                 tensorboard_log=tensorboard_log,
                 **hyperparameters)

    # learn
    if env_name == 'ddr-iterative-v0':
        model.learn(total_timesteps=timesteps, tb_log_name=log_name)
    else:
        model.learn(total_timesteps=timesteps, tb_log_name=log_name)

    # save it
    model.save(model_name)

    # make sure everything stopped correctly
    vec_env.close()
def main():
    #env = SubprocVecEnv([(lambda i=i: SwocGym(i+1, GameServicePath, i, fieldWidth=10, fieldHeight=10, saveEpisode=True)) for i in range(1)])
    env = SubprocVecEnv([
        (lambda i=i: MazeGym(mazeWidth=10, mazeHeight=10, nrWallsToRemove=10))
        for i in range(1)
    ])
    try:
        model = PPO2("MlpPolicy",
                     env,
                     verbose=1,
                     tensorboard_log='/home/ralph/swoc2019/log')
        if SaveFile.exists():
            print('loading...')
            model.load_parameters(SaveFile)
        else:
            print('Warning: No save file loaded')

        print('evaluating...', end='')
        obs = env.reset()
        totalRewards = None
        for i in range(100):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            totalRewards = totalRewards + rewards if totalRewards is not None else rewards
            env.render()
            sleep(0.2)
        print(f'mean reward: {np.mean(totalRewards)}')
    except KeyboardInterrupt:
        print('closing...')
    finally:
        env.close()
        print('closed')
def run():
    torch.multiprocessing.freeze_support()
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
    model = ACKTR(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=25000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
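
# run() above assumes a make_env(env_id, rank) helper that returns an
# environment constructor, as in the standard stable-baselines 2
# multiprocessing example. A minimal sketch of such a helper (the seed
# handling here is an assumption):
import gym
from stable_baselines.common import set_global_seeds


def make_env(env_id, rank, seed=0):
    """Return a thunk that builds one environment instance for SubprocVecEnv."""
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)  # give each worker a distinct seed
        return env
    set_global_seeds(seed)
    return _init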
def __init__(self, _make_env_func, parallel_agents):
    """
    This class instantiates a dynamics model based on the pybullet
    simulator (i.e. it simulates exactly the result of the actions); it can
    be used for reward tuning, verifying tasks, etc.

    :param _make_env_func: (func) a function that, when called, returns a
                           gym environment.
    :param parallel_agents: (int) number of parallel agents to simulate to
                            evaluate the actions.
    """
    self.parallel_agents = parallel_agents
    self.envs = SubprocVecEnv(
        [_make_env_func() for i in range(self.parallel_agents)])
    return
class SimulatorModel(object):

    def __init__(self, _make_env_func, parallel_agents):
        """
        This class instantiates a dynamics model based on the pybullet
        simulator (i.e. it simulates exactly the result of the actions);
        it can be used for reward tuning, verifying tasks, etc.

        :param _make_env_func: (func) a function that, when called, returns
                               a gym environment.
        :param parallel_agents: (int) number of parallel agents to simulate
                                to evaluate the actions.
        """
        self.parallel_agents = parallel_agents
        self.envs = SubprocVecEnv(
            [_make_env_func() for i in range(self.parallel_agents)])
        return

    def evaluate_trajectories(self, action_sequences):
        """
        A function to be called to evaluate the action sequences and return
        the corresponding reward for each sequence.

        :param action_sequences: (nd.array) actions to be evaluated
                                 (number of sequences, horizon length)
        :return: (nd.array) sum of rewards for each action sequence.
        """
        horizon_length = action_sequences.shape[1]
        num_of_particles = action_sequences.shape[0]
        rewards = np.zeros([num_of_particles])
        assert ((float(num_of_particles) / self.parallel_agents).is_integer())
        for j in range(0, num_of_particles, self.parallel_agents):
            self.envs.reset()
            total_reward = np.zeros([self.parallel_agents])
            for k in range(horizon_length):
                actions = action_sequences[j:j + self.parallel_agents, k]
                task_observations, current_reward, done, info = \
                    self.envs.step(actions)
                total_reward += current_reward
            rewards[j:j + self.parallel_agents] = total_reward
        return rewards

    def end_sim(self):
        """
        Closes the environments that were used for simulation.

        :return:
        """
        self.envs.close()
        return
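
# Hypothetical usage sketch of SimulatorModel (not from the original source):
# evaluate a batch of random action sequences, then shut the workers down.
# `make_env_fn` is an assumed environment factory, and the (10, 20) shape
# assumes a one-dimensional (scalar) action space with a horizon of 20 steps.
if __name__ == '__main__':
    import numpy as np

    sim = SimulatorModel(make_env_fn, parallel_agents=5)
    action_sequences = np.random.uniform(-1.0, 1.0, size=(10, 20))
    print(sim.evaluate_trajectories(action_sequences))
    sim.end_sim()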
def train(train_data):
    # The algorithms require a vectorized environment to run
    env_train = SubprocVecEnv(
        [lambda: ExchangeEnv.ExchangeEnv(train_data, 10000, 0)])

    # In the paper a policy with a feed forward network with two hidden
    # layers each consisting of 64 neurons was used
    policy_kwargs = dict(net_arch=[64, 64])

    # From the paper:
    # Lambda = 0.95
    # Clipping parameter = 0.2
    # cvf = 0.5
    # cH = 0.01
    # Adam minibatch = 4
    # Learning rate = 0.00025
    # Trained over 10,000,000 time steps
    model = PPO2(policy=MlpPolicy,
                 env=env_train,
                 policy_kwargs=policy_kwargs,
                 lam=0.95,
                 cliprange=0.2,
                 vf_coef=0.5,
                 ent_coef=0.01,
                 nminibatches=4,
                 learning_rate=0.00025,
                 verbose=1)
    model.learn(total_timesteps=10000000)
    model.save('ppo2_trader')
def train():
    if not os.path.isdir("log/"):
        os.mkdir("log")

    if ENV_COUNT == 1:
        envs = create_env_headless()
        env_id = str(time.time())[-6:]
        envs = Monitor(envs,
                       "log/" + MODEL_NAME + "-" + env_id,
                       allow_early_resets=False)
        vec_envs = DummyVecEnv([lambda: envs])
    else:
        vec_envs = []

        def make_env():
            env_id = str(time.time())[-6:]
            env = create_env_headless()
            return Monitor(env,
                           "log/" + MODEL_NAME + "-" + env_id,
                           allow_early_resets=False)

        for _ in range(ENV_COUNT):
            vec_envs.append(make_env)
        vec_envs = SubprocVecEnv(vec_envs)

    model = PPO2('CnnPolicy', vec_envs, verbose=1, ent_coef=0.0001,
                 n_steps=256)
    model.learn(total_timesteps=TIMESTEPS)
    model.save(MODEL_NAME)
    vec_envs.close()
    print("Learning Done!")
def run_experiment(exp_num, exp_type, variants, n_cpu, step_total, exp_log,
                   log_dict, drive, og_dir):
    model_names = []
    run_path = ''
    for order, variant in enumerate(variants):
        alter_env(exp_type, variant)
        env = gym.make("Real-v0")
        env = Monitor(env, 'tf_save', allow_early_resets=True)
        env = SubprocVecEnv([lambda: env for i in range(n_cpu)])
        if order == 0:
            model = PPO2(MlpPolicy,
                         env,
                         verbose=0,
                         tensorboard_log="./tensorboard_log/",
                         drive=drive,
                         og_dir=og_dir)
        else:
            pydrive_util.download_file(drive, run_path + '/checkpoint')
            load_name = load_checkpoint(-1, run_path)
            pydrive_util.download_file(drive, load_name)
            model = PPO2.load('tmp/tmp_file', env=env, drive=drive,
                              og_dir=og_dir)
        model_names.append(model.model_name)
        run_path = model.graph_dir
        model.learn(total_timesteps=step_total)
        pydrive_util.upload_file(drive, model.checkpoint_log)
        env.close()
        del model, env
    log_experiments(exp_num, exp_type, variants, model_names, exp_log,
                    log_dict, drive)
def train(self, game, state, num_e=1, n_timesteps=25000000, save='default2'):
    self.create_envs(game_name=game, state_name=state, num_env=num_e)
    #self.model = PPO2.load("default2", SubprocVecEnv(self.env_fns), policy=CnnPolicy, tensorboard_log="./sonic/")
    #self.model = PPO2(CnnPolicy, SubprocVecEnv(self.env_fns), learning_rate=1e-5, verbose=1, tensorboard_log="./sonic/")
    self.model = PPO2(policy=CnnPolicy,
                      env=SubprocVecEnv(self.env_fns),
                      n_steps=8192,
                      nminibatches=8,
                      lam=0.95,
                      gamma=0.99,
                      noptepochs=4,
                      ent_coef=0.001,
                      learning_rate=lambda _: 2e-5,
                      cliprange=lambda _: 0.2,
                      verbose=1,
                      tensorboard_log="./sonic/")
    self.model.learn(n_timesteps)
    self.model.save(save)
    self.model.learn(n_timesteps)
    self.model.save(save + '2')
    self.model.learn(n_timesteps)
    self.model.save(save + '3')
    self.model.learn(n_timesteps)
    self.model.save(save + '4')
def get_multi_process_env(model_settings, model_path, num_of_envs, ckpt_step):

    def _make_env(rank):

        def _init():
            task = generate_task(
                model_settings['benchmarks']['task_generator_id'],
                **model_settings['task_configs'])
            env = CausalWorld(task=task,
                              **model_settings['world_params'],
                              seed=model_settings['world_seed'] + rank)
            env = CurriculumWrapper(
                env,
                intervention_actors=model_settings["intervention_actors"],
                actives=model_settings["actives"])
            if ckpt_step is None:
                prefix = 0
            else:
                prefix = ckpt_step
            monitor_file = os.path.join(model_path,
                                        str(rank) + '_' + str(prefix))
            env = Monitor(env,
                          filename=monitor_file,
                          info_keywords=('fractional_success', ))
            return env

        return _init

    return SubprocVecEnv([_make_env(rank=i) for i in range(num_of_envs)])
def train(env_id="highway-v0", num_cpu=4, log_dir=None, n_steps=1e3,
          log_step=100):
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
    model = PPO2("MlpPolicy", env, verbose=1, n_steps=16)
    for i in trange(int(n_steps // log_step)):
        model.learn(total_timesteps=int(log_step))
        model.save(os.path.join(log_dir, f"highway_{i}"))

        # Evaluate the latest checkpoint in a fresh, single environment
        env1 = gym.make(env_id)
        model1 = PPO2.load(os.path.join(log_dir, f"highway_{i}"))
        obs = env1.reset()
        net_reward = 0
        for j in range(1000):
            action, _states = model1.predict(obs)
            # print("Action:", action)
            obs, rewards, dones, info = env1.step(action)
            net_reward += rewards
            print(rewards)
            env1.render()
            if dones:
                file_writer.add_scalar('Episode Reward', net_reward,
                                       i * log_step)
                file_writer.add_scalar('Episode Length', j, i * log_step)
                break
        del env1, model1
def test_lstm_train():
    """Test that LSTM models are able to achieve >=150 (out of 500) reward on
    CartPoleNoVelEnv. This environment requires memory to perform well in."""

    def make_env(i):
        env = CartPoleNoVelEnv()
        env = TimeLimit(env, max_episode_steps=500)
        env = bench.Monitor(env, None, allow_early_resets=True)
        env.seed(i)
        return env

    # Bind i as a default argument so each worker gets its own seed
    env = SubprocVecEnv([lambda i=i: make_env(i) for i in range(NUM_ENVS)])
    env = VecNormalize(env)
    model = PPO2(MlpLstmPolicy, env, n_steps=128, nminibatches=NUM_ENVS,
                 lam=0.95, gamma=0.99, noptepochs=10, ent_coef=0.0,
                 learning_rate=3e-4, cliprange=0.2, verbose=1)

    eprewmeans = []

    def reward_callback(local, _):
        nonlocal eprewmeans
        eprewmeans.append(
            safe_mean([ep_info['r'] for ep_info in local['ep_info_buf']]))

    model.learn(total_timesteps=100000, callback=reward_callback)

    # Maximum episode reward is 500.
    # In CartPole-v1, a non-recurrent policy can easily get >= 450.
    # In CartPoleNoVelEnv, a non-recurrent policy doesn't get more than ~50.
    # LSTM policies can reach above 400, but it varies a lot between runs;
    # they consistently get >= 150.
    # See PR #244 for more detailed benchmarks.
    average_reward = sum(eprewmeans[-NUM_EPISODES_FOR_SCORE:]) / NUM_EPISODES_FOR_SCORE
    assert average_reward >= 150, \
        "Mean reward below 150; per-episode rewards {}".format(average_reward)
def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, sac_config, total_time_steps,
                 validate_every_timesteps, task_name):

    def _make_env(rank):

        def _init():
            task = generate_task(task_generator_id=task_name)
            env = CausalWorld(task=task,
                              skip_frame=skip_frame,
                              enable_visualization=False,
                              seed=seed_num + rank,
                              max_episode_length=maximum_episode_length)
            env = HERGoalEnvWrapper(env)
            return env

        set_global_seeds(seed_num)
        return _init

    os.makedirs(log_relative_path)
    env = SubprocVecEnv([_make_env(rank=i) for i in range(num_of_envs)])
    model = HER('MlpPolicy',
                env,
                SAC,
                verbose=1,
                policy_kwargs=dict(layers=[256, 256, 256]),
                **sac_config)
    save_config_file(sac_config,
                     _make_env(0)(),
                     os.path.join(log_relative_path, 'config.json'))
    for i in range(int(total_time_steps / validate_every_timesteps)):
        model.learn(total_timesteps=validate_every_timesteps,
                    tb_log_name="sac",
                    reset_num_timesteps=False)
        model.save(os.path.join(log_relative_path, 'saved_model'))
    return
def train_agent_with_a2c(load=False):
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common.vec_env import SubprocVecEnv
    from stable_baselines import A2C

    # multiprocess environment
    n_cpu = 4
    env = SubprocVecEnv([lambda: gym.make('F16GCAS-v0') for i in range(n_cpu)])
    env = gym.make("F16GCAS-v0")

    class CustomPolicy(MlpPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs,
                                               layers=[128, 128])

    if not load:
        model = A2C(env=env, verbose=1, policy=CustomPolicy)
        # model.learn(total_timesteps=1000000)
        ExpData = ExpertDataset("./lqr_export.npz")
        model.pretrain(ExpData, n_epochs=100)
    else:
        model = A2C.load(ROOT + "/trained_models/TDRL/f16/a2c/128_128",
                         env=env)
        with model.graph.as_default():
            for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                       scope='model/pi/'):
                print(i)

    return model
def run_model_stablebaseline(flow_params,
                             num_cpus=1,
                             rollout_size=50,
                             num_steps=50):
    """Run the model for num_steps if provided.

    Parameters
    ----------
    num_cpus : int
        number of CPUs used during training
    rollout_size : int
        length of a single rollout
    num_steps : int
        total number of training steps
        The total rollout length is rollout_size.

    Returns
    -------
    stable_baselines.*
        the trained model
    """
    if num_cpus == 1:
        constructor = env_constructor(params=flow_params, version=0)()
        # The algorithms require a vectorized environment to run
        env = DummyVecEnv([lambda: constructor])
    else:
        env = SubprocVecEnv([
            env_constructor(params=flow_params, version=i)
            for i in range(num_cpus)
        ])

    train_model = PPO2('MlpPolicy', env, verbose=1, n_steps=rollout_size)
    train_model.learn(total_timesteps=num_steps)
    return train_model
def main():
    env = SubprocVecEnv([lambda: NetworkEnv() for i in range(100)])
    model = PPO2("CustomPolicy", env, verbose=0, gamma=0.2)
    #model = PPO2.load("NetworkModel", env=env)
    model.learn(total_timesteps=10000000)
    model.save("NetworkModel")
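
# main() above passes the policy to PPO2 by name ("CustomPolicy"), which only
# works if a policy with that name was registered beforehand via
# stable-baselines' register_policy(). A minimal sketch of such a
# registration; the net_arch used here is an assumption, not the original
# project's architecture.
from stable_baselines.common.policies import FeedForwardPolicy, register_policy


class CustomPolicy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomPolicy, self).__init__(*args, **kwargs,
                                           net_arch=[64, 64],
                                           feature_extraction="mlp")


register_policy("CustomPolicy", CustomPolicy)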