Example #1
def test(test_data, model_location):
    # Using a different environment to test the model
    env_test = SubprocVecEnv(
        [lambda: ExchangeEnv.ExchangeEnv(test_data, 10000, 0)])
    model = PPO2.load(model_location)
    obs = env_test.reset()
    done = False

    price_history = []
    portfolio_value = []

    while not done:
        action, _states = model.predict(obs)
        obs, rewards, done, _ = env_test.step(action)

        # Appending the current time step's highest bid
        price_history.append(obs[0][0][0])

        # Appending current portfolio value
        portfolio_value.append(rewards[0])

    with open("price_history.txt", "w") as f:
        writer = csv.writer(f)
        writer.writerow(price_history)

    with open("portfolio_value.txt", "w") as f:
        writer = csv.writer(f)
        writer.writerow(portfolio_value)
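Example #1 writes the price and portfolio histories as single-row CSV files. A small sketch of how those files could be read back and plotted afterwards (file names taken from the code above; matplotlib is assumed to be available):

import csv

import matplotlib.pyplot as plt


def load_row(path):
    # Each file written above contains exactly one CSV row.
    with open(path) as f:
        return [float(value) for value in next(csv.reader(f))]


prices = load_row("price_history.txt")
portfolio = load_row("portfolio_value.txt")

fig, (ax_price, ax_value) = plt.subplots(2, 1, sharex=True)
ax_price.plot(prices)
ax_price.set_ylabel("highest bid")
ax_value.plot(portfolio)
ax_value.set_ylabel("portfolio value")
ax_value.set_xlabel("time step")
plt.show()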
Example #2
    def environment(self, environment: 'BitmexEnvironment'):
        envs = [lambda: environment for _ in range(self._n_env)]

        if self._n_env == 1:
            self._environment = DummyVecEnv(envs)
        else:
            self._environment = SubprocVecEnv(envs)
Example #3
    def train(self, num_e=1, n_timesteps=1000000, save='saves/agent4'):
        env_id = "default"
        num_e = 1  # Number of processes to use
        # Create the vectorized environment
        #env = DummyVecEnv([lambda: env])

        self.env = SubprocVecEnv(
            [self.make_env(env_id, i) for i in range(num_e)])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        #self.model = PPO2(policy=CnnPolicy,
        #env=SubprocVecEnv(self.env_fns),
        #n_steps=8192,
        #nminibatches=8,
        #lam=0.95,
        #gamma=0.99,
        #noptepochs=4,
        #ent_coef=0.001,
        #learning_rate=lambda _: 2e-5,
        #cliprange=lambda _: 0.2,
        #verbose=1,
        #tensorboard_log="./breakorbust")
        self.model = PPO2(CustomPolicy,
                          env=self.env,
                          verbose=0,
                          learning_rate=1e-5,
                          tensorboard_log=save)
        for i in range(10):
            self.model.learn(n_timesteps)
            self.model.save(save)
Example #4
def main():
    env = SubprocVecEnv([(lambda i=i: SwocGym(
        i + 1, GameServicePath, i, actionRepeat=4, oneTarget=True))
                         for i in range(4)])
    try:
        model = PPO2("MlpPolicy",
                     env,
                     verbose=1,
                     policy_kwargs={
                         'net_arch': [256, 256, 256, 128, 128, 128],
                         'act_fun': tf.nn.relu
                     },
                     n_steps=256,
                     ent_coef=0.0,
                     learning_rate=1e-5)
        if SaveFile.exists():
            print('loading...')
            model.load_parameters(SaveFile)
        else:
            print('Warning: No save file loaded')

        print('evaluating...', end='')
        totalRewards = evaluate(env, model)
        print(f'mean reward: {np.mean(totalRewards)}')

    except KeyboardInterrupt:
        print('closing...')
    finally:
        env.close()
    print('closed')
Example #5
def main(mode="train"):

    n_cpu = 2
    env = SubprocVecEnv(
        [lambda: gym.make('balancebot-continuum-v0') for i in range(n_cpu)])

    if mode == "train":
        model = PPO2(policy=MlpPolicy,
                     env=env,
                     learning_rate=1e-3,
                     verbose=0,
                     full_tensorboard_log=False,
                     tensorboard_log="./ppo2_balancebot_tensorboard")

        model.learn(total_timesteps=100000, callback=callback)
        print("Saving model to ppo2_balance_continuum.pkl")
        model.save("ppo2_balance_continuum.pkl")

        del model  # remove to demonstrate saving and loading

    if mode == "test":
        model = PPO2.load("ppo2_balance_continuum.pkl")

        obs = env.reset()
        done = [False, False]
        # env.set_done(5000)
        while not all(done):
            action, _states = model.predict(obs)
            obs, rewards, done, info = env.step(action)
            # env.render()
            print(obs)
Example #6
def _train(env_id, agent, model_params, total_steps, is_evaluation=False):
    if is_evaluation:  # evaluate_policy() must only take one environment
        envs = SubprocVecEnv([make_env(env_id)])
    else:
        envs = SubprocVecEnv([make_env(env_id) for _ in range(NUM_CPU)])
    envs = VecNormalize(
        envs)  # normalize the envs during training and evaluation

    # Load pretrained model during training.
    if not is_evaluation and os.path.exists(agent + '_' + env_id):
        if agent == 'ppo2':
            model = PPO2.load(agent + '_' + env_id)
        elif agent == 'a2c':
            model = A2C.load(agent + '_' + env_id)
    else:
        if agent == 'ppo2':
            model = PPO2(MlpLstmPolicy,
                         envs,
                         nminibatches=1,
                         verbose=1,
                         **model_params)
        elif agent == 'a2c':
            model = A2C(MlpLstmPolicy, envs, verbose=1, **model_params)

    model.learn(total_timesteps=total_steps)
    return envs, model
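The comment above notes that evaluate_policy() only accepts a single environment, which is why the is_evaluation branch builds a one-env SubprocVecEnv. A hedged sketch of how the returned pair might be passed to stable_baselines.common.evaluation.evaluate_policy (the env id, agent name and step count are placeholders; make_env and NUM_CPU are assumed to be defined by the surrounding project):

from stable_baselines.common.evaluation import evaluate_policy

# Train briefly on a single normalized environment, then score the policy.
envs, model = _train('CartPole-v1', 'ppo2', {}, total_steps=10000,
                     is_evaluation=True)
mean_reward, std_reward = evaluate_policy(model, envs, n_eval_episodes=10)
print("mean reward: {:.2f} +/- {:.2f}".format(mean_reward, std_reward))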
Example #7
    def create_envs(self, game_name, state_name, num_env):

        for i in range(num_env):
            self.env_fns.append(
                partial(make_env, game=game_name, state=state_name))
            self.env_names.append(game_name + '-' + state_name)
        self.env = SubprocVecEnv(self.env_fns)
Example #8
def test(model_name, env_name, num_cpu, log_dir):
    env = SubprocVecEnv([
        make_football_env(env_name, i, log_dir, useMonitor=False)
        for i in range(num_cpu)
    ])
    # env = Monitor(env, log_dir, allow_early_resets=True)
    model = get_model(model_name, env, log_dir)

    model = model.load(log_dir + model_name + '_' + env_name, env=env)

    obs = env.reset()
    from matplotlib import pyplot as plt
    show_num = 1
    while True:
        action, _states = model.predict(obs)
        # obs, rewards, done, info = env.step([int(input('action:'))]*num_cpu)
        obs, rewards, done, info = env.step(action)
        img = obs[show_num, :, :, :]
        fig = plt.figure(0)
        plt.clf()
        plt.imshow(img / 255)
        fig.canvas.draw()

        # env.render()
        plt.pause(0.000001)
Example #9
def generate(parameter_distribution,
             num_episodes,
             env_update_fn,
             filepath=None,
             n_cpu=6):
    env_name = 'CartPole-v1'
    model_dir = os.path.join(os.getcwd(), 'models')
    model_path = os.path.join(model_dir, 'ppo2_' + env_name + '.pkl')

    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    def make_env(env_name):
        env = gym.make(env_name)
        return env

    env = SubprocVecEnv([lambda: make_env(env_name) for i in range(n_cpu)])

    try:
        model = PPO2.load(model_path)
    except Exception as e:
        trainer = CartPoleTrainer(env)
        model = trainer.train(model_path)

    obs = env.reset()

    env = make_env(env_name)

    states, actions, next_states, parameters, steps = [], [], [], [], []

    for ep in range(num_episodes):
        obs = env.reset()
        params = parameter_distribution()
        env_update_fn(env.unwrapped, params)

        done = False
        step = 0
        while not done:
            action, _states = model.predict(obs)
            states.append(obs)
            actions.append([action])
            obs, reward, done, info = env.step(action)
            next_states.append(obs)
            parameters.append(params)
            steps.append(step)
            step += 1

    data = {
        'states': np.array(states),
        'actions': np.array(actions),
        'next_states': np.array(next_states),
        'parameters': np.array(parameters),
        'steps': np.array(steps)
    }
    if filepath:
        print('filepath: ', filepath)
        with open(filepath, 'wb') as f:
            np.save(f, data)  # write through the opened handle so the data lands exactly at filepath

    return data
Example #10
def launch_training(nb_cpu, name_agent, name_env, total_timesteps, text):

    env_name = name_env
    #n_cpu = 8
    n_cpu = nb_cpu

    policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[512,512])

    print('TB available at := ',tensorboard_log_dir, file=sys.stderr)
    if name_agent == 'A2C':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir, allow_early_resets=True)

        env = SubprocVecEnv([lambda: env_ for i in range(n_cpu)])
        model = A2C(MlpPolicy, env, n_steps=20, gamma=0.9, verbose=1,
                    tensorboard_log=tensorboard_log_dir,
                    policy_kwargs=policy_kwargs)
        #model = A2C.load("first_test")
        model_name = "A2C_default_Mlp" + text
    elif name_agent == 'PPO2':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir, allow_early_resets=True)

        env = SubprocVecEnv([lambda: env_ for i in range(n_cpu)])
        model = PPO2(MlpPolicy, env, n_steps=80, gamma=0.97, verbose=1,
                     tensorboard_log=tensorboard_log_dir,
                     policy_kwargs=policy_kwargs)
        #model = A2C.load("first_test")
        model_name = "PPO2_default_Mlp" + text
    elif name_agent == 'TRPO':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir, allow_early_resets=True)

        env = DummyVecEnv([lambda: env_ for i in range(n_cpu)])

        model = TRPO(MlpPolicy, env, gamma=0.1, verbose=1,
                     tensorboard_log=tensorboard_log_dir,
                     policy_kwargs=policy_kwargs)
        #model = A2C.load("first_test")
        model_name = "TRPO_default_Mlp" + text


    time = datetime.now().strftime('%Y-%m-%d_%H_%M_%S')

    log_name = f"_model={model_name}_time={time}"
    print('with the following line := ','tensorboard --logdir ',tensorboard_log_dir+log_name)
    training_log = open(f"{console_log_dir}/{log_name}.log", "a")
    sys.stdout = training_log
    logging.basicConfig(level=logging.INFO, filename=f"{console_log_dir}/{log_name}.log", datefmt='%H:%M:%S',
                        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s')
    model_file_name = f"{models_log_dir}{log_name}_best.pkl"


    start = datetime.now()
    print("Learning model", file=sys.stderr)

    model.learn(total_timesteps=int(total_timesteps), tb_log_name=log_name, callback=callback)

    training_time = datetime.now() - start
    print(f"Training time: {training_time}", file=sys.stderr)

    print("Saving final model", file=sys.stderr)
    model.save(f"{models_log_dir}{log_name}_final.pkl")
Example #11
def run_experiment(args):
    hyperparam_file = os.path.join(HYPERPARAM_DIR, args.agent + ".yml")
    hyperparams = yaml.safe_load(open(hyperparam_file))

    hyperparams = hyperparams[args.env]

    n_envs = hyperparams.pop("n_envs", 1)
    n_timesteps = int(hyperparams.pop("n_timesteps"))
    policy = hyperparams.pop("policy")
    normalize = hyperparams.pop("normalize", None)

    vecEnv = []
    for i in range(n_envs):
        # Bit of trickery here to avoid every closure
        # referencing the same "i"
        vecEnv.append((lambda idx: lambda: create_env(args, idx))(i))

    if args.subprocenv:
        vecEnv = SubprocVecEnv(vecEnv)
    else:
        vecEnv = DummyVecEnv(vecEnv)

    # Handle learning rates
    # Taken from rl-zoo/train.py
    for key in ['learning_rate', 'cliprange', 'cliprange_vf']:
        if key not in hyperparams or args.agent == "dqn":
            continue
        if key == 'learning_rate' and args.agent == "a2c":
            continue
        if isinstance(hyperparams[key], str):
            schedule, initial_value = hyperparams[key].split('_')
            initial_value = float(initial_value)
            hyperparams[key] = linear_schedule(initial_value)
        elif isinstance(hyperparams[key], (float, int)):
            # Negative value: ignore (ex: for clipping)
            if hyperparams[key] < 0:
                continue
            hyperparams[key] = constfn(float(hyperparams[key]))

    if args.forced_cliprange is not None:
        hyperparams["cliprange"] = args.forced_cliprange

    agent_class = AVAILABLE_ALGORITHMS[args.agent]
    agent = agent_class(policy, vecEnv, verbose=1, **hyperparams)

    # Prepare callback
    checkpoint_dir = os.path.join(args.output, CHECKPOINT_DIR)
    os.makedirs(checkpoint_dir)
    # Note that save_freq is counted in number of agent step-calls,
    # not env step-calls.
    save_freq = n_timesteps // (args.num_snapshots * n_envs)

    checkpoint_callback = CheckpointCallback(save_freq, checkpoint_dir)

    agent.learn(total_timesteps=n_timesteps, callback=checkpoint_callback)

    vecEnv.close()
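Two workarounds for the same closure pitfall appear in these examples: the lambda i=i default argument in Example #4 and the nested-lambda factory above. A plain lambda: create_env(args, i) inside the loop would capture the loop variable itself rather than its current value, so every worker would build the environment for the last index. A minimal sketch of the pitfall and both fixes (make_env here is a hypothetical single-argument factory, not one of the helpers used above):

import gym

from stable_baselines.common.vec_env import SubprocVecEnv


def make_env(rank):
    # Hypothetical factory: build and seed one environment for the given rank.
    env = gym.make('CartPole-v1')
    env.seed(rank)
    return env


# Buggy: every lambda closes over the variable i itself, so all four
# workers would call make_env(3).
bad_fns = [lambda: make_env(i) for i in range(4)]

# Fix 1: bind the current value of i as a default argument (Example #4).
fns_default_arg = [lambda i=i: make_env(i) for i in range(4)]

# Fix 2: an outer function freezes the index (run_experiment above).
fns_factory = [(lambda idx: (lambda: make_env(idx)))(i) for i in range(4)]

if __name__ == '__main__':
    env = SubprocVecEnv(fns_default_arg)
    env.close()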
Example #12
 def __init__(self, version, envs, hours = 0, verbose = False, weights = None):
     
     self.version = version
     self.name = "football-ppo{}".format(version) + "-e{}"
     self.path = "models/football-ppo-{}/".format(version)
     
     self.defaults = {
         "env_name": "",
         "representation": "simple115",
         "rewards": "scoring",
         "render": False,
         "write_video": False,
         "dump_frequency": 1,
         "extra_players": None,
         "number_of_left_players_agent_controls": 1,
         "number_of_right_players_agent_controls": 0,
         "enable_sides_swap": False,
         "parallel": 1
     }
     
     self.configs = list(map(lambda b: dict(map(lambda a: (a[0], a[1] if a[0] not in b.keys() else b[a[0]]), self.defaults.items())), envs))
     
     self.training = SubprocVecEnv(reduce(lambda a, b: a + b, list(map(lambda config: [
     
         lambda: football.create_environment(
             env_name = config["env_name"],
             representation = config["representation"],
             rewards = config["rewards"],
             render = config["render"],
             write_video = config["write_video"],
             dump_frequency = config["dump_frequency"],
             extra_players = config["extra_players"],
             number_of_left_players_agent_controls = config["number_of_left_players_agent_controls"],
             number_of_right_players_agent_controls = config["number_of_right_players_agent_controls"],
             enable_sides_swap = config["enable_sides_swap"]
         ) for _ in range(config["parallel"])
     
     ], self.configs)), []))
     
     self.inputs = self.training.get_attr("observation_space")[0].shape[0]
     self.outputs = self.training.get_attr("action_space")[0].n
     
     self.verbose = verbose
     
     if not verbose:
         os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" 
         deprecation._PRINT_DEPRECATION_WARNINGS = False
         logger = logging.getLogger()
         logger.setLevel(logging.ERROR)
     
     if weights is None:
         self.model = PPO2(policy = MlpPolicy, env = self.training, verbose = int(self.verbose))
     else:
         self.model = PPO2.load(weights, env = self.training, learning_rate = 0.002)
 
     self.experience = hours * 60
Example #13
def train():
    n_cpu = os.cpu_count()
    env = SubprocVecEnv([lambda: DemoEnv() for i in range(n_cpu)])
    model = PPO2(MlpPolicy,
                 env,
                 verbose=1,
                 policy_kwargs={'net_arch': [dict(vf=[4], pi=[4])]})
    model.learn(total_timesteps=int(1e6))
    model.save("ppo2_DemoEnv")
    env.close()
    del model
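Example #13 only saves the trained agent. A minimal hedged sketch of the matching load-and-predict step (DemoEnv is the project's own environment and is assumed to be importable; a single DummyVecEnv is enough for inference):

from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv

# DemoEnv as used in the training snippet above (assumed importable).
env = DummyVecEnv([lambda: DemoEnv()])
model = PPO2.load("ppo2_DemoEnv", env=env)

obs = env.reset()
for _ in range(100):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
env.close()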
Example #14
    def get_rewards(self,
                    skills=[],
                    train_total_timesteps=5000000,
                    eval_times=100,
                    eval_max_steps=10000,
                    model_save_name=None,
                    add_info={}):
        # def get_rewards(self, skills=[], train_total_timesteps=10, eval_times=10, eval_max_steps=10, model_save_name=None, add_info={}):
        """
        
        :param skills: (list) the availiable action sequence for agent 
        e.g [[0,2,2],[0,1,1]]
        :param train_total_timesteps: (int)total_timesteps to train 
        :param eval_times: (int)the evaluation times
        e.g eval_times=100, evalulate the policy by averageing the reward of 100 episode
        :param eval_max_steps: (int)maximum timesteps per episode when evaluate
        :param model_save_name: (str)specify the name of saved model (should not repeat)
        :param add_info: (dict) other information to log in log.txt
        """

        # env = SkillWrapper(self.env, skills=skills)
        if self.num_cpu > 1:
            env = SubprocVecEnv([
                self.make_env(self.env_creator, i, skills)
                for i in range(self.num_cpu)
            ])
        else:
            env = DummyVecEnv([lambda: self.env_creator()])
        model = self.model(self.policy, env, verbose=self.verbose)

        self.start_time = time.time()
        print("start to train agent...")
        model.learn(total_timesteps=train_total_timesteps,
                    reset_num_timesteps=self.reset_num_timesteps)
        print("Finish train agent")

        if self.save_path is not None:
            if self.preserve_model > 0:
                self.save_model(model, model_save_name, skills=skills)

        # evaluate
        info = self.evaluate(env, model, eval_times, eval_max_steps)
        env.close()

        #log result
        info.update(add_info)
        self.log(info)

        self._serial_num = self._serial_num + 1
        return info["ave_score"], info["ave_action_reward"]
Example #15
def run_training(config: Dict):
    """Runs training based on config passed in"""
    print("Run configuration:")
    print(config)
    seed(config['seed'])

    # read config
    hyperparameters = read_hyperparameters(config)
    graphs = graphs_from_args(config['graphs'])
    policy, policy_kwargs = policy_from_args(config, graphs)
    demands = demands_from_args(config, graphs)
    env_kwargs = env_kwargs_from_args(config)
    env_name = config['env_name']
    timesteps = config['timesteps']
    parallelism = config['parallelism']
    log_name = config['log_name']
    model_name = config['model_name']
    tensorboard_log = config['tensorboard_log']

    oblivious_routings = None

    # make env
    env = lambda: gym.make(env_name,
                           dm_sequence=demands,
                           graphs=graphs,
                           oblivious_routings=oblivious_routings,
                           **env_kwargs)
    vec_env = SubprocVecEnv([env for _ in range(parallelism)],
                            start_method="spawn")

    # make model
    model = PPO2(policy,
                 vec_env,
                 cliprange_vf=-1,
                 verbose=1,
                 policy_kwargs=policy_kwargs,
                 tensorboard_log=tensorboard_log,
                 **hyperparameters)

    # learn
    if env_name == 'ddr-iterative-v0':
        model.learn(total_timesteps=timesteps, tb_log_name=log_name)
    else:
        model.learn(total_timesteps=timesteps, tb_log_name=log_name)

    # save it
    model.save(model_name)

    # make sure everything stopped correctly
    vec_env.close()
Example #16
def main():
    #env = SubprocVecEnv([(lambda i=i: SwocGym(i+1, GameServicePath, i, fieldWidth=10, fieldHeight=10, saveEpisode=True)) for i in range(1)])
    env = SubprocVecEnv([
        (lambda i=i: MazeGym(mazeWidth=10, mazeHeight=10, nrWallsToRemove=10))
        for i in range(1)
    ])
    try:
        model = PPO2("MlpPolicy",
                     env,
                     verbose=1,
                     tensorboard_log='/home/ralph/swoc2019/log')
        if SaveFile.exists():
            print('loading...')
            model.load_parameters(SaveFile)
        else:
            print('Warning: No save file loaded')

        print('evaluating...', end='')
        obs = env.reset()
        totalRewards = None
        for i in range(100):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            totalRewards = totalRewards + rewards if totalRewards is not None else rewards
            env.render()
            sleep(0.2)
        print(f'mean reward: {np.mean(totalRewards)}')

    except KeyboardInterrupt:
        print('closing...')
    finally:
        env.close()
    print('closed')
Example #17
def run():
    torch.multiprocessing.freeze_support()
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    model = ACKTR(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=25000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
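Example #17 (like Example #25) calls a make_env helper that is not shown. In the stable-baselines multiprocessing guide this helper is a rank-seeded factory that returns a thunk; a sketch of what it presumably looks like here:

import gym

from stable_baselines.common import set_global_seeds


def make_env(env_id, rank, seed=0):
    """Return a thunk that builds and seeds one environment instance."""
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init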
Example #18
    def __init__(self, _make_env_func, parallel_agents):
        """
        This class instantiates a dynamics model based on the pybullet simulator
        (i.e. it simulates exactly the result of the actions); it can be used
        for reward tuning, verifying tasks, etc.

        :param _make_env_func: (func) a function that, when called, returns a gym
                                      environment.
        :param parallel_agents: (int) number of parallel agents to simulate
                                      in order to evaluate the actions.
        """
        self.parallel_agents = parallel_agents
        self.envs = SubprocVecEnv(
            [_make_env_func() for i in range(self.parallel_agents)])
        return
Example #19
class SimulatorModel(object):
    def __init__(self, _make_env_func, parallel_agents):
        """
        This class instantiates a dynamics model based on the pybullet simulator
        (i.e. it simulates exactly the result of the actions); it can be used
        for reward tuning, verifying tasks, etc.

        :param _make_env_func: (func) a function that, when called, returns a gym
                                      environment.
        :param parallel_agents: (int) number of parallel agents to simulate
                                      in order to evaluate the actions.
        """
        self.parallel_agents = parallel_agents
        self.envs = SubprocVecEnv(
            [_make_env_func() for i in range(self.parallel_agents)])
        return

    def evaluate_trajectories(self, action_sequences):
        """
        A function to be called to evaluate the action sequences and return
        the corresponding reward for each sequence.

        :param action_sequences: (nd.array) actions to be evaluated
                                            (number of sequences, horizon length)
        :return: (nd.array) sum of rewards for each action sequence.
        """
        horizon_length = action_sequences.shape[1]
        num_of_particles = action_sequences.shape[0]
        rewards = np.zeros([num_of_particles])
        assert ((float(num_of_particles) / self.parallel_agents).is_integer())
        for j in range(0, num_of_particles, self.parallel_agents):
            self.envs.reset()
            total_reward = np.zeros([self.parallel_agents])
            for k in range(horizon_length):
                actions = action_sequences[j:j + self.parallel_agents, k]
                task_observations, current_reward, done, info = \
                    self.envs.step(actions)
                total_reward += current_reward
            rewards[j:j + self.parallel_agents] = total_reward
        return rewards

    def end_sim(self):
        """
        Closes the environments that were used for simulation.
        :return:
        """
        self.envs.close()
        return
Example #20
def train(train_data):
    # The algorithms require a vectorized environment to run
    env_train = SubprocVecEnv(
        [lambda: ExchangeEnv.ExchangeEnv(train_data, 10000, 0)])

    # In the paper a policy with a feed forward network with two hidden layers each consisting of 64 neurons was used
    policy_kwargs = dict(net_arch=[64, 64])

    # From the paper:
    #   Lambda = 0.95
    #   Clipping parameter = 0.2
    #   cvf = 0.5
    #   cH = 0.01
    #   Adam minibatch = 4
    #   Learning rate = 0.00025
    #   Trained over 10,000,000 time steps
    model = PPO2(policy=MlpPolicy,
                 env=env_train,
                 policy_kwargs=policy_kwargs,
                 lam=0.95,
                 cliprange=0.2,
                 vf_coef=0.5,
                 ent_coef=0.01,
                 nminibatches=4,
                 learning_rate=0.00025,
                 verbose=1)

    model.learn(total_timesteps=10000000)
    model.save('ppo2_trader')
Example #21
def train():
    if not os.path.isdir("log/"):
        os.mkdir("log")

    if ENV_COUNT == 1:
        envs = create_env_headless()
        env_id = str(time.time())[-6:]
        envs = Monitor(envs,
                       "log/" + MODEL_NAME + "-" + env_id,
                       allow_early_resets=False)
        vec_envs = DummyVecEnv([lambda: envs])
    else:
        vec_envs = []

        def make_env():
            env_id = str(time.time())[-6:]
            env = create_env_headless()
            return Monitor(env,
                           "log/" + MODEL_NAME + "-" + env_id,
                           allow_early_resets=False)

        for _ in range(ENV_COUNT):
            vec_envs.append(make_env)
        vec_envs = SubprocVecEnv(vec_envs)

    model = PPO2('CnnPolicy',
                 vec_envs,
                 verbose=1,
                 ent_coef=0.0001,
                 n_steps=256)
    model.learn(total_timesteps=TIMESTEPS)
    model.save(MODEL_NAME)
    vec_envs.close()

    print("Learning Done!")
Example #22
def run_experiment(exp_num, exp_type, variants, n_cpu, step_total, exp_log,
                   log_dict, drive, og_dir):
    model_names = []
    run_path = ''
    for order, variant in enumerate(variants):
        alter_env(exp_type, variant)
        env = gym.make("Real-v0")
        env = Monitor(env, 'tf_save', allow_early_resets=True)
        env = SubprocVecEnv([lambda: env for i in range(n_cpu)])
        if order == 0:
            model = PPO2(MlpPolicy,
                         env,
                         verbose=0,
                         tensorboard_log="./tensorboard_log/",
                         drive=drive,
                         og_dir=og_dir)
        else:
            pydrive_util.download_file(drive, run_path + '/checkpoint')
            load_name = load_checkpoint(-1, run_path)
            pydrive_util.download_file(drive, load_name)
            model = PPO2.load('tmp/tmp_file',
                              env=env,
                              drive=drive,
                              og_dir=og_dir)
        model_names.append(model.model_name)
        run_path = model.graph_dir
        model.learn(total_timesteps=step_total)
        pydrive_util.upload_file(drive, model.checkpoint_log)
        env.close()
        del model, env
    log_experiments(exp_num, exp_type, variants, model_names, exp_log,
                    log_dict, drive)
Example #23
    def train(self,
              game,
              state,
              num_e=1,
              n_timesteps=25000000,
              save='default2'):
        self.create_envs(game_name=game, state_name=state, num_env=num_e)
        #self.model = PPO2.load("default2", SubprocVecEnv(self.env_fns), policy=CnnPolicy, tensorboard_log="./sonic/" )
        #self.model = PPO2(CnnPolicy, SubprocVecEnv(self.env_fns), learning_rate=1e-5, verbose=1,tensorboard_log="./sonic/" )

        self.model = PPO2(policy=CnnPolicy,
                          env=SubprocVecEnv(self.env_fns),
                          n_steps=8192,
                          nminibatches=8,
                          lam=0.95,
                          gamma=0.99,
                          noptepochs=4,
                          ent_coef=0.001,
                          learning_rate=lambda _: 2e-5,
                          cliprange=lambda _: 0.2,
                          verbose=1,
                          tensorboard_log="./sonic/")
        self.model.learn(n_timesteps)
        self.model.save(save)
        self.model.learn(n_timesteps)
        self.model.save(save + '2')
        self.model.learn(n_timesteps)
        self.model.save(save + '3')
        self.model.learn(n_timesteps)
        self.model.save(save + '4')
Example #24
def get_multi_process_env(model_settings, model_path, num_of_envs, ckpt_step):
    def _make_env(rank):
        def _init():
            task = generate_task(
                model_settings['benchmarks']['task_generator_id'],
                **model_settings['task_configs'])
            env = CausalWorld(task=task,
                              **model_settings['world_params'],
                              seed=model_settings['world_seed'] + rank)
            env = CurriculumWrapper(
                env,
                intervention_actors=model_settings["intervention_actors"],
                actives=model_settings["actives"])
            if ckpt_step is None:
                prefix = 0
            else:
                prefix = ckpt_step
            monitor_file = os.path.join(model_path,
                                        str(rank) + '_' + str(prefix))
            env = Monitor(env,
                          filename=monitor_file,
                          info_keywords=('fractional_success', ))

            return env

        return _init

    return SubprocVecEnv([_make_env(rank=i) for i in range(num_of_envs)])
Example #25
def train(env_id="highway-v0",
          num_cpu=4,
          log_dir=None,
          n_steps=1e3,
          log_step=100):

    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
    model = PPO2("MlpPolicy", env, verbose=1, n_steps=16)

    for i in trange(int(n_steps // log_step)):
        model.learn(total_timesteps=int(log_step))
        model.save(os.path.join(log_dir, f"highway_{i}"))

        env1 = gym.make(env_id)
        model1 = PPO2.load(os.path.join(log_dir, f"highway_{i}"))
        obs = env1.reset()
        net_reward = 0
        for j in range(1000):
            action, _states = model1.predict(obs)
            # print("Action:",action)
            obs, rewards, dones, info = env1.step(action)
            net_reward += rewards
            print("rewards")
            env1.render()
            if dones:
                file_writer.add_scalar('Episode Reward', net_reward,
                                       i * log_step)
                file_writer.add_scalar('Episode Length', j, i * log_step)
                break

        del env1, model1
Example #26
def test_lstm_train():
    """Test that LSTM models are able to achieve >=150 (out of 500) reward on CartPoleNoVelEnv.

    This environment requires memory to perform well in."""
    def make_env(i):
        env = CartPoleNoVelEnv()
        env = TimeLimit(env, max_episode_steps=500)
        env = bench.Monitor(env, None, allow_early_resets=True)
        env.seed(i)
        return env

    env = SubprocVecEnv([lambda: make_env(i) for i in range(NUM_ENVS)])
    env = VecNormalize(env)
    model = PPO2(MlpLstmPolicy, env, n_steps=128, nminibatches=NUM_ENVS, lam=0.95, gamma=0.99,
                 noptepochs=10, ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, verbose=1)

    eprewmeans = []
    def reward_callback(local, _):
        nonlocal eprewmeans
        eprewmeans.append(safe_mean([ep_info['r'] for ep_info in local['ep_info_buf']]))

    model.learn(total_timesteps=100000, callback=reward_callback)

    # Maximum episode reward is 500.
    # In CartPole-v1, a non-recurrent policy can easily get >= 450.
    # In CartPoleNoVelEnv, a non-recurrent policy doesn't get more than ~50.
    # LSTM policies can reach above 400, but it varies a lot between runs; consistently get >=150.
    # See PR #244 for more detailed benchmarks.

    average_reward = sum(eprewmeans[-NUM_EPISODES_FOR_SCORE:]) / NUM_EPISODES_FOR_SCORE
    assert average_reward >= 150, "Mean reward below 150; got mean reward {}".format(average_reward)
Example #27
def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, sac_config, total_time_steps,
                 validate_every_timesteps, task_name):
    def _make_env(rank):
        def _init():
            task = generate_task(task_generator_id=task_name)
            env = CausalWorld(task=task,
                              skip_frame=skip_frame,
                              enable_visualization=False,
                              seed=seed_num + rank,
                              max_episode_length=maximum_episode_length)
            env = HERGoalEnvWrapper(env)
            return env

        set_global_seeds(seed_num)
        return _init

    os.makedirs(log_relative_path)
    env = SubprocVecEnv([_make_env(rank=i) for i in range(num_of_envs)])
    model = HER('MlpPolicy',
                env,
                SAC,
                verbose=1,
                policy_kwargs=dict(layers=[256, 256, 256]),
                **sac_config)
    save_config_file(sac_config,
                     _make_env(0)(),
                     os.path.join(log_relative_path, 'config.json'))
    for i in range(int(total_time_steps / validate_every_timesteps)):
        model.learn(total_timesteps=validate_every_timesteps,
                    tb_log_name="sac",
                    reset_num_timesteps=False)
    model.save(os.path.join(log_relative_path, 'saved_model'))
    return
Example #28
def train_agent_with_a2c(load=False):
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common.vec_env import SubprocVecEnv
    from stable_baselines import A2C

    # multiprocess environment
    n_cpu = 4
    env = SubprocVecEnv([lambda: gym.make('F16GCAS-v0') for i in range(n_cpu)])
    env = gym.make("F16GCAS-v0")

    class CustomPolicy(MlpPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs,
                                               layers=[128, 128])
    if not load:
        model = A2C(env=env, verbose=1, policy=CustomPolicy)
        # model.learn(total_timesteps=1000000)
        ExpData = ExpertDataset("./lqr_export.npz")
        model.pretrain(ExpData, n_epochs=100)
    else:
        model = A2C.load(ROOT+"/trained_models/TDRL/f16/a2c/128_128", env=env)
        with model.graph.as_default():
            for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/pi/'):
                print(i)

    return model
Example #29
def run_model_stablebaseline(flow_params,
                             num_cpus=1,
                             rollout_size=50,
                             num_steps=50):
    """Run the model for num_steps if provided.

    Parameters
    ----------
    num_cpus : int
        number of CPUs used during training
    rollout_size : int
        length of a single rollout
    num_steps : int
        total number of training steps
    The total rollout length is rollout_size.

    Returns
    -------
    stable_baselines.*
        the trained model
    """
    if num_cpus == 1:
        constructor = env_constructor(params=flow_params, version=0)()
        # The algorithms require a vectorized environment to run
        env = DummyVecEnv([lambda: constructor])
    else:
        env = SubprocVecEnv([
            env_constructor(params=flow_params, version=i)
            for i in range(num_cpus)
        ])

    train_model = PPO2('MlpPolicy', env, verbose=1, n_steps=rollout_size)
    train_model.learn(total_timesteps=num_steps)
    return train_model
Example #30
def main():

    env = SubprocVecEnv([lambda: NetworkEnv() for i in range(100)])
    model = PPO2("CustomPolicy", env, verbose=0, gamma=0.2)  #)
    #model = PPO2.load("NetworkModel",env=env)
    model.learn(total_timesteps=10000000)
    model.save("NetworkModel")