Example #1
def attention_render(model_name, env_name, num_cpu, log_dir):
    if not os.path.exists(log_dir):
        raise FileNotFoundError('log_dir does not exist')

    env_id = env_name + 'NoFrameskip-v4'
    env = SubprocVecEnv([make_env(env_id, i, log_dir) for i in range(num_cpu)])
    # env = Monitor(env, log_dir, allow_early_resets=True)

    if model_name == 'A2C_Attention':
        model = A2C(AttentionPolicy,
                    env,
                    verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C_Attention2':
        model = A2C(Attention2Policy,
                    env,
                    verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C':
        model = A2C(LstmPolicy,
                    env,
                    verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    else:
        raise ValueError('Unknown model name: {}'.format(model_name))
    model = model.load(log_dir + model_name + '_' + env_name, env=env)

    obs = env.reset()
    # print(env.observation_space)
    # cv2.imshow('test', RGB2BGR(obs[0]))
    # cv2.waitKey(0)
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        attentions = model.get_attention(obs, _states, done)[0]
        attentions_img = []
        # print('attention', np.array(attention).shape)
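        # Each per-head attention vector is assumed to cover a 10x-downsampled
        # grid of the observation; reshape it and tile it by 10 along both axes
        # so it can be displayed at roughly observation resolution.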
        for i, attention in enumerate(attentions):
            attention = np.array(attention)
            attention = np.reshape(attention, [
                env.observation_space.shape[0] // 10,
                env.observation_space.shape[1] // 10, 1
            ])
            attention = np.repeat(attention, [10] * attention.shape[0], axis=0)
            attention = np.repeat(attention, [10] * attention.shape[1], axis=1)
            attention = attention * 255
            attentions_img.append(attention)
            # print(np.sum(attention))
        attentions = tile_images(attentions_img)
        cv2.imshow('attention', attentions)
        cv2.waitKey(1)
        # break
        env.render()
    return model
Example #2
    def build_model(self):
        if self.is_stack:
            if self.game_type == "box":
                self.env = DummyVecEnv([lambda: self.env])
                self.model = A2C(MlpPolicy, self.env, verbose=0, gamma=self.gamma,
                                 learning_rate=self.actor_lr, ent_coef=self.c2,
                                 vf_coef=self.critic_lr)
            if self.game_type == "atari":
                self.model = A2C(CnnPolicy, self.env, verbose=0, gamma=self.gamma,
                                 learning_rate=self.actor_lr, ent_coef=self.c2,
                                 vf_coef=self.critic_lr)
        else:
            if self.game_type == "box":
                self.env = DummyVecEnv([lambda: self.env])
                self.model = A2C(MlpPolicy, self.env, verbose=0, gamma=self.gamma,
                                 learning_rate=self.actor_lr, ent_coef=self.c2,
                                 vf_coef=self.critic_lr)
            if self.game_type == "atari":
                self.model = A2C(CnnLstmPolicy, self.env, verbose=0, gamma=self.gamma,
                                 learning_rate=self.actor_lr, ent_coef=self.c2,
                                 vf_coef=self.critic_lr)
Example #3
def get_model(model_name, env, log_dir):
    if model_name == "A2C_DualAttention":
        model = A2C(DualAttentionLstmPolicy, env, verbose=1)
    elif model_name == "A2C_SelfAttention_Cin":
        model = A2C(SelfAttentionCinLstmPolicy, env, verbose=1)
    elif model_name == "A2C_SelfAttention":
        model = A2C(SelfAttentionLstmPolicy, env, verbose=1)
    elif model_name == 'A2C_Attention':
        model = A2C(AttentionPolicy,
                    env,
                    verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C_Attention2':
        model = A2C(Attention2Policy,
                    env,
                    verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C_Attention3':
        model = A2C(Attention3Policy, env, verbose=1)
    elif model_name == 'A2C_Attention4':
        model = A2C(Attention4Policy, env, verbose=1)
    elif model_name == 'A2C':
        model = A2C(CnnLstmPolicy, env, verbose=1)
    else:
        raise ValueError('{} does not exist'.format(model_name))
    return model
Example #4
def a2c(env, seed):
    return A2C('MlpPolicy',
               env,
               learning_rate=0.001,
               verbose=1,
               tensorboard_log="./data/runs",
               seed=seed)
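A minimal usage sketch for this helper (assumes gym and stable_baselines are installed; CartPole is only an illustrative env):

import gym

env = gym.make('CartPole-v1')      # any Gym env; stable-baselines wraps it in a DummyVecEnv
model = a2c(env, seed=0)
model.learn(total_timesteps=10000)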
Example #5
def get_a2c(vec_env=None,
            policy='CnnPolicy',
            learning_rate=7e-4,
            momentum=0.0,
            alpha=0.99,
            epsilon=1e-5,
            max_grad_norm=0.5,
            lr_schedule='constant') -> A2C:
    """
    Parameters' default values are taken from stable_baselines.a2c.a2c.py
    """
    if vec_env is None:
        vec_env = create_training_env(1)
    return A2C(policy=policy,
               env=vec_env,
               gamma=0.99,
               n_steps=5,
               vf_coef=0.25,
               ent_coef=0.01,
               max_grad_norm=max_grad_norm,
               learning_rate=learning_rate,
               alpha=alpha,
               momentum=momentum,
               epsilon=epsilon,
               lr_schedule=lr_schedule,
               verbose=2)
Example #6
def model_training_learning(env_train, model_name, timesteps=100000):

    # train model
    os.chdir("./model_saved/" + model_name)
    start = time.time()
    print("Train ", model_name, " Model with MlpPolicy: ")

    if model_name == "A2C_Model":
        model = A2C('MlpPolicy', env_train, verbose=0)
    elif model_name == "PPO_Model":
        model = PPO2('MlpPolicy', env_train, verbose=0)
    elif model_name == "TD3_Model":
        model = TD3('MlpPolicy', env_train, verbose=0)
    elif model_name == "SAC_Model":
        model = SAC('MlpPolicy', env_train, verbose=0)
    else:
        raise ValueError("Unknown model name: {}".format(model_name))

    print("Learning ", model_name, " time steps: ", timesteps)

    model.learn(total_timesteps=timesteps)
    print("TD3 Model learning completed: ")
    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = (model_name + timestamp)
    model.save(model_file_name)
    print("- ", model_name, " save finish     :")
    print("Training time  ", model_name, " : ", (end - start) / 60, " minutes")

    os.chdir("./..")
    os.chdir("./..")
    return model
Example #7
def train(game, num_timesteps, num_envs, dir_name, model_name,
          prev_model_name):
    dir_name = get_valid_filename(dir_name)
    model_name = get_valid_filename(model_name)

    log_dir = f"logs/{dir_name}/{model_name}-training"
    model_dir = f"models/{dir_name}"
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    env = make_vec_envs(game, False, num_envs)
    prev_model_path = f"{model_dir}/{prev_model_name}.zip"
    if prev_model_name is not None and os.path.exists(prev_model_path):
        model = A2C.load(prev_model_path, env=env)
        model.tensorboard_log = log_dir
    else:
        model = A2C(policy="MlpPolicy",
                    env=env,
                    gamma=0.8,
                    n_steps=64,
                    learning_rate=0.00025,
                    verbose=1,
                    tensorboard_log=log_dir)
    model.learn(num_timesteps)
    model.save(f"{model_dir}/{model_name}.zip")
    env.close()
Example #8
def _train(env_id, agent, model_params, total_steps, is_evaluation=False):
    if agent not in ('ppo2', 'a2c'):
        raise ValueError("Unknown agent: {}".format(agent))
    if is_evaluation:  # evaluate_policy() must only take one environment
        envs = SubprocVecEnv([make_env(env_id)])
    else:
        envs = SubprocVecEnv([make_env(env_id) for _ in range(NUM_CPU)])
    envs = VecNormalize(
        envs)  # normalize the envs during training and evaluation

    # Load pretrained model during training.
    if not is_evaluation and os.path.exists(agent + '_' + env_id):
        if agent == 'ppo2':
            model = PPO2.load(agent + '_' + env_id)
        elif agent == 'a2c':
            model = A2C.load(agent + '_' + env_id)
    else:
        if agent == 'ppo2':
            model = PPO2(MlpLstmPolicy,
                         envs,
                         nminibatches=1,
                         verbose=1,
                         **model_params)
        elif agent == 'a2c':
            model = A2C(MlpLstmPolicy, envs, verbose=1, **model_params)

    model.learn(total_timesteps=total_steps)
    return envs, model
Example #9
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env):
    """
    Train A2C model for atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                                 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_env: (int) The number of environments
    """
    policy_fn = None
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    elif policy == 'lnlstm':
        policy_fn = CnnLnLstmPolicy
    if policy_fn is None:
        raise ValueError("Error: policy {} not implemented".format(policy))

    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)

    model = A2C(policy_fn, env, lr_schedule=lr_schedule, seed=seed)
    model.learn(total_timesteps=int(num_timesteps * 1.1))
    env.close()
Example #10
def define_model(env, log_dir):
    if DEFAULT:
        policy_kwargs = dict()
    else:
        policy_kwargs = dict(act_fun=ACT_FUN, net_arch=NET_ARCH)

    if ALGORITHM == 'ppo2':
        model = PPO2(policy=MlpPolicy,
                     env=env,
                     policy_kwargs=policy_kwargs,
                     verbose=0,
                     tensorboard_log=log_dir)

    elif ALGORITHM == 'a2c':
        model = A2C(policy=MlpPolicy,
                    env=env,
                    policy_kwargs=policy_kwargs,
                    verbose=0,
                    tensorboard_log=log_dir)
    else:
        raise ValueError('Specify a proper algorithm')

    model_arch = model.get_parameter_list()
    print('\n--------------- Summary of archs ---------------')
    for model_param in model_arch:
        print(model_param)
    print('\n')

    return model
Example #11
def test_monitor():
    env = Monitor(gym.make('Pendulum-v0'), filename=None, allow_early_resets=True)
    normalized_env = NormalizeActionWrapper(env)
    normalized_env = DummyVecEnv([lambda: normalized_env])
    # model
    model_2 = A2C('MlpPolicy', normalized_env, verbose=1).learn(1000)
Example #12
def load(config, agent, epoch, from_disk=True):
    config = config['ai']
    if not config['enabled']:
        logging.info("ai disabled")
        return False

    logging.info("[ai] bootstrapping dependencies ...")

    from stable_baselines import A2C
    from stable_baselines.common.policies import MlpLstmPolicy
    from stable_baselines.common.vec_env import DummyVecEnv

    import pwnagotchi.ai.gym as wrappers

    env = wrappers.Environment(agent, epoch)
    env = DummyVecEnv([lambda: env])

    logging.info("[ai] bootstrapping model ...")

    a2c = A2C(MlpLstmPolicy, env, **config['params'])

    if from_disk and os.path.exists(config['path']):
        logging.info("[ai] loading %s ..." % config['path'])
        a2c = A2C.load(config['path'], env)  # load() is a classmethod; rebind the returned model
    else:
        logging.info("[ai] model created:")
        for key, value in config['params'].items():
            logging.info("      %s: %s" % (key, value))

    return a2c
Example #13
def train():
    """Trains an A2C policy """
    env = create_env()

    model = A2C(
        policy=CnnPolicy,
        env=env,
        gamma=0.99,
        n_steps=5,
        vf_coef=0.25,
        ent_coef=0.01,
        max_grad_norm=0.5,
        learning_rate=7e-4,
        alpha=0.99,
        epsilon=1e-05,
        lr_schedule='constant',
        verbose=1,
        tensorboard_log="./tb"
    )

    model.learn(
        total_timesteps=int(1e7), 
        callback=callback, 
        tb_log_name="a2c"
    )

    model.save("models/pacman_a2c.pkl")
Example #14
def train_agent_with_a2c(load=False):
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common.vec_env import SubprocVecEnv
    from stable_baselines import A2C

    # multiprocess environment
    n_cpu = 4
    env = SubprocVecEnv([lambda: gym.make('F16GCAS-v0') for i in range(n_cpu)])
    env = gym.make("F16GCAS-v0")

    class CustomPolicy(MlpPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs,
                                               layers=[128, 128])
    if not load:
        model = A2C(env=env, verbose=1, policy=CustomPolicy)
        # model.learn(total_timesteps=1000000)
        ExpData = ExpertDataset("./lqr_export.npz")
        model.pretrain(ExpData, n_epochs=100)
    else:
        model = A2C.load(ROOT+"/trained_models/TDRL/f16/a2c/128_128", env=env)
        with model.graph.as_default():
            for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/pi/'):
                print(i)

    return model
Example #15
def train_a2c(seed):
    """
    test A2C on the uav_env(cartesian,discrete) 
    :param seed: (int) random seed for A2C
    """
    """
    A2C(policy, env, gamma=0.99, n_steps=5, vf_coef=0.25, ent_coef=0.01, 
    max_grad_norm=0.5, learning_rate=0.0007, alpha=0.99, epsilon=1e-05,
    lr_schedule='linear', verbose=0,tensorboard_log=None, _init_setup_model=True)
    """
    algo = 'A2C'
    num_timesteps = 3000000

    env = set_up_env(seed)

    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0

    model = A2C(policy=MlpPolicy, env=env, gamma=0.99, n_steps=5, vf_coef=0.25,
                ent_coef=0.01, max_grad_norm=0.5, learning_rate=0.0007, alpha=0.99,
                epsilon=1e-05, lr_schedule='linear', verbose=0,
                tensorboard_log="./logs/{}/tensorboard/{}/".format(EXPERIMENT_NATURE, algo))

    model.learn(total_timesteps=num_timesteps, callback=callback, seed=seed,
                log_interval=500, tb_log_name="seed_{}".format(seed))

    model = A2C.load(log_dir + 'best_model.pkl')

    evaluation = evaluate_model(env, model, 100)
    os.makedirs('./logs/{}/csv/{}/'.format(EXPERIMENT_NATURE, algo), exist_ok=True)
    os.rename('/tmp/gym/monitor.csv', "./logs/{}/csv/{}/seed_{}.csv".format(EXPERIMENT_NATURE, algo, seed))
    env.close()
    del model, env
    gc.collect()
    return evaluation
Example #16
    def __init__(self, method, K=5, P=0.95):
        self.method = method
        self.K = K
        self.state_size = self.K + 1
        self.action_size = self.K + 1
        self.reward = []

        env_name = 'ErdosAttack-v0'

        self.log_dir = "/tmp/gym_attack/"
        os.makedirs(self.log_dir, exist_ok=True)

        env = gym.make(env_name)
        env.init_params(K, P)
        env = Monitor(env, self.log_dir, allow_early_resets=True)
        self.envs = DummyVecEnv([lambda: env])

        if method == 'PPO':
            self.model = PPO2(MLP_PPO, self.envs, verbose=0)
        elif method == 'DQN':
            self.model = DQN(MLP_DQN, self.envs, verbose=0)
        elif method == 'A2C':
            self.model = A2C(MLP_A2C, self.envs, verbose=0)
        else:
            raise Exception("Erreur ! Méthode: 'PPO' ou 'DQN' ou 'A2C")
        print("Model Initialized !")

        self.best_mean_reward, self.n_steps = -np.inf, 0
Example #17
def test_evaluate_policy():
    model = A2C('MlpPolicy', 'Pendulum-v0', seed=0)
    n_steps_per_episode, n_eval_episodes = 200, 2
    model.n_callback_calls = 0

    def dummy_callback(locals_, _globals):
        locals_['model'].n_callback_calls += 1

    _, episode_lengths = evaluate_policy(model,
                                         model.get_env(),
                                         n_eval_episodes,
                                         deterministic=True,
                                         render=False,
                                         callback=dummy_callback,
                                         reward_threshold=None,
                                         return_episode_rewards=True)

    n_steps = sum(episode_lengths)
    assert n_steps == n_steps_per_episode * n_eval_episodes
    assert n_steps == model.n_callback_calls

    # Reaching a mean reward of zero is impossible with the Pendulum env
    with pytest.raises(AssertionError):
        evaluate_policy(model,
                        model.get_env(),
                        n_eval_episodes,
                        reward_threshold=0.0)

    episode_rewards, _ = evaluate_policy(model,
                                         model.get_env(),
                                         n_eval_episodes,
                                         return_episode_rewards=True)
    assert len(episode_rewards) == n_eval_episodes
Example #18
def run_agent(envs, parameters):
    '''Train an agent.'''
    alg = parameters['alg']
    learning_rate = parameters['learning_rate']
    gamma = parameters['gamma']
    model_path = parameters['model_path']
    set_global_seeds(parameters.get('seed'))
    dummy_env = OptVecEnv(envs)
    if alg == 'PPO':
        model = PPO2(MlpPolicy,
                     dummy_env,
                     gamma=gamma,
                     learning_rate=learning_rate,
                     verbose=1,
                     nminibatches=dummy_env.num_envs)
    elif alg == 'A2C':
        model = A2C(MlpPolicy,
                    dummy_env,
                    gamma=gamma,
                    learning_rate=learning_rate,
                    verbose=1)
    else:
        model = DDPG(ddpg.MlpPolicy,
                     dummy_env,
                     gamma=gamma,
                     verbose=1,
                     actor_lr=learning_rate / 10,
                     critic_lr=learning_rate)
    try:
        model.learn(total_timesteps=parameters.get('total_timesteps', 10**6))
    except tf.errors.InvalidArgumentError:
        LOGGER.error('Possible Nan, %s', str((alg, learning_rate, gamma)))
    finally:
        dummy_env.close()
        model.save(str(model_path))
Example #19
def run():
    sum_rewards = []
    good_reward = False
    # env = SubprocVecEnv([lambda: simulation.atc_gym.AtcGym() for i in range(n_cpu)])
    # env =  DummyVecEnv([lambda : simulation.atc_gym.AtcGym()])
    env = DummyVecEnv([lambda: simulation.simulator.AirplaneSimulator()])

    # model = PPO2(MlpPolicy, env, verbose=1)
    model = A2C(MlpPolicy, env, verbose=1, tensorboard_log='./')
    model.learn(total_timesteps=num_train_steps)
    obs = env.reset()
    for i in range(num_eval_steps):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        sum_rewards.append(rewards[0])
        if done:
            obs = env.reset()
        #print("Reward: ", rewards, "Done: ", done)
        if rewards[0] != -100:
            print("Reward: ", rewards, "Done: ", done)
            good_reward = True
        #env.render()

    if not good_reward:
        print("Boo")

    print("Average reward across {0} steps: {1}".format(
        num_eval_steps,
        sum(sum_rewards) / len(sum_rewards)))
Example #20
    def optimize_agent(trial):
        """ Train the model and optimise
          Optuna maximises the negative log likelihood, so we
          need to negate the reward here
      """
        model_params = optimize_a2c(trial)
        seed = trial.suggest_int('numpyseed', 1, 2147483647)
        np.random.seed(seed)
        original_env = gym.make('rustyblocks-v0',
                                use_cnn=use_cnn,
                                simple_reward=simple_reward)
        env = DummyVecEnv([lambda: original_env])
        policy = "CnnPolicy" if use_cnn else "MlpPolicy"
        policy_kwargs = dict(
            net_arch=[dict(pi=[128, 128, 128], vf=[128, 128, 128])])
        model = A2C(policy,
                    env,
                    verbose=0,
                    policy_kwargs=policy_kwargs,
                    **model_params)
        print("DOING LEARING a2c")
        original_env.force_progression = False
        has_nan = False

        def learn_callback(a, b):
            nonlocal has_nan  # otherwise the assignment below would only create a local variable
            has_nan = np.isnan(a["actions"]).any()
            return not has_nan

        model.learn(int(2e4 * 5), seed=seed, callback=learn_callback)
        print("DONE LEARING a2c, wins gotten:", original_env.wins)
        if has_nan:
            trial.report(-15.0)
            print("ERRORED WITH NAN")
            return -15.0
        rewards = []
        n_episodes, reward_sum = 0, 0.0

        obs = env.reset()
        original_env.wins = 0
        start = time.time()
        while n_episodes < 1000:
            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            reward_sum += reward

            if done:
                rewards.append(reward_sum)
                reward_sum = 0.0
                n_episodes += 1
                obs = env.reset()
        end = time.time()
        last_reward = np.mean(rewards)
        trial.report(last_reward)
        print(
            "done testing parameters average reward and wins and time_elapsed are:",
            last_reward, original_env.wins, end - start)

        return last_reward
Example #21
def main(experiment_params):


    # define experiment params
    with open(experiment_params, "rb") as f:
        params = json.load(f)

    global eval_increment, rh_kwargs, rh_schedule, model, eval_env # global variables for callback fn
    experiment = params['experiment_name']
    env_name = params['env_name']
    rh_kwargs = params['rh_kwargs']
    rh_schedule = {int(k): v for k,v in params['rh_schedule'].items()}
    verbose_env = params['verbose_env']
    total_steps = params['total_steps']
    train_subset = params['train_subset']
    eval_increment = params['eval_increment']


    # setup env
    register(
        id=env_name,
        entry_point=f'gym_summarizer.envs:{env_name.split("-")[0]}',
    )

    reward_helper = RewardHelper(**rh_schedule[0], **rh_kwargs)

    env = gym.make(env_name, reward_helper=reward_helper, verbose=verbose_env)
    env.data_loader.early_stop = train_subset
    env = DummyVecEnv([lambda: env])

    # define callback function and evaluation env
    eval_dataloader = BatchCNNDMLoader('data/finished_files/test/')
    eval_env = gym.make(env_name, data_loader=eval_dataloader,
                        reward_helper=RewardHelper('average', 'f', is_terminal=True),
                        verbose=False)



    # define model and learn
    model = A2C(MlpPolicy, env, tensorboard_log=f"experiments/{experiment}/", verbose=0, n_steps=2)
    model.learn(total_timesteps=total_steps, callback=reward_schedule_callback)

    # save model and callbacks output
    model.save(f"{experiment}.model")
    with open(f"experiments/{experiment}_returns.pkl", 'wb') as f:
        pickle.dump(returns, f)

    with open(f"experiments/{experiment}_eval.pkl", 'wb') as f:
        pickle.dump(eval_scores, f)

    # plot returns
    df = pd.DataFrame(returns)
    plt.plot(df[0], 'lightblue', df[0].rolling(1000).mean(), 'blue')
    plt.title(f'Training: {experiment}')
    plt.xlabel('Num Episodes')
    plt.ylabel('Episode Reward')
    plt.legend(['Raw', 'Smoothed'])
    plt.show()
Example #22
def load(config, agent, epoch, from_disk=True):
    config = config['ai']
    if not config['enabled']:
        logging.info("ai disabled")
        return False

    try:
        begin = time.time()

        logging.info("[ai] bootstrapping dependencies ...")

        start = time.time()
        from stable_baselines import A2C
        logging.debug("[ai] A2C imported in %.2fs" % (time.time() - start))

        start = time.time()
        from stable_baselines.common.policies import MlpLstmPolicy
        logging.debug("[ai] MlpLstmPolicy imported in %.2fs" %
                      (time.time() - start))

        start = time.time()
        from stable_baselines.common.vec_env import DummyVecEnv
        logging.debug("[ai] DummyVecEnv imported in %.2fs" %
                      (time.time() - start))

        start = time.time()
        import pwnagotchi.ai.gym as wrappers
        logging.debug("[ai] gym wrapper imported in %.2fs" %
                      (time.time() - start))

        env = wrappers.Environment(agent, epoch)
        env = DummyVecEnv([lambda: env])

        logging.info("[ai] creating model ...")

        start = time.time()
        a2c = A2C(MlpLstmPolicy, env, **config['params'])
        logging.debug("[ai] A2C created in %.2fs" % (time.time() - start))

        if from_disk and os.path.exists(config['path']):
            logging.info("[ai] loading %s ..." % config['path'])
            start = time.time()
            a2c = A2C.load(config['path'], env)  # load() is a classmethod; rebind the returned model
            logging.debug("[ai] A2C loaded in %.2fs" % (time.time() - start))
        else:
            logging.info("[ai] model created:")
            for key, value in config['params'].items():
                logging.info("      %s: %s" % (key, value))

        logging.debug("[ai] total loading time is %.2fs" %
                      (time.time() - begin))

        return a2c
    except Exception as e:
        logging.exception("error while starting AI")

    logging.warning("[ai] AI not loaded!")
    return False
Example #23
def load(agent, from_disk=True):
    try:
        env = wrappers.Environment(agent)
        env = DummyVecEnv([lambda: env])
        a2c = A2C(MlpLstmPolicy, env)
        return a2c
    except Exception as e:
        logging.exception("error while starting AI")
    return False
Example #24
def construct_model(seed: int) -> A2C:
    return A2C(MlpPolicy,
               env,
               verbose=0,
               tensorboard_log="/tmp/a2c/",
               ent_coef=0.0,
               gamma=0.95,
               n_cpu_tf_sess=1,
               seed=seed)
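Note: the env referenced inside construct_model is assumed to be a vectorized environment defined at module level elsewhere in the original script; the function itself only fixes the hyperparameters and the seed.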
Example #25
def train_a2c(env_name='fb-v0', e=0):
    # multiprocess environment
    n_cpu = 4
    # bind i as a default argument so each worker env gets its own seed
    env = SubprocVecEnv([lambda i=i: BookmakerEnv(data_path='csv_files/train-12122019.json', seed=i)
                         for i in range(n_cpu)])

    model = A2C(MlpPolicy, env, verbose=2,
                tensorboard_log='tensorboard')
    model.learn(total_timesteps=100000000, seed=1)
    model.save(f"saved/{A2C.__name__}_{env_name}_{e}")
Example #26
def launch_training(nb_cpu,name_agent,name_env,total_timesteps,text):

    env_name = name_env
    #n_cpu = 8
    n_cpu = nb_cpu

    policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[512,512])

    print('TB available at := ',tensorboard_log_dir, file=sys.stderr)
    if name_agent == 'A2C':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir,allow_early_resets=True)

        env = SubprocVecEnv([lambda: env_ for i in range(n_cpu)])
        model = A2C(MlpPolicy, env, n_steps=20,gamma = 0.9, verbose=1,tensorboard_log=tensorboard_log_dir, policy_kwargs=policy_kwargs)
        #model = A2C.load("first_test")
        model_name = "A2C_default_Mlp"+text
    elif name_agent == 'PPO2':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir,allow_early_resets=True)

        env = SubprocVecEnv([lambda: env_ for i in range(n_cpu)])
        model = PPO2(MlpPolicy, env,n_steps=80,gamma = 0.97, verbose=1,tensorboard_log=tensorboard_log_dir, policy_kwargs=policy_kwargs)
        #model = A2C.load("first_test")
        model_name = "PPO2_default_Mlp"+text
    elif name_agent == 'TRPO':
        env_ = FluidMechanicsEnv()
        env_ = Monitor(env_, console_log_dir,allow_early_resets=True)

        env = DummyVecEnv([lambda: env_ for i in range(n_cpu)])

        model = TRPO(MlpPolicy, env,gamma = 0.1, verbose=1,tensorboard_log=tensorboard_log_dir, policy_kwargs=policy_kwargs)
        #model = A2C.load("first_test")
        model_name = "TRPO_default_Mlp"+text


    time = datetime.now().strftime('%Y-%m-%d_%H_%M_%S')

    log_name = f"_model={model_name}_time={time}"
    print('with the following line := ','tensorboard --logdir ',tensorboard_log_dir+log_name)
    training_log = open(f"{console_log_dir}/{log_name}.log", "a")
    sys.stdout = training_log
    logging.basicConfig(level=logging.INFO, filename=f"{console_log_dir}/{log_name}.log", datefmt='%H:%M:%S',
                        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s')
    model_file_name = f"{models_log_dir}{log_name}_best.pkl"


    start = datetime.now()
    print("Learning model", file=sys.stderr)

    model.learn(total_timesteps=int(total_timesteps), tb_log_name=log_name, callback=callback)

    training_time = datetime.now() - start
    print(f"Training time: {training_time}", file=sys.stderr)

    print("Saving final model", file=sys.stderr)
    model.save(f"{models_log_dir}{log_name}_final.pkl")
Example #27
    def eval_model(model, test_env_id):
        global eval_step, THRESHOLD
        test_success, curriculum_success = True, True
        performance_data[eval_step] = {}
        for env_id in env_ids:
            write_out(
                "[MODEL EVAL]\tTesting learner on env: {}".format(env_id))
            env, eval_env, eval_callback = init_env(env_id)

            fresh_model = A2C(CnnPolicy, env, verbose=verbose)
            fresh_model.learn(total_timesteps=max_steps,
                              callback=eval_callback)

            fresh_mean, fresh_std = evaluate_policy(fresh_model,
                                                    eval_env,
                                                    n_eval_episodes=100)
            model_mean, model_std = evaluate_policy(model,
                                                    eval_env,
                                                    n_eval_episodes=100)
            performance_data[eval_step][env_id] = {
                'baseline_mean': fresh_mean,
                'baseline_std': fresh_std,
                'model_mean': model_mean,
                'model_std': model_std,
                'baseline_training_steps': max_steps,
                'eval_episodes': 100
            }
            write_out(
                "[MODEL EVAL: LEARNER] \t env_id: {}, Mean Reward: {}, std_dev: {}"
                .format(env_id, model_mean, model_std))
            write_out(
                "[MODEL EVAL: BASELINE]\t env_id: {}, Mean Reward: {}, std_dev: {}"
                .format(env_id, fresh_mean, fresh_std))

            pass_test = round(model_mean - model_std, 3) >= round(
                fresh_mean - fresh_std, 3)
            diff = abs(
                round(model_mean - model_std, 3) -
                round(fresh_mean - fresh_std, 3))
            if pass_test:
                write_out(
                    "[TEST RESULT]\tmodel out-performs fresh model for env: {}, diff: {}"
                    .format(env_id, diff))
            else:
                write_out(
                    "[TEST RESULT]\tmodel DID NOT out-perform fresh model for env: {}, diff: {}"
                    .format(env_id, diff))
                if env_id == test_env_id:
                    test_success = False

        curriculum_success = sum([
            performance_data[eval_step][env_id]['baseline_mean'] > THRESHOLD
            for env_id in env_ids
        ]) == len(env_ids)
        eval_step += 1
        return test_success, curriculum_success
Example #28
def make_new_model(model_type, policy, env, tensorboard_log=None):
    if model_type.lower() == 'dqn':
        model = DQN(policy, env, tensorboard_log=tensorboard_log)
    elif model_type.lower() == 'ppo2':
        model = PPO2(policy, env, tensorboard_log=tensorboard_log)
    elif model_type.lower() == 'a2c':
        model = A2C(policy, env, tensorboard_log=tensorboard_log)
    elif model_type.lower() == 'acktr':
        model = ACKTR(policy, env, tensorboard_log=tensorboard_log)
    else:
        raise ValueError("Unknown model type: {}".format(model_type))
    return model
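A minimal usage sketch (assumes gym and the stable_baselines classes imported by the snippet; values are illustrative):

import gym

env = gym.make('CartPole-v1')
model = make_new_model('PPO2', 'MlpPolicy', env, tensorboard_log='./tb/')  # matching is case-insensitive
model.learn(total_timesteps=10000)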
Example #29
def make_model(config, env, action_noise_fun):
    model = None

    if config["algo_name"] == "A2C" and config["policy_name"] == "MlpPolicy":
        model = A2C(config["policy_name"],
                    env=env,
                    gamma=config["gamma"],
                    n_steps=config["n_steps"],
                    vf_coef=config["vf_coef"],
                    ent_coef=config["ent_coef"],
                    max_grad_norm=config["max_grad_norm"],
                    learning_rate=config["sb2_learning_rate"],
                    alpha=config["alpha"],
                    epsilon=config["epsilon"],
                    lr_schedule=config["lr_schedule"],
                    verbose=config["verbose"],
                    tensorboard_log="./tb/{}/".format(config["session_ID"]),
                    full_tensorboard_log=config["full_tensorboard_log"],
                    policy_kwargs=dict(net_arch=[
                        int(config["policy_hid_dim"]),
                        int(config["policy_hid_dim"])
                    ]))

    if config["algo_name"] == "A2C" and config[
            "policy_name"] == "MlpLstmPolicy":
        model = A2C(CustomLSTMPolicy,
                    env=env,
                    gamma=config["gamma"],
                    n_steps=config["n_steps"],
                    vf_coef=config["vf_coef"],
                    ent_coef=config["ent_coef"],
                    max_grad_norm=config["max_grad_norm"],
                    learning_rate=config["sb2_learning_rate"],
                    alpha=config["alpha"],
                    epsilon=config["epsilon"],
                    lr_schedule=config["lr_schedule"],
                    verbose=config["verbose"],
                    tensorboard_log="./tb/{}/".format(config["session_ID"]),
                    full_tensorboard_log=config["full_tensorboard_log"])

    assert model is not None, "Alg name not found, exiting. "
    return model
Example #30
def train_A2C(env_train, model_name, timesteps=50000):
    """A2C model"""

    start = time.time()
    model = A2C('MlpPolicy', env_train, verbose=0)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (A2C): ', (end - start) / 60, ' minutes')
    return model