Example No. 1
def train(hours):
    conn = Connection()
    env = Monitor(SpireEnv(conn), "./tmp/")
    env.reset()
    logdir = "./tboard_log"
    try:
        model = MODEL_CLASS.load(MODEL_NAME, env=env, tensorboard_log=logdir)
    except FileNotFoundError:
        model = MODEL_CLASS(MlpPolicy, env, tensorboard_log=logdir, **KWARGS)
    start = time.time()

    steps_per_hour = 7000
    steps = steps_per_hour * hours

    callback = TensorboardCallback(env)
    model.learn(total_timesteps=steps,
                reset_num_timesteps=False,
                callback=callback)
    model.save(MODEL_NAME)

    elapsed = time.time() - start
    print(f"{steps} steps processed")
    print(f"{timedelta(seconds=elapsed)} time elapsed")
    print(f"{env.total_floors} floors climbed")
    print(f"{env.total_games} games played")
    if env.total_games > 0:
        print("{:.2f} floors per game".format(env.total_floors /
                                              env.total_games))
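
The Monitor wrapper above writes per-episode reward, length and timing to a monitor CSV under "./tmp/". A minimal sketch of inspecting that log after training, assuming stable-baselines3 (and pandas) are installed:

# Sketch: read the episode statistics that Monitor logged to ./tmp/
from stable_baselines3.common.monitor import load_results

df = load_results("./tmp/")                       # DataFrame with columns "r", "l", "t"
print("episodes:", len(df))
print("mean reward (last 100):", df["r"].tail(100).mean())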
Example No. 2
def train():
    best_reward, best_reward_timesteps = None, None
    save_path = "model_save/"+MODEL_PATH+"/"
    os.makedirs(save_path, exist_ok=True)

    # log_dir = f"model_save/"
    log_dir = save_path
    env, env_eval = ENV(util='train', par=PARAM, dt=DT), ENV(util='val', par=PARAM, dt=DT)
    env, env_eval = Monitor(env, log_dir), Monitor(env_eval, log_dir)
    env, env_eval = DummyVecEnv([lambda: env]), DummyVecEnv([lambda: env_eval])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True,
    #                clip_obs=10.)

    if PARAM['algo']=='td3':
        model = TD3('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'], seed=PARAM['seed'],
                    learning_starts=PARAM['learning_starts'])
    elif PARAM['algo']=='ddpg':
        model = DDPG('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'], seed=PARAM['seed'],
                     learning_starts=PARAM['learning_starts'])
    elif PARAM['algo']=='ppo':
        model = PPO('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'], seed=PARAM['seed'])

    eval_callback = EvalCallback(env_eval, best_model_save_path=save_path+MODEL_PATH+'_best_model',
                                 log_path=log_dir, eval_freq=PARAM['eval_freq'], save_freq=PARAM['save_freq'],
                                 deterministic=True, render=False)

    model.learn(total_timesteps=int(PARAM['total_time_step']), callback=eval_callback, log_interval = 500)
    print("best mean reward:", eval_callback.best_mean_reward_overall, "timesteps:", eval_callback.best_mean_reward_timestep)
    model.save(save_path+MODEL_PATH+'_final_timesteps')
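
This function reads its hyperparameters from module-level globals (PARAM, MODEL_PATH, ENV, DT). A hypothetical PARAM dictionary showing only the keys the code above actually reads; the values are placeholders, not taken from the original:

PARAM = {
    "algo": "td3",               # one of "td3", "ddpg", "ppo"
    "batch_size": 256,
    "seed": 0,
    "learning_starts": 10000,    # only used by TD3/DDPG
    "eval_freq": 5000,
    "save_freq": 50000,
    "total_time_step": 1000000,
}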
Example No. 3
def make_envs(env_id, log_dir, gamma, max_train_ep_length, max_eval_ep_length,
              seed):
    """Make training and evaluation environments (vectorized envs)."""

    # Training env
    train_env = gym.make(env_id)
    train_env.seed(seed)  # Set random seed
    train_env = TimeLimitWrapper(
        train_env, max_train_ep_length)  # Limit length of training episodes
    train_env = Monitor(train_env, log_dir)  # Monitor training
    train_env = NormalizeActionWrapper(train_env)  # Normalize action space
    train_env = DummyVecEnv([lambda: train_env])  # Vectorize environment
    train_env = VecNormalize(train_env,
                             gamma=gamma)  # Normalise observations and rewards

    # Eval env
    eval_env = gym.make(env_id)
    eval_env.seed(seed)  # Set random seed
    eval_env = TimeLimitWrapper(
        eval_env,
        max_eval_ep_length)  # Set a maximum number of timesteps during eval
    eval_env = Monitor(
        eval_env
    )  # Used to ensure original action space is not modified by `NormalizeActionWrapper`
    eval_env = NormalizeActionWrapper(eval_env)  # Normalize action space
    eval_env = DummyVecEnv([lambda: eval_env])  # Vectorize environment
    eval_env = VecNormalize(eval_env,
                            gamma=gamma,
                            training=False,
                            norm_reward=False)  # Normalise observations
    # (obs/reward normalization gets synchronised with `train_env` in `EvalCallback`)

    return train_env, eval_env
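
Because the training env is wrapped in VecNormalize, its running observation/reward statistics should be persisted alongside the model and reused at evaluation time. A minimal usage sketch, assuming PPO from stable-baselines3; the env id and paths are assumptions:

# Sketch: build the envs, train briefly, then save the VecNormalize statistics.
from stable_baselines3 import PPO

train_env, eval_env = make_envs("Pendulum-v1", "./logs/", gamma=0.99,
                                max_train_ep_length=200, max_eval_ep_length=1000, seed=0)
model = PPO("MlpPolicy", train_env, verbose=0)
model.learn(total_timesteps=10000)
train_env.save("./logs/vec_normalize.pkl")   # reload later with VecNormalize.load(path, venv)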
Example No. 4
def _init_envs(image_observations, num_skip_steps, opponent_pred_obs, adversarial_training):
    """ Initialize the environments with the necessary wrappers for training. Wrappers are determined by the settings in the arguments. """
    # In order to ensure symmetry for the agent when playing on either side, change second agent to red, so both have the same color
    if image_observations:
        pong_duel.AGENT_COLORS[1] = 'red'
        # Initialize environment
        train_env = gym.make('PongDuel-v0')
        train_env = RewardZeroToNegativeBiAgentWrapper(train_env)

        train_env_rule_based = ObservationVectorToImage(train_env, 'p1')
        train_env_rule_based = MAGymCompatibilityWrapper(train_env_rule_based, num_skip_steps=num_skip_steps, image_observations='main')

        if adversarial_training is not None:
            train_env_rule_based = AdversarialTrainingWrapper(train_env_rule_based,
                                                              adversarial_probability=adversarial_training,
                                                              img_obs=image_observations)
        train_env_rule_based = Monitor(train_env_rule_based)

        train_env = ObservationVectorToImage(train_env, 'both')
        train_env = MAGymCompatibilityWrapper(train_env, num_skip_steps=num_skip_steps, image_observations='both')

        if adversarial_training is not None:
            train_env = AdversarialTrainingWrapper(train_env,
                                                   adversarial_probability=adversarial_training,
                                                   img_obs=image_observations)
        train_env = Monitor(train_env)

        eval_env_rule_based = gym.make('PongDuel-v0')
        eval_env_rule_based = ObservationVectorToImage(eval_env_rule_based, 'p1')
        eval_env_rule_based = MAGymCompatibilityWrapper(eval_env_rule_based, num_skip_steps=num_skip_steps, image_observations='main')
        eval_op = SimpleRuleBasedAgent(eval_env_rule_based)
        eval_env_rule_based.set_opponent(eval_op)

        eval_env = gym.make('PongDuel-v0')
        eval_env = ObservationVectorToImage(eval_env, 'both')
        eval_env = MAGymCompatibilityWrapper(eval_env, num_skip_steps=num_skip_steps, image_observations='both')
    else:  # Init for feature observations
        train_env = gym.make('PongDuel-v0')
        train_env = ObserveOpponent(train_env, 'both')
        train_env = RewardZeroToNegativeBiAgentWrapper(train_env)
        train_env = MAGymCompatibilityWrapper(train_env, num_skip_steps=num_skip_steps, image_observations='none')
        if opponent_pred_obs:
            train_env = OpponentPredictionObs(train_env)
        if adversarial_training is not None:
            train_env = AdversarialTrainingWrapper(train_env,
                                                   adversarial_probability=adversarial_training,
                                                   img_obs=image_observations)
        train_env = Monitor(train_env)

        eval_env = gym.make('PongDuel-v0')
        eval_env = ObserveOpponent(eval_env, 'both')
        eval_env = MAGymCompatibilityWrapper(eval_env, num_skip_steps=num_skip_steps, image_observations='none')

        # For feature observations we don't need separate environments for rule-based and non-rule-based agents
        train_env_rule_based = train_env
        eval_env_rule_based = eval_env
        eval_op = SimpleRuleBasedAgent(eval_env_rule_based)
        eval_env_rule_based.set_opponent(eval_op)

    return eval_env, eval_env_rule_based, eval_op, train_env, train_env_rule_based
Example No. 5
 def _init() -> gym.Env:
     env = gym.make(env_id)
     
     # Create folder if needed
     if log_dir is not None:
         os.makedirs(log_dir, exist_ok=True)
     
     env = Monitor(env, log_dir)
     env.seed(seed + rank)
     return env
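
This _init closure is the usual per-rank factory pattern; a sketch of the enclosing function and its vectorized use (the factory name and arguments are assumptions, since the original only shows the inner closure):

# Hypothetical enclosing factory for the _init closure above.
import os
import gym
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import SubprocVecEnv

def make_env(env_id: str, rank: int, seed: int = 0, log_dir: str = None):
    def _init() -> gym.Env:
        env = gym.make(env_id)
        if log_dir is not None:
            os.makedirs(log_dir, exist_ok=True)
        env = Monitor(env, log_dir)
        env.seed(seed + rank)
        return env
    return _init

# vec_env = SubprocVecEnv([make_env("CartPole-v1", rank=i, seed=0, log_dir="./logs/") for i in range(4)])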
Example No. 6
def train_ppo(itr=0, timesteps=1e7, use_dummy_video = True):
	env = flappy_env.FlappyEnv(use_dummy_video)
	env = Monitor(env, f"flappy_ppo_{itr}")
	obs = env.reset()
	model = PPO(
		"CnnPolicy", 
		env, 
		verbose=1, 
		learning_rate=1e-5,
		tensorboard_log = f"./ppo_flappy_tensorboard_{itr}/")
	model.learn(total_timesteps = timesteps)
	model.save(f"ppo_flappy_{itr}")
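
A short follow-up sketch: resume training from the checkpoint saved above, reusing the same Monitor filename pattern. Assumes flappy_env is importable and that Monitor/PPO are imported as in the example:

# Sketch: reload the saved policy and continue training without resetting the timestep counter.
from stable_baselines3 import PPO
import flappy_env

env = Monitor(flappy_env.FlappyEnv(True), "flappy_ppo_0")
model = PPO.load("ppo_flappy_0", env=env)
model.learn(total_timesteps=100000, reset_num_timesteps=False)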
Example No. 7
    def update_env(self,
                   env,
                   support_multi_env: bool = False,
                   eval_env: Optional[GymEnv] = None,
                   monitor_wrapper: bool = True,
                   reset_optimizers: bool = False,
                   **kwargs):
        """
        Replace the current env with a new env.

        :param env: Gym environment (an instantiated env, not a string).
        :param support_multi_env: Whether the algorithm supports training
            with multiple environments (as in A2C).
        :param eval_env: Environment to use for evaluation (optional).
        :param monitor_wrapper: Whether to wrap the new environment(s)
            in a ``Monitor`` wrapper.
        :param reset_optimizers: Whether to reset the optimizers (momentum buffers, etc.).
        :param kwargs: Ignored; accepted only so extra arguments do not make the call fail.
        """
        if reset_optimizers:
            optimizers = []
            if self.actor is not None:
                optimizers.append(self.actor.optimizer)
            if self.critic is not None:
                optimizers.append(self.critic.optimizer)
            if self.ent_coef_optimizer is not None:
                optimizers.append(self.ent_coef_optimizer)

            # Reset optimizers:
            for i_optimizer, optimizer in enumerate(optimizers):
                optimizer.__init__(optimizer.param_groups[0]['params'])
                optimizers[i_optimizer] = optimizer

        if env is not None:
            if eval_env is not None:
                self.eval_env = eval_env
                if monitor_wrapper:
                    self.eval_env = Monitor(self.eval_env, filename=None)

            if monitor_wrapper:
                env = Monitor(env, filename=None)
            env = self._wrap_env(env, self.verbose)

            self.observation_space = env.observation_space
            self.action_space = env.action_space
            self.n_envs = env.num_envs
            self.env = env

            if not support_multi_env and self.n_envs > 1:
                raise ValueError(
                    "Error: the model does not support multiple envs; it requires "
                    "a single vectorized environment.")
Example No. 8
def train_dqn(itr = 0, timesteps = 1e7, use_dummy_video = True):
	env = flappy_env.FlappyEnv(use_dummy_video)
	env = Monitor(env, f"flappy_dqn_{itr}")
	obs = env.reset()
	model = DQN(
		"CnnPolicy", 
		env, 
		verbose = 1, 
		optimize_memory_usage = True, 
		buffer_size = 500000, 
		learning_rate = 1e-5, 
		tensorboard_log = f"./dqn_flappy_tensorboard_{itr}/")
	model.learn(total_timesteps = timesteps)
	model.save(f"dqn_flappy_{itr}")
Example No. 9
def train_policy_ppo(path='policy_ppo', org_path='prob_ppo'):
    """
    Train against an environment driven by a previously trained policy.
    Args:
        path        trained model file path (where the result is saved)
        org_path    trained model file path loaded as the base/opponent policy
    """
    print(f'train ppo with prob_player path={path}, org_path={org_path}')
    # Load the trained model file
    model = PPO.load(org_path)

    # Build the rock-paper-scissors environment
    env = RockPaperScissorsEnv(AIPlayer(model))
    env = Monitor(env, LOGDIR, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # Attach the environment to the model
    model.set_env(env)

    # Run training
    elapsed = time.time()
    model.learn(total_timesteps=1000000)
    print(f'elapsed time: {time.time() - elapsed}sec')

    # Save the trained model
    model.save(path)

    # Close the environment
    env.close()
Example No. 10
def make_experiment_env(params, train):
    clear = MortalKombat2.\
        make_mortal_kombat2_env(difficulties=params["difficulties"],
                                arenas=params["arenas"],
                                left_players=params["left_players"],
                                right_players=params["right_players"],
                                controllable_players=params["controllable_players"],
                                actions=params["actions"],
                                state_versions=params["state_versions"])
    env = FrameskipWrapper(clear, skip=params["frameskip"])

    if params["max_episode_length"]:
        env = MaxEpLenWrapper(env,
                              max_len=params["max_episode_length"] // params["frameskip"])

    env = WarpFrame(env, 48, 48)

    if train:
        env = Monitor(env,
                      info_keywords=("P1_rounds", "P2_rounds", "P1_health",
                                     "P2_health", "steps", "difficulty",
                                     "arena", "P1", "P2", "state_version"))
        return env
    else:
        return clear, env, env
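
The keys listed in info_keywords are copied by Monitor into info["episode"] when an episode ends (and into the monitor CSV when a filename is given). A sketch of reading them during a rollout; params is assumed to be defined as in the example:

# Sketch: run one episode and read the extra Monitor keywords at the end.
env = make_experiment_env(params, train=True)
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
print(info["episode"]["r"], info["episode"]["P1_rounds"], info["episode"]["difficulty"])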
Example No. 11
def train_pa_ppo(path='pa_ppo'):
    """
    Train against an environment whose opponent plays each hand with probability 1/3.
    Args:
        path    trained model file path (where the result is saved)
    Returns:
        None
    """
    print(f'train ppo with jurina_player path={path}')
    # Build the rock-paper-scissors environment
    env = RockPaperScissorsEnv(JurinaPlayer())
    env = Monitor(env, LOGDIR, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # Initialize the PPO model
    model = PPO('MlpPolicy', env, verbose=1)

    # Run training
    elapsed = time.time()
    model.learn(total_timesteps=1000000)
    print(f'elapsed time: {time.time() - elapsed}sec')

    # Save the trained model
    model.save(path)

    # Close the environment
    env.close()
Example No. 12
    def _wrap_env(env: GymEnv,
                  verbose: int = 0,
                  monitor_wrapper: bool = True) -> VecEnv:
        """
        Wrap environment with the appropriate wrappers if needed.
        For instance, to have a vectorized environment
        or to re-order the image channels.

        :param env:
        :param verbose:
        :param monitor_wrapper: Whether to wrap the env in a ``Monitor`` when possible.
        :return: The wrapped environment.
        """
        if not isinstance(env, VecEnv):
            if not is_wrapped(env, Monitor) and monitor_wrapper:
                if verbose >= 1:
                    print("Wrapping the env with a `Monitor` wrapper")
                env = Monitor(env)
            if verbose >= 1:
                print("Wrapping the env in a DummyVecEnv.")
            env = DummyVecEnv([lambda: env])

        if (is_image_space(env.observation_space)
                and not is_vecenv_wrapped(env, VecTransposeImage)
                and not is_image_space_channels_first(env.observation_space)):
            if verbose >= 1:
                print("Wrapping the env in a VecTransposeImage.")
            env = VecTransposeImage(env)

        # check if wrapper for dict support is needed when using HER
        if isinstance(env.observation_space, gym.spaces.dict.Dict):
            env = ObsDictWrapper(env)

        return env
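
The same wrapping chain can be reproduced manually when explicit control over the order is needed; a sketch using stable-baselines3's public wrappers (the env id is an assumption):

import gym
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage

env = Monitor(gym.make("PongNoFrameskip-v4"))   # record episode statistics first
env = DummyVecEnv([lambda: env])                # vectorize the single env
env = VecTransposeImage(env)                    # HxWxC -> CxHxW for image observations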
Example No. 13
def main(config: str, agent: str):
    with open(config) as fp:
        json_data = json.load(fp)

    config = GameConfig.deserialize(json_data)
    log_dir = config.agents_config[agent]["save_path"]
    # if agent == "DQN":
    #     env = make_atari_env(config.game_name, n_envs=1,
    #                          seed=0, monitor_dir=log_dir)

    # elif agent == "PPO":
    #     env = make_atari_env(config.game_name, n_envs=8,
    #                          seed=0, monitor_dir=log_dir)

    # else:
    #     env = make_atari_env(config.game_name, n_envs=16,
    #                          seed=0, monitor_dir=log_dir)

    env = gym_super_mario_bros.make(config.game_name)
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    env = Monitor(env, log_dir)

    # env = VecFrameStack(env, n_stack=4)

    agent = AgentLoader.get_agent(agent, config.agents_config, env)
Example No. 14
def test_her(model_class, online_sampling, image_obs_space):
    """
    Test Hindsight Experience Replay.
    """
    n_bits = 4
    env = BitFlippingEnv(
        n_bits=n_bits,
        continuous=not (model_class == DQN),
        image_obs_space=image_obs_space,
    )

    model = model_class(
        "MultiInputPolicy",
        env,
        replay_buffer_class=HerReplayBuffer,
        replay_buffer_kwargs=dict(
            n_sampled_goal=2,
            goal_selection_strategy="future",
            online_sampling=online_sampling,
            max_episode_length=n_bits,
        ),
        train_freq=4,
        gradient_steps=1,
        policy_kwargs=dict(net_arch=[64]),
        learning_starts=100,
        buffer_size=int(2e4),
    )

    model.learn(total_timesteps=150)
    evaluate_policy(model, Monitor(env))
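
evaluate_policy returns the mean and standard deviation of the episode reward (unless return_episode_rewards=True). A short sketch of consuming its result, reusing the model and env from the test above:

from stable_baselines3.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(model, Monitor(env), n_eval_episodes=10)
print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")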
Example No. 15
def Main():
    pp = pprint.PrettyPrinter(indent=4)

    #make environment and wrap
    env = gym.make('ur5e_reacher-v1')
    env = Monitor(env, filename="logs", allow_early_resets=True)
    #***define model***
    #hyperparams
    n_actions = env.action_space.shape[-1]
    # action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    # action_noise must be an instance, not the NormalActionNoise class; the sigma below is illustrative
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    model_class = DDPG
    #kwargs are the parameters for DDPG model init
    kwargs = {"device": "cuda", "action_noise": action_noise}
    model = HER(
        'MlpPolicy',
        env,
        model_class,
        n_sampled_goal=4,
        goal_selection_strategy='future',
        verbose=1,
        learning_rate=0.005,
        online_sampling=True,
        #max_episode_steps=4800
        **kwargs)

    #train model
    train = False
    if train:
        model.learn(2 * 10e5)
        model.save("./her_ur5e_model/model_")

        #load model, not really necessary
    evaluate = True
Example No. 16
def main(do_render: bool, seed: int, as_gdads: bool, name: str,
         do_train: bool):
    drop_abs_position = True

    conf: Conf = CONFS[name]
    dict_env = get_env(name=name,
                       drop_abs_position=drop_abs_position,
                       is_training=True)
    if as_gdads:
        flat_env = SkillWrapper(env=dict_env)
    else:
        flat_env = flatten_env(dict_env, drop_abs_position)
    flat_env = TransformReward(flat_env, f=lambda r: r * conf.reward_scaling)
    flat_env = Monitor(flat_env)

    dict_env = get_env(name=name,
                       drop_abs_position=drop_abs_position,
                       is_training=False)
    if as_gdads:
        use_slider = False
        if use_slider:
            eval_env = SliderWrapper(env=dict_env)
        else:
            eval_env = GDADSEvalWrapper(dict_env,
                                        sw=BestSkillProvider(flat_env))
    else:
        eval_env = flatten_env(dict_env=dict_env,
                               drop_abs_position=drop_abs_position)

    filename = f"modelsCommandSkills/{name}/asGDADS{as_gdads}/resamplingFalse_goalSpaceTrue-seed-{seed}"
    if os.path.exists(filename + ".zip"):
        sac = SAC.load(filename + ".zip", env=flat_env)
        print(f"loaded model {filename}")
        if as_gdads:
            flat_env.load(filename)
    else:
        sac = SAC("MlpPolicy",
                  env=flat_env,
                  verbose=1,
                  learning_rate=conf.lr,
                  tensorboard_log=filename,
                  buffer_size=conf.buffer_size,
                  batch_size=conf.batch_size,
                  gamma=gamma(conf.ep_len),
                  learning_starts=100 * conf.ep_len,
                  policy_kwargs=dict(log_std_init=-3,
                                     net_arch=[conf.layer_size] * 2),
                  seed=seed,
                  device="cuda",
                  train_freq=4)
    if do_train:
        train(model=sac, conf=conf, save_fname=filename, eval_env=eval_env)
    if do_render:
        show(model=sac, env=eval_env, conf=conf)
    do_eval = not do_train and not do_render
    if do_eval:
        results = ant_grid_evaluation(model=sac,
                                      env=eval_env,
                                      episode_len=conf.ep_len)
        dump_ant_grid_evaluation(results)
Example No. 17
def train():
    train_images, test_images = load_data("dataset")

    env = Monitor(
        PuzzleEnv(images=train_images,
                  img_size=IMG_SIZE,
                  channel_num=CHANNEL_NUM,
                  puzzle_size=(3, 3),
                  max_step_num=100,
                  puzzle_type="switch",
                  dist_type="manhattan",
                  penalty_for_step=-0.2,
                  reward_for_completiton=20,
                  positive_reward_coefficient=1.0,
                  obs_conf=OBS_CONF))

    policy_kwargs = dict(
        features_extractor_class=CustomCNN,
        features_extractor_kwargs=dict(features_dim=128),
    )

    model = PPO('CnnPolicy',
                env,
                policy_kwargs=policy_kwargs,
                verbose=1,
                learning_rate=0.0005,
                seed=42)
    model.learn(total_timesteps=1000000)

    test(model, test_images)
Example No. 18
def new_test():

    processed = pd.read_csv(
        os.path.abspath('./me/datasets/new_data_with_techs_turb.csv'),
        index_col=0)

    train = data_split(processed, '2009-01-01', '2018-01-01')
    trade = data_split(processed, '2018-01-01', '2021-01-01')

    stock_dimension = len(train.tic.unique())
    state_space = 1 + 2 * stock_dimension + len(
        config.TECHNICAL_INDICATORS_LIST) * stock_dimension
    print(f"Stock Dimension: {stock_dimension}, State Space: {state_space}")

    env_kwargs = {
        "hmax": 100,
        "initial_amount": 1000000,
        "transaction_cost_pct": 0.001,
        "state_space": state_space,
        "stock_dim": stock_dimension,
        "tech_indicator_list": config.TECHNICAL_INDICATORS_LIST,
        "action_space": stock_dimension,
        "reward_scaling": 1e-4
    }

    e_train_gym = StockTradingEnv(df=train, **env_kwargs)
    env_train, _ = e_train_gym.get_sb_env()

    log_dir = "me/tmp/"
    os.makedirs(log_dir, exist_ok=True)

    env_train.envs[0] = Monitor(env_train.envs[0], log_dir)

    agent = DRLAgent(env=env_train)
    model_a2c = agent.get_model("a2c", verbose=0)

    trained_a2c = agent.train_model(model=model_a2c,
                                    tb_log_name='a2c',
                                    total_timesteps=100000)

    data_turbulence = processed[(processed.date < '2018-01-01')
                                & (processed.date >= '2009-01-01')]
    insample_turbulence = data_turbulence.drop_duplicates(subset=['date'])
    turbulence_threshold = np.quantile(insample_turbulence.turbulence.values,
                                       1)

    e_trade_gym = StockTradingEnv(df=trade,
                                  turbulence_threshold=380,
                                  **env_kwargs)
    env_trade, obs_trade = e_trade_gym.get_sb_env()

    print("BEGIN PREDICTION")
    df_account_value, df_actions = DRLAgent.DRL_prediction(model=trained_a2c,
                                                           test_data=trade,
                                                           test_env=env_trade,
                                                           test_obs=obs_trade)

    print(df_account_value)

    print("END PREDICTION")
Example No. 19
def main(args):
    # 1. Start a W&B run
    wandb.init(project='pearl', entity='adlr-ss-21-05')
    wandb.config.update(args)
    print("wandb name: ", wandb.run.name)

    log_dir = "tmp/" + wandb.run.name + "/"
    os.makedirs(log_dir, exist_ok=True)

    callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                check_log=1,
                                                log_dir=log_dir,
                                                model_name=wandb.run.name)

    env = gym.make('kuka_iiwa_insertion-v0',
                   use_gui=False,
                   steps_per_action=args.steps_per_action,
                   max_steps=args.max_steps,
                   action_step_size=args.action_step_size)
    env = Monitor(env, log_dir)

    model = SAC("MlpPolicy",
                env,
                verbose=args.verbosity,
                train_freq=(args.train_freq_num, args.train_freq_type),
                batch_size=args.batch_size)

    i = 0
    save_interval = 1000000
    while True:
        i += save_interval
        model.learn(total_timesteps=save_interval, callback=callback)
Example No. 20
    def train_model(self):

        auto_save_callback = SaveOnBestTrainingRewardCallback(
            log_dir=self.log_dir)
        auto_save_callback_every_1000_steps = EveryNTimesteps(
            n_steps=1000, callback=auto_save_callback)

        self.environment = Monitor(self.environment, self.log_dir)
        self.model = self.algorithm('MlpPolicy',
                                    self.environment,
                                    verbose=1,
                                    tensorboard_log=self.log_dir)

        name = self.model_name + "_full_model"
        checkpoint_callback = SavePerformanceOnCheckpoints(
            resource_manager=self,
            name=name,
            checkpoint_results=self.checkpoint_results)
        checkpoint_callback_every_1000_steps = EveryNTimesteps(
            n_steps=1000, callback=checkpoint_callback)

        with ProgressBarManager(self.training_steps) as progress_callback:
            self.model.learn(total_timesteps=self.training_steps,
                             callback=[
                                 progress_callback,
                                 auto_save_callback_every_1000_steps,
                                 checkpoint_callback_every_1000_steps
                             ])

        self.save_episode_rewards_as_csv()
        model_path = os.path.abspath("models/" + name)
        self.model.save(model_path)
Example No. 21
def main():
    # Instantiate the env
    env = Gaze(fitts_W=fitts_W,
               fitts_D=fitts_D,
               ocular_std=ocular_std,
               swapping_std=swapping_std)
    env = Monitor(env, log_dir)

    # Train the agent
    model = PPO('MlpPolicy', env, verbose=0, clip_range=0.15)
    '''
    # Save a checkpoint periodically
    save_freq_n=timesteps/10
    checkpoint_callback = CheckpointCallback(save_freq=save_freq_n, save_path=f'{log_dir}savedmodel/',
        name_prefix='model_ppo')
    '''

    # Train the agent (pass callback=checkpoint_callback if the checkpoint block above is enabled)
    model.learn(total_timesteps=int(timesteps))

    # Save the model
    model.save(f'{log_dir}savedmodel/model_ppo')

    # Plot the learning curve
    plot_results2(log_dir)

    save_learned_behaviour()
Example No. 22
 def make_eval_env(with_monitor, wrapper_class=gym.Wrapper):
     # Make eval environment with or without monitor in root,
     # and additionally wrapped with another wrapper (after Monitor).
     env = None
     if vec_env_class is None:
         # No vecenv, traditional env
         env = gym.make(env_id)
         if with_monitor:
             env = Monitor(env)
         env = wrapper_class(env)
     else:
         if with_monitor:
             env = vec_env_class([lambda: wrapper_class(Monitor(gym.make(env_id)))] * n_envs)
         else:
             env = vec_env_class([lambda: wrapper_class(gym.make(env_id))] * n_envs)
     return env
Example No. 23
        def _init():
            if isinstance(env_id, str):
                env = gym.make(env_id, **env_kwargs)
            else:
                env = env_id(**env_kwargs)
            if seed is not None:
                env.seed(seed + rank)
                env.action_space.seed(seed + rank)

            # Hide the score
            env = HideScore(env)

            # Wrap the env in a Monitor wrapper
            # to have additional training information
            monitor_path = os.path.join(
                monitor_dir, str(rank)) if monitor_dir is not None else None
            # Create the monitor folder if needed
            if monitor_path is not None:
                os.makedirs(monitor_dir, exist_ok=True)
            env = Monitor(env, filename=monitor_path)

            # Optionally, wrap the environment with the provided wrapper
            if wrapper_class is not None:
                env = wrapper_class(env)
            return env
Example No. 24
    def create(self, n_envs=1):
        """Create the agent"""
        self.env = self.agent_helper.env
        log_dir = self.agent_helper.config_dir
        os.makedirs(log_dir, exist_ok=True)
        self.env = Monitor(self.env, log_dir)
        # TODO: Create the DDPG policy and define its hyperparameters here,
        # including the action space and observation space.
        # add policy
        policy_name = self.agent_helper.config['policy']
        self.policy = eval(policy_name)
        # action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
        n_actions = int(self.agent_helper.env.action_space.shape[0])
        action_noise = NormalActionNoise(
            mean=np.zeros(n_actions),
            sigma=self.agent_helper.config['rand_sigma'] * np.ones(n_actions))

        #FIXME: test:
        # self.model = DDPG("MlpPolicy", self.env, action_noise=action_noise, verbose=1, tensorboard_log=self.agent_helper.graph_path)

        # TODO: fix the observation space and action space later. Test whether the observation space input and the output action space are correct.
        # activ_function_name = self.agent_helper.config['nn_activ']
        # activ_function = eval(activ_function_name)

        # policy_kwargs = dict(activation_fn=activ_function,
        #              net_arch=[dict(pi=[32, 32], qf=[32, 32])])
        policy_kwargs = dict(net_arch=self.agent_helper.config['layers'])

        self.model = OffPolicyAlgorithm(
            self.policy,
            self.env,
            learning_rate=self.agent_helper.config['learning_rate'],
            buffer_size=self.agent_helper.config['buffer_size'],
            batch_size=self.agent_helper.config['batch_size'],
            tau=self.agent_helper.config['tau'],
            gamma=self.agent_helper.config['gamma'],
            gradient_steps=self.agent_helper.config['gradient_steps'],
            action_noise=action_noise,
            optimize_memory_usage=self.agent_helper.config['optimize_memory_usage'],
            create_eval_env=self.agent_helper.config['create_eval_env'],
            policy_kwargs=policy_kwargs,
            verbose=self.agent_helper.config['verbose'],
            learning_starts=self.agent_helper.config['learning_starts'],
            tensorboard_log=self.agent_helper.graph_path,
            seed=self.agent_helper.seed)
        pass
Example No. 25
def test(model, test_images):
    test_env = Monitor(
        PuzzleEnv(images=test_images,
                  img_size=IMG_SIZE,
                  channel_num=CHANNEL_NUM,
                  puzzle_size=(3, 3),
                  puzzle_type="switch",
                  dist_type="manhattan",
                  penalty_for_step=-0.2,
                  reward_for_completiton=20,
                  positive_reward_coefficient=1.0,
                  obs_conf=OBS_CONF))

    solutions = []
    rews = []
    steps = []
    sample = len(test_images)
    errors = 0

    for iter in range(sample):
        i = 0
        done = False
        obs = test_env.reset()
        frames = [obs]

        while not done:
            i += 1
            action, _states = model.predict(obs)
            obs, rewards, done, info = test_env.step(action)
            frames.append(obs)
            rews.append(rewards)

            if i == 10000:
                errors += 1
                break

        solutions.append(frames)
        done = False
        print(i, sum(rews), rews)
        rews = []
        steps.append(i)

    print('Average steps taken:  ', sum(steps) / sample)
    print('Median of steps taken: ', statistics.median(steps))
    print('Number of errors: ', errors)
    plt.hist(steps, bins=9)
    plt.savefig('fig.png')
Example No. 26
def test_vec_monitor_warn():
    env = DummyVecEnv([lambda: Monitor(gym.make("CartPole-v1"))])
    # We should warn the user when the env is already wrapped with a Monitor wrapper
    with pytest.warns(UserWarning):
        VecMonitor(env)

    with pytest.warns(UserWarning):
        VecMonitor(VecNormalize(env))
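
The warning is raised because episode statistics would otherwise be recorded twice; the usual fix is to monitor at exactly one level, e.g. only the vectorized env:

# Sketch: monitor only once, at the VecEnv level.
import gym
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor

env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
env = VecMonitor(env, filename="./logs/vec_monitor")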
Example No. 27
    def _thunk():
        if env_id.startswith("dm"):
            _, domain, task = env_id.split('.')
            env = dmc2gym.make(domain_name=domain, task_name=task)
            env = ClipAction(env)
        elif env_id.startswith("rrc"):
            _, ac_type, ac_wrapper = env_id.split('.')
            ts_relative, sa_relative = False, False
            scaled_ac, task_space = False, False
            if ac_wrapper.split('-')[0] == 'task':
                task_space = True
                ts_relative = ac_wrapper.split('-')[-1] == 'rel'
            elif ac_wrapper.split('-')[0] == 'scaled':
                scaled_ac = True
                sa_relative = ac_wrapper.split('-')[-1] == 'rel'
            env = rrc_utils.build_env_fn(
                    action_type=ac_type, initializer=None, scaled_ac=scaled_ac,
                    task_space=task_space, sa_relative=sa_relative,
                    ts_relative=ts_relative, goal_relative=True,
                    rew_fn='step')()
        else:
            env = gym.make(env_id)

        is_atari = hasattr(gym.envs, 'atari') and isinstance(
            env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
        if is_atari:
            env = NoopResetEnv(env, noop_max=30)
            env = MaxAndSkipEnv(env, skip=4)

        env.seed(seed + rank)

        if str(env.__class__.__name__).find('TimeLimit') >= 0:
            env = TimeLimitMask(env)

        if log_dir is not None:
            env = Monitor(env,
                          os.path.join(log_dir, str(rank)),
                          allow_early_resets=allow_early_resets)

        if is_atari:
            if len(env.observation_space.shape) == 3:
                env = EpisodicLifeEnv(env)
                if "FIRE" in env.unwrapped.get_action_meanings():
                    env = FireResetEnv(env)
                env = WarpFrame(env, width=84, height=84)
                env = ClipRewardEnv(env)
        elif len(env.observation_space.shape) == 3:
            raise NotImplementedError(
                "CNN models work only for atari,\n"
                "please use a custom wrapper for a custom pixel input env.\n"
                "See wrap_deepmind for an example.")

        # If the input has shape (W,H,3), wrap for PyTorch convolutions
        obs_shape = env.observation_space.shape
        if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
            env = TransposeImage(env, op=[2, 0, 1])

        return env
Example No. 28
 def _inner() -> gym.Env:
     env = gym.make(ENV_NAME, verbose=0)
     env.seed(seed)
     if not is_eval:
         env = Monitor(env, run_dir)
     env = GrayScaleObservation(env, keep_dim=True)
     if frame_skip > 0:
         env = MaxAndSkipEnv(env, skip=frame_skip)
     return env
Example No. 29
 def load(self, name: str, env, replace_parameters=None):
     self.log_dir = "ppo_cnn/" + str(datetime.datetime.now()).replace(
         ":", "-")
     os.makedirs(self.log_dir, exist_ok=True)
     monitor_env = Monitor(env, self.log_dir, allow_early_resets=True)
     vec_env = DummyVecEnv([lambda: monitor_env])
     self.model = PPO.load(name,
                           env=vec_env,
                           custom_objects=replace_parameters)
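
A hypothetical call to this load method; the replace_parameters argument is forwarded to PPO.load's custom_objects, which overrides saved attributes (e.g. schedules) when loading. The agent instance and my_env are assumed to exist:

# Hypothetical usage; `agent` is an instance of the class defining load() above.
agent.load("ppo_cnn_model.zip", my_env,
           replace_parameters={"learning_rate": 1e-4, "clip_range": 0.2})
agent.model.learn(total_timesteps=100000)   # continue training with the reloaded model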
Example No. 30
    def _init():
        set_random_seed(seed + rank)
        env = gym.make(env_id, **env_kwargs)

        # Wrap first with a monitor (e.g. for Atari env where reward clipping is used)
        log_file = os.path.join(log_dir,
                                str(rank)) if log_dir is not None else None
        # Monitor success rate too for the real robot
        info_keywords = ('is_success', ) if 'NeckEnv' in env_id else ()
        env = Monitor(env, log_file, info_keywords=info_keywords)

        # Dict observation space is currently not supported.
        # https://github.com/hill-a/stable-baselines/issues/321
        # We allow a Gym env wrapper (a subclass of gym.Wrapper)
        if wrapper_class:
            env = wrapper_class(env)

        env.seed(seed + rank)
        return env
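
Because is_success is passed as an info keyword for NeckEnv-style envs, the per-episode success flag ends up in the monitor CSV written for each rank. A sketch of computing a success rate from those logs afterwards; the log directory is an assumption:

from stable_baselines3.common.monitor import load_results

df = load_results("./logs/")
if "is_success" in df.columns:
    print("success rate:", df["is_success"].mean())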