Example #1
    def objective(trial):
        # copy to preserve original params
        _params = params.copy()
        _params['hyper_params'] = HYPERPARAMS_SAMPLER[args.algorithm.lower()](trial)

        # network architecture
        net_arch = trial.suggest_categorical('net_arch', ['8x8', '16x16', '32x32'])
        layers = map(int, net_arch.split('x'))
        policy_kwargs = dict(act_fun=tf.nn.relu, net_arch=list(layers))

        print(f'*** beginning trial {trial.number}')
        print('\thyper-parameters:')
        for param, value in _params['hyper_params'].items():
            print(f'\t\t{param}:{value}')
        print(f'\t\tnet_arch: {net_arch}')

        _params['save_dir'] = _params['save_dir'] / 'optimizer'

        try:
            # purge any previously saved models
            purge_model(_params, args, interactive=False)

            ######################################################
            # learning phase - on possibly multiple environments #
            ######################################################
            godot_instances = [GodotInstance(o_port, a_port) for o_port, a_port in
                               get_godot_instances(args.n_godot_instances)]
            env = create_env(args, env_id, godot_instances, _params, session_path)
            env = VecCheckNan(env, warn_once=False, raise_exception=True)

            # learn and save model
            model = init_model(session_path, _params, env, args, policy_kwargs=policy_kwargs)
            learn(env, model, _params, args, session_path)
            env.close()

            ##########################################################################
            # evaluation phase - single environment (deterministic action selection) #
            ##########################################################################
            env = create_env(args, env_id, [GODOT_EVAL_INSTANCE], _params, session_path, eval=True)
            env = VecCheckNan(env, warn_once=False, raise_exception=True)

            # load the previously learned model and evaluate it
            model = init_model(session_path, _params, env, args, eval=True)
            mean_reward, _ = evaluate(model, env, args, n_episodes=n_episodes_per_eval)
            env.close()

        except (AssertionError, ValueError) as e:
            print(f'pruning optimizer trial {trial.number} due to exception: {e}')
            raise optuna.exceptions.TrialPruned()

        # optuna minimizes the objective by default, so we need to flip the sign to maximize
        cost = -1 * mean_reward
        return cost
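A minimal sketch of how this objective would typically be handed to Optuna; the pruner choice and trial count below are illustrative, not taken from the original script:

import optuna

# create a study and run the objective defined above; Optuna minimizes by default,
# which matches the negated mean reward returned by objective()
study = optuna.create_study(pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=50)
print(f'best cost: {study.best_value}')
print(f'best hyper-parameters: {study.best_params}')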
Example #2
    def _init_environment(self, datapath, window_size):

        df = pd.read_csv(datapath)
        # bid/ask price columns repeat every 20 columns in the order-book dump
        bid_price_columns = [i for i in range(1, len(df.columns), 20)]
        print(bid_price_columns)
        ask_price_columns = [i for i in range(3, len(df.columns), 20)]
        bidPrices = df[df.columns[bid_price_columns]]
        askPrices = df[df.columns[ask_price_columns]].copy()
        askPrices.columns = bidPrices.columns  # align labels so bid and ask average element-wise below
        df_concat = pd.concat([bidPrices, askPrices])
        midPrices = df_concat.groupby(df_concat.index).mean().transpose().values[-len(self.securities):]
        print(midPrices[:, 0])

        self.env = DummyVecEnv([lambda: securities_trading_env(np.array(midPrices).T)])
        self.env = VecCheckNan(self.env, raise_exception=True)

        n_actions = self.env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
        print(n_actions)

        if self.policy == "DDPG":
            self.model = DDPG(ddpgMlpPolicy, self.env, verbose=int(self.verbose),
                              param_noise=param_noise, action_noise=action_noise)
        elif self.policy == "TD3":
            self.model = TD3(td3MlpPolicy, self.env, verbose=int(self.verbose))
        elif self.policy == "GAIL":
            # NOTE: this branch currently falls back to TD3 rather than instantiating a GAIL model
            self.model = TD3(td3MlpPolicy, self.env, verbose=int(self.verbose))
        else:
            self.model = PPO2(MlpLnLstmPolicy, self.env, verbose=int(self.verbose))

        if self.load:  # load a previously saved model (`modelpath` is assumed to be defined elsewhere)
            self.model = self.model.load("save/" + modelpath + ".h5")

        # wrap the trained model and environment in the trading Agent
        self.gym_model = Agent(market_event_securities, market_event_queue, securities, queue, host,
                               policy, strategy, cash_balance, self.model, self.env, window_size, self.inventory)
Example #3
def test_check_nan():
    """Test VecCheckNan Object"""

    env = DummyVecEnv([NanAndInfEnv])
    env = VecCheckNan(env, raise_exception=True)

    env.step([[0]])  # a valid action: no NaN/inf anywhere, no exception expected

    # NaN and inf in the action itself must raise
    try:
        env.step([[float('NaN')]])
    except ValueError:
        pass
    else:
        assert False

    try:
        env.step([[float('inf')]])
    except ValueError:
        pass
    else:
        assert False

    # non-zero actions make NanAndInfEnv emit invalid observations (NaN/inf), which must also raise
    try:
        env.step([[-1]])
    except ValueError:
        pass
    else:
        assert False

    try:
        env.step([[1]])
    except ValueError:
        pass
    else:
        assert False
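The same four failure cases read more compactly with pytest.raises; an equivalent sketch, assuming pytest is installed:

import pytest


def test_check_nan_compact():
    """Same checks as above, using pytest.raises instead of try/except/else."""
    env = VecCheckNan(DummyVecEnv([NanAndInfEnv]), raise_exception=True)
    env.step([[0]])  # valid action, no exception expected

    for bad_action in ([[float('nan')]], [[float('inf')]], [[-1]], [[1]]):
        with pytest.raises(ValueError):
            env.step(bad_action)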
Example #4
def main():
    save_path = args.checkpoint_dir + args.policy + "/" + args.policy
    env = gym.make("SegmentationEnv-v0", 
        objs_dir=args.objs_dir, 
        max_scenes=args.max_scenes,
        sample_size=args.sample_size,
        diff_punishment=args.diff_punishment,
        max_steps_per_scene=args.max_steps_per_scene,
        scene_mode=args.scene_mode,
        training=False,
        point_mode=args.point_mode,
        voxel_size=args.voxel_size,
        voxel_mode=args.voxel_mode,
        single_scenes=args.single_scenes,
        early_diff=args.early_diff)
        
    env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
    env = VecCheckNan(env, raise_exception=True)
    
    model = PPO2.load(save_path, env=env)
    
    n_episodes = 10
    for i in range(n_episodes):
        total_reward = 0
        obs = env.reset()
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            total_reward += reward
            if done: 
                print("Total Reward: ", total_reward)
                break
    
    env.close()
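The manual rollout loop above can also be replaced with the evaluation helper bundled with stable-baselines; a sketch, assuming a stable-baselines version that ships evaluate_policy (>= 2.10):

from stable_baselines.common.evaluation import evaluate_policy

# mean and standard deviation of the per-episode reward over 10 deterministic episodes
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, deterministic=True)
print("Mean reward: {:.2f} +/- {:.2f}".format(mean_reward, std_reward))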
Example #5
    def make_env(n_envs=1,
                 normalize=True,
                 multiprocess=False,
                 log_dir_env=None):
        """
        Initializes an OpenAI Gym environment for training and evaluation.

        :param n_envs: Number of parallel environments to initialize
        :param normalize: Whether to normalize observations with VecNormalize
        :param multiprocess: Use multiprocessing via SubprocVecEnv instead of DummyVecEnv. Not recommended.
        :param log_dir_env: Parent directory of the environments' log directories
        :return: The wrapped (and optionally normalized) vectorized environment
        """
        def init_env(log_dir_env):
            if log_dir_env is None:
                log_dir_env = os.path.join(log_dir, "env_direct")
                os.makedirs(log_dir_env, exist_ok=True)
            log_dir_single = os.path.join(log_dir_env, str(uuid.uuid4()))
            env = gym.make('AtcEnv-v0')
            env = TimeLimit(env, 8000)
            os.makedirs(log_dir_single, exist_ok=True)
            env = Monitor(env, log_dir_single, allow_early_resets=True)
            return env

        if multiprocess:
            env = SubprocVecEnv(
                [lambda: init_env(log_dir_env) for i in range(n_envs)])
        else:
            env = DummyVecEnv(
                [lambda: init_env(log_dir_env) for i in range(n_envs)])

        env = VecCheckNan(env, raise_exception=True)

        if normalize:
            env = VecNormalize(env,
                               norm_obs=True,
                               norm_reward=False,
                               clip_obs=10.)

        return env
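A usage sketch for make_env; the argument values are illustrative:

# four monitored, NaN-checked copies of AtcEnv-v0 with observation normalization
env = make_env(n_envs=4, normalize=True, multiprocess=False)

Note the wrapping order: VecCheckNan sits inside VecNormalize, so the NaN/inf check runs on the raw, un-normalized observations.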
Example #6
def _check_nan(env: gym.Env) -> None:
    """Check for Inf and NaN using the VecWrapper."""
    vec_env = VecCheckNan(DummyVecEnv([lambda: env]))
    for _ in range(10):
        action = [env.action_space.sample()]
        _, _, _, _ = vec_env.step(action)
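A usage sketch for _check_nan; the environment id is illustrative, and resetting the raw env first gives it a valid internal state before the unchecked steps:

env = gym.make("Pendulum-v0")  # any continuous-control env; the id depends on the installed gym version
env.reset()
_check_nan(env)  # warns (or raises, if VecCheckNan were built with raise_exception=True) on NaN/inf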
Example #7
            ghz_state = StandardGHZStateQ(N)
        else:
            ghz_state = StandardGHZState(N)
        env = QubitEnv(N=N,
                       V=C6,
                       geometry=geometry,
                       t_list=np.linspace(0, t, t_num),
                       Omega_range=OMEGA_RANGE,
                       Delta_range=DELTA_RANGE,
                       ghz_state=ghz_state,
                       verbose=ENV_VERBOSE)
        return env

    generating_envs_start_time = time.time()
    env = EnvType([lambda: make_gym_env() for i in range(n_envs)])
    env = VecCheckNan(env, raise_exception=True)
    generating_envs_end_time = time.time()
    print(
        f"Generated {n_envs} envs in {generating_envs_end_time - generating_envs_start_time:.3f}s"
    )

    model = PPO2(
        MlpLstmPolicy,
        env,
        learning_rate=LEARNING_RATE,
        verbose=1,
        nminibatches=1,
        n_steps=t_num,
        tensorboard_log='./tensorboard_logs',
        # ent_coef=0.05
    )
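A sketch of how training would typically continue from here; the timestep budget and save path are placeholders:

model.learn(total_timesteps=n_envs * t_num * 1000)  # illustrative budget
model.save('./models/ppo2_qubit_ghz')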
Example #8
def main():
    global save_path, log_dir, model, best_mean_reward
    mk_dir(args.checkpoint_dir + args.policy)
    save_path = args.checkpoint_dir + args.policy + "/" + args.policy
    log_dir = args.summary_dir + args.policy
    mk_dir(log_dir)
    env = gym.make("SegmentationEnv-v0",
                   objs_dir=args.objs_dir,
                   max_scenes=args.max_scenes,
                   sample_size=args.sample_size,
                   diff_punishment=args.diff_punishment,
                   max_steps_per_scene=args.max_steps_per_scene,
                   scene_mode=args.scene_mode,
                   point_mode=args.point_mode,
                   voxel_size=args.voxel_size,
                   voxel_mode=args.voxel_mode,
                   single_scenes=args.single_scenes,
                   early_diff=args.early_diff,
                   wall_weight=args.wall_weight)
    env = Monitor(env, log_dir, allow_early_resets=True)

    env = DummyVecEnv([
        lambda: env
    ])  # The algorithms require a vectorized environment to run
    env = VecCheckNan(env, raise_exception=True)

    net_module = importlib.import_module(args.policy)
    model = PPO2(net_module.Policy,
                 env,
                 verbose=args.verbose,
                 tensorboard_log=log_dir,
                 learning_rate=args.learning_rate,
                 ent_coef=args.ent_coef,
                 cliprange=args.cliprange,
                 cliprange_vf=args.cliprange_vf,
                 lam=args.lam,
                 gamma=args.gamma,
                 seed=args.seed,
                 n_cpu_tf_sess=args.n_cpu_tf_sess,
                 noptepochs=args.noptepochs,
                 nminibatches=args.nminibatches,
                 n_steps=args.n_steps,
                 max_grad_norm=args.max_grad_norm)

    if os.path.isfile("expert_trajectories.npz") and args.pretrain == 1:
        print("------------start pretrain------------")
        #dataset = ExpertDataset(expert_path="expert_trajectories.npz", special_shape=True, traj_limitation=100, batch_size=16)
        dataset = ExpertDataset(expert_path="expert_trajectories.npz",
                                special_shape=True,
                                train_fraction=args.train_fraction,
                                batch_size=args.pretrain_batch_size)
        #model.pretrain(dataset, learning_rate=0.001, n_epochs=1000)
        model = model.pretrain(dataset,
                               val_interval=1,
                               learning_rate=args.pretrain_learning_rate,
                               n_epochs=args.pretrain_n_epochs)
        print("pretrain finished -- save model")
        model.save(save_path)
        returns = []

        print("Calculate mean reward")
        n_episodes = 10
        for i in range(n_episodes):
            total_reward = 0
            obs = env.reset()
            while True:
                action, _states = model.predict(obs, deterministic=True)
                obs, reward, done, info = env.step(action)
                total_reward += reward
                if done:
                    returns.append(total_reward)
                    break
        returns = np.array(returns)
        best_mean_reward = np.mean(returns)
        print("Best mean reward: {:.2f}".format(best_mean_reward))

    model.learn(total_timesteps=args.total_timesteps, callback=callback)
    env.close()
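The callback passed to model.learn is not shown in this snippet; one plausible shape for it, following the usual stable-baselines Monitor-log pattern (the check interval and the default best reward are assumptions):

import numpy as np
from stable_baselines.results_plotter import load_results, ts2xy

n_calls = 0
best_mean_reward = float('-inf')  # default; main() may overwrite this after pretraining

def callback(_locals, _globals):
    """Save the model whenever the recent mean episode reward improves."""
    global n_calls, best_mean_reward
    n_calls += 1
    if n_calls % 1000 == 0:
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(y) > 0:
            mean_reward = np.mean(y[-100:])
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                _locals['self'].save(save_path)
    return True  # returning False would stop training early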
Example #9
MAX_FOR_BLOCKS = 230
botToUse = sys.argv[1]
# oldLog = sys.argv[2]
oldLog = "{\"log\":[{\"PayloadRolled\":{\"from\":1,\"block\":1}},{\"PayloadPlaced\":{\"from\":1,\"block\":1,\"orientation\":0,\"x\":0,\"y\":0}},{\"PayloadRolled\":{\"from\":0,\"block\":1}},{\"PayloadPlaced\":{\"from\":0,\"block\":1,\"orientation\":0,\"x\":8,\"y\":0}},{\"PayloadRolled\":{\"from\":1,\"block\":1}},{\"PayloadPlaced\":{\"from\":1,\"block\":1,\"orientation\":0,\"x\":2,\"y\":0}},{\"PayloadRolled\":{\"from\":0,\"block\":0}},{\"PayloadPlaced\":{\"from\":0,\"block\":0,\"orientation\":1,\"x\":3,\"y\":2}},{\"PayloadRolled\":{\"from\":1,\"block\":0}},{\"PayloadPlaced\":{\"from\":1,\"block\":0,\"orientation\":1,\"x\":3,\"y\":6}},{\"PayloadRolled\":{\"from\":0,\"block\":2}},{\"PayloadConsidering\":{\"play_index\":0}}]}"

is_box_space = True
dirname = "D:\\4-System\\rusty\\"
filename = "50000_heutistic_pretrain_"
filename += "box" if is_box_space else "discrete"
np.seterr(all='raise')

origEnv = gym.make("rustybox-v0" if is_box_space else "rustydiscrete-v0")

origEnv.max_invalid_tries = 7
env = VecCheckNan(DummyVecEnv([lambda: origEnv]))

# Instantiate the agent
model = PPO2.load("models/ppo2boxbestparam/2e4-30.pkl", env=env)
# model.load("models/pretrain/"+filename)

rustLib.field_restore_log(origEnv.field, oldLog.encode('utf-8'))
obs = field_to_array(origEnv.field)
actions, _states = model.predict(obs)
if is_box_space:
    action = 0
    current_max = -1
    for i in range(len(actions)):
        action_probability = actions[i]
        newIndex = MAX_FOR_BLOCKS + i
        action_possible = obs[newIndex // 10][newIndex % 10] == 1
Example #10
# Load Dataset
stocks_df = dth.load_data(config.data_path)

# make train, val, test df
train, val, test = dth.train_val_test_split(stocks_df)

# Training Env
train_env = DummyVecEnv([
    lambda: valueTradingEnv(df=train,
                            sample=config.trainsampling,
                            episodic=config.episodic,
                            yearrange=config.yearrange,
                            save_path=config.env_path.joinpath("train"))
    for i in range(config.num_envs)
])
train_env = VecCheckNan(train_env, raise_exception=True)

# Validation Env
val_env = DummyVecEnv([
    lambda: valueTradingEnv(df=val,
                            sample=False,
                            episodic=config.episodic,
                            yearrange=config.yearrange,
                            save_path=config.env_path.joinpath("val"))
    for i in range(config.num_envs)
])
val_env = VecCheckNan(val_env, raise_exception=True)

# test_env
test_env = DummyVecEnv([
    lambda: valueTradingEnv(df=test,
Example #11
    def single_run(self,
                   folder_path,
                   num_evals,
                   policy_kwargs=None,
                   is_baseline=False,
                   baseline_policy=None):
        # initialize cProfile
        profiler_object = cProfile.Profile()
        profiler_object.enable()

        config = configparser.ConfigParser()
        config.read('gym_config/config.ini')

        rl_time_steps = config.getint('rl', 'time_steps')
        ent_coef = config.getfloat('rl', 'ent_coef')
        n_steps = config.getint('rl', 'n_steps')
        nminibatches = config.getint('rl', 'nminibatches')
        noptepochs = config.getint('rl', 'noptepochs')
        learning_rate = config.getfloat('rl', 'learning_rate')
        time_steps = config.getint('garden', 'time_steps')
        step = config.getint('garden', 'step')
        num_plants_per_type = config.getint('garden', 'num_plants_per_type')
        num_plant_types = config.getint('garden', 'num_plant_types')
        garden_x = config.getint('garden', 'X')
        garden_y = config.getint('garden', 'Y')
        # The Z axis contains a matrix for every plant type plus one for water levels.
        garden_z = 2 * num_plant_types + 1
        sector_width = config.getint('garden', 'sector_width')
        sector_height = config.getint('garden', 'sector_height')
        action_low = config.getfloat('action', 'low')
        action_high = config.getfloat('action', 'high')
        obs_low = config.getint('obs', 'low')
        obs_high = config.getint('obs', 'high')

        env = gym.make(
            'simalphagarden-v0',
            wrapper_env=SimAlphaGardenWrapper(time_steps,
                                              garden_x,
                                              garden_y,
                                              sector_width,
                                              sector_height,
                                              num_plant_types,
                                              num_plants_per_type,
                                              step=step),
            garden_x=garden_x,
            garden_y=garden_y,
            garden_z=garden_z,
            sector_width=sector_width,
            sector_height=sector_height,
            action_low=action_low,
            action_high=action_high,
            obs_low=obs_low,
            obs_high=obs_high,
        )
        env = DummyVecEnv([lambda: env])
        # TODO: Normalize input features? VecNormalize
        env = VecCheckNan(env, raise_exception=False)

        if is_baseline:
            copyfile('gym_config/config.ini', folder_path + '/config.ini')

            # Evaluate baseline on 50 random environments of same parameters.
            self.evaluate_policy(folder_path,
                                 num_evals,
                                 env,
                                 garden_x,
                                 garden_y,
                                 sector_width,
                                 sector_height,
                                 is_baseline=True,
                                 baseline_policy=baseline_policy,
                                 step=1)

            # Graph evaluations
            self.graph_utils.graph_evaluations(folder_path, garden_x, garden_y,
                                               time_steps, step, num_evals,
                                               num_plant_types)
        else:
            pathlib.Path(folder_path + '/ppo_v2_tensorboard').mkdir(
                parents=True, exist_ok=True)
            # Instantiate the agent
            model = PPO2(CustomCnnPolicy,
                         env,
                         policy_kwargs=policy_kwargs,
                         ent_coef=ent_coef,
                         n_steps=n_steps,
                         nminibatches=nminibatches,
                         noptepochs=noptepochs,
                         learning_rate=learning_rate,
                         verbose=1,
                         tensorboard_log=folder_path + '/ppo_v2_tensorboard/')

            # model = PPO2(MlpPolicy, env, ent_coef=ent_coef, n_steps=n_steps, nminibatches=nminibatches, noptepochs=noptepochs, learning_rate=learning_rate, verbose=1, tensorboard_log=folder_path + '/ppo_v2_tensorboard/')
            # Train the agent. With raise_exception=True in VecCheckNan above, an invalid value
            # coming from the env would crash here with a message pointing at its origin;
            # with raise_exception=False it only warns.
            model.learn(total_timesteps=rl_time_steps)

            model.save(folder_path + '/model')

            copyfile('gym_config/config.ini', folder_path + '/config.ini')

            # Evaluate model on 50 random environments of same parameters.
            self.evaluate_policy(folder_path,
                                 num_evals,
                                 env,
                                 garden_x,
                                 garden_y,
                                 sector_width,
                                 sector_height,
                                 is_baseline=False)

            # Graph evaluations
            # self.graph_utils.graph_evaluations(folder_path, garden_x, garden_y, time_steps, step, num_evals, num_plant_types)

        profiler_object.disable()

        # dump the profiler stats
        s = io.StringIO()
        ps = pstats.Stats(profiler_object, stream=s).sort_stats('cumulative')
        pathlib.Path(folder_path + '/Timings').mkdir(parents=True,
                                                     exist_ok=True)
        ps.dump_stats(folder_path + '/Timings/dump.txt')

        # convert to a human-readable format
        with open(folder_path + '/Timings/time.txt', 'w') as out_stream:
            ps = pstats.Stats(folder_path + '/Timings/dump.txt', stream=out_stream)
            ps.strip_dirs().sort_stats('cumulative').print_stats()
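single_run expects a gym_config/config.ini containing the sections and keys read above; a sketch that writes a matching file (every value below is a placeholder, not a tuned setting):

import configparser
import os

config = configparser.ConfigParser()
config['rl'] = {
    'time_steps': '1000000', 'ent_coef': '0.01', 'n_steps': '128',
    'nminibatches': '4', 'noptepochs': '4', 'learning_rate': '0.00025',
}
config['garden'] = {
    'time_steps': '100', 'step': '1', 'num_plants_per_type': '1',
    'num_plant_types': '2', 'X': '10', 'Y': '10',
    'sector_width': '5', 'sector_height': '5',
}
config['action'] = {'low': '0.0', 'high': '1.0'}
config['obs'] = {'low': '0', 'high': '1000'}

os.makedirs('gym_config', exist_ok=True)
with open('gym_config/config.ini', 'w') as f:
    config.write(f)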