Example #1
    def __init__(self,
                 config,
                 prior,
                 full_posterior=False,
                 load_model=True,
                 posterior_samples=25,
                 verbose=True):
        self.n_regions = config['n_regions']
        self.n_products = config['n_products']
        self.n_temporal_features = config['n_temporal_features']
        self.adj_mtx = config['adj_mtx']
        self.model_type = config['model_type']
        self.prior = prior
        self.full_posterior = full_posterior
        self.env_model = None
        self.trace = None
        self.posterior_samples = posterior_samples
        self.max_rollouts = 3  # 90 day rollouts
        self.sales = []
        self.seed()
        self.viewer = None
        self.state = None
        self.n_actions = 1 + self.n_regions * self.n_products * 2
        self.cost = config["cost"]
        self.log_linear = config["log_linear"]
        self.verbose = verbose

        self._load_data(config['model_path'], config['train_data'], load_model)
        self.sample_index = np.arange(self.feature_shape[0])

        self.cnt_reward_not_reduce_round = 0
        self.time_step_cntr = 0
        self.max_cnt_reward_not_reduce_round = self.metadata[
            'max_cnt_reward_not_reduce_round']

        observation_shape = list(self.feature_shape)
        observation_shape[-1] = observation_shape[-1] + 1
        observation_shape = tuple(observation_shape)

        self.action_space = spaces.Discrete(self.n_actions)
        self.observation_space = spaces.Dict({
            "day_vec": gym.spaces.MultiBinary(7),
            "board_config": spaces.Box(low=-2,
                                       high=1,
                                       shape=(self.n_regions, self.n_products),
                                       dtype=np.int8),
            "prev_sales": spaces.Box(low=0, high=5, shape=(1, 6), dtype=np.int8)
        })
        self.action_map = self.build_action_map()
        self.sales_quantiles = self.get_sales_quantiles(
            get_store_id(config["train_data"]))
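A minimal sketch of the config dict this constructor expects, based only on the keys it reads above; every value is an illustrative placeholder rather than the project's real settings (numpy is assumed imported as np, as in the snippet):

# Hypothetical config for AllocationEnv.__init__ -- keys mirror the accesses above,
# values are placeholders only.
example_config = {
    "n_regions": 4,
    "n_products": 6,
    "n_temporal_features": 7,
    "adj_mtx": np.eye(4),                # region adjacency matrix
    "model_type": "hierarchical",        # placeholder model name
    "cost": 0.5,
    "log_linear": False,
    "model_path": "../data/store-1-env-model.p",  # placeholder path
    "train_data": "store-1-train.csv",
}
# env = AllocationEnv(config=example_config, prior=Prior(config=example_config))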
Example #2
def main(args):

    store_id = get_store_id(cfg.vals["train_data"])
    hyp = {
        "epochs": args.epochs,
        "rollout batch size": args.rollout_batch_size,
        "parameter updates": args.epochs * args.rollout_batch_size,
        "rollouts": args.rollouts,
        "lambda": args.lmbda,
        "batch size": args.batch_size,
        "posterior samples": args.posterior_samples,
        "episode length": cfg.vals["episode_len"],
        "n simulations": args.eval_eps,
        "store": store_id,
        "eps": args.eps
    }

    logger = Logger(hyp, "./results/", "pc_mopo")
    logger.write()

    prior = Prior(config=cfg.vals)
    env_model = AllocationEnv(config=cfg.vals,
                              prior=prior,
                              load_model=True,
                              full_posterior=True,
                              posterior_samples=args.posterior_samples,
                              verbose=False)

    policy = DQN(MlpPolicy, env_model, batch_size=args.batch_size)

    mopo_dqn = Mopo(
        policy=policy,
        env_model=env_model,
        rollout_batch_size=args.rollout_batch_size,
        epochs=args.epochs,
        rollout=args.rollouts,
        n_actions=env_model.n_actions,
        lmbda=args.lmbda,
        buffer_path=f"../data/{store_id}-buffer-d-trn.p",
        # buffer_path=None
        eps=args.eps)

    mopo_dqn.learn()

    if os.path.exists(f"./models/{store_id}-{args.file_name}"):
        os.remove(f"./models/{store_id}-{args.file_name}")
    mopo_dqn.policy.save(f"./models/{store_id}-{args.file_name}")
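A rough sketch of the CLI wiring this script would need, inferred from the args attributes used above (argparse is imported elsewhere in this project, see Example #5); the flag names map to those attributes and the defaults are illustrative only:

# Hypothetical argument parser for the MOPO training script.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=50)
    parser.add_argument("--rollout-batch-size", type=int, default=100)
    parser.add_argument("--rollouts", type=int, default=25)
    parser.add_argument("--lmbda", type=float, default=1.0)
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--posterior-samples", type=int, default=25)
    parser.add_argument("--eval-eps", type=int, default=10)
    parser.add_argument("--eps", type=float, default=0.1)
    parser.add_argument("--file-name", type=str, default="mopo-dqn.p")
    main(parser.parse_args())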
Example #3
def main():
    data_dir = "../data/"  # renamed from `dir` to avoid shadowing the builtin
    files = ["store-1-train.csv", "store-2-train.csv"]

    for f in files:
        store_id = get_store_id(f)

        data = pd.read_csv(data_dir + f)

        # total sales per date, summed over all rows for that date
        sales = data[["date", "sales"]].groupby("date").sum()

        # 20th/40th/60th/80th/100th percentiles of daily sales
        quantiles = np.quantile(sales["sales"].values, [.2, .4, .6, .8, 1.0])

        with open(data_dir + f"{store_id}-quantiles.txt", "w") as fp:
            for q in quantiles:
                fp.write("{:.4f}\n".format(q))
Example #4
def main(args):
    store_id = get_store_id(cfg.vals["train_data"])
    hyp = {
        "learning timesteps": args.epochs,
        "episode length": cfg.vals["episode_len"],
        "store": store_id
    }

    logger = Logger(hyp, "./results/", "off_policy_dqn")

    with open(f"../data/{store_id}-buffer-d-trn.p", 'rb') as f:
        buffer_env = pickle.load(f)

    simulator = get_simple_simulator(cfg.vals)
    model = DQN(MlpPolicy, simulator, verbose=2)
    model.learn_off_policy(total_timesteps=args.epochs, buffer=buffer_env)
    model.save(f"./models/{store_id}-{args.file_name}")

    logger.write()
Example #5
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import config.config as cfg
import numpy as np
import argparse

from evaluators.rejection_sampler import PSRS, MLPPolicy, BCQPolicy
from evaluators.mlp_env_model import MLPClassifer
from utils import get_store_id, get_action_space

from policies.deepq.dqn import DQN
from offpolicy.bcq import BCQ

store_id = get_store_id(cfg.vals["train_data"])
buffer_path = f"../data/{store_id}-buffer-d-test.p"


def eval_dqn(policy, args):

    action_space = get_action_space(cfg.vals["n_regions"],
                                    cfg.vals["n_products"])
    model_path = f"../data/{store_id}-env_policy.pt"

    env_policy = MLPPolicy(action_space, buffer_path, model_path)

    psrs = PSRS(buffer_path, policy, env_policy, action_space,
                cfg.vals["n_regions"], cfg.vals["n_products"], args.epochs)
    r = psrs.evaluate()
    mean = np.mean(r)
    sigma = np.std(r)
Example #6
def main(args):
    store_id = get_store_id(cfg.vals["train_data"])

    policy = DQN.load(f"./models/{store_id}-off-policy-dqn.p")
    simulator = get_simple_simulator(cfg.vals)
    reward, sigma = evaluate_policy(policy, simulator, args.eval_eps)
Example #7
def main(args):
    store_id = get_store_id(cfg.vals["train_data"])
    hyp = {
        "episode length": cfg.vals["episode_len"],
        "n simulations": args.eval_eps,
        "store": store_id,
        "iterations:": args.iterations,
        "batch size": args.batch_size,
        "discount": args.discount,
        "tau": args.tau,
        "actor_lr": args.actor_lr,
        "critic_lr": args.critic_lr,
        "vae_lr": args.vae_lr,
        "actor_hs": args.actor_hs,
        "critic_hs": args.critic_hs,
        "dqda_clip": args.dqda_clip,
        "clip_norm": args.clip_norm
    }

    logger = Logger(hyp, "./results/", "bcq")

    if args.actor_hs <= 0:
        actor_hs_list = [64, 64]
    else:
        actor_hs_list = [args.actor_hs] * 2
    if args.critic_hs <= 0:
        critic_hs_list = [64, 64]
    else:
        critic_hs_list = [args.critic_hs] * 2

    file_name = "BCQ_%s_%s" % (args.env_name, str(args.seed))
    buffer_name = "%s_%s_%s" % (args.buffer_type, args.env_name, str(
        args.seed))
    print("---------------------------------------")
    print("Settings: " + file_name)
    print("---------------------------------------")

    if not os.path.exists("./results"):
        os.makedirs("./results")

    #prior = Prior(config=cfg.vals)
    #env = AllocationEnv(config=cfg.vals, prior=prior, load_model=True)
    env = get_simple_simulator(cfg.vals)
    n_actions = env.n_actions
    #env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run

    tf.set_random_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = observation_input(env.observation_space,
                                  batch_size=None,
                                  name='Ob',
                                  scale=False,
                                  reuse=tf.AUTO_REUSE)[0].shape[1].value
    action_dim = n_actions

    with tf.Session() as sess:
        # Initialize policy
        policy = BCQ(state_dim,
                     action_dim,
                     sess,
                     args.tau,
                     actor_hs=actor_hs_list,
                     actor_lr=args.actor_lr,
                     critic_hs=critic_hs_list,
                     critic_lr=args.critic_lr,
                     dqda_clipping=args.dqda_clip,
                     clip_norm=bool(args.clip_norm),
                     vae_lr=args.vae_lr)

        # Load buffer
        with open(f"../data/{store_id}-buffer-d-trn.p", 'rb') as f:
            replay_buffer = pickle.load(f)

        #evaluations = []

        #episode_num = 0
        #done = True

        stats_loss = policy.train(replay_buffer,
                                  iterations=args.iterations,
                                  batch_size=args.batch_size,
                                  discount=args.discount)

        print("Training iterations: " + str(args.iterations))
        # print(stats_loss)

        # Save final policy
        if os.path.exists(f"./models/{store_id}-{args.file_name}"):
            os.remove(f"./models/{store_id}-{args.file_name}")
        policy.save(f"{store_id}-{args.file_name}", directory="./models")

        logger.write()
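A rough sketch of the CLI arguments this BCQ script consumes, inferred from the args attributes used above; flag names and defaults are assumptions, not the project's actual interface:

# Hypothetical argument parser for the BCQ training script.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--env-name", type=str, default="AllocationEnv")
    parser.add_argument("--buffer-type", type=str, default="d")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--iterations", type=int, default=10000)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--discount", type=float, default=0.99)
    parser.add_argument("--tau", type=float, default=0.005)
    parser.add_argument("--actor-lr", type=float, default=1e-3)
    parser.add_argument("--critic-lr", type=float, default=1e-3)
    parser.add_argument("--vae-lr", type=float, default=1e-3)
    parser.add_argument("--actor-hs", type=int, default=0)   # <= 0 falls back to [64, 64]
    parser.add_argument("--critic-hs", type=int, default=0)  # <= 0 falls back to [64, 64]
    parser.add_argument("--dqda-clip", type=float, default=None)
    parser.add_argument("--clip-norm", type=int, default=0)
    parser.add_argument("--eval-eps", type=int, default=10)
    parser.add_argument("--file-name", type=str, default="bcq.p")
    main(parser.parse_args())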