def __init__(self, config, prior, full_posterior=False, load_model=True,
             posterior_samples=25, verbose=True):
    # Problem dimensions and model configuration
    self.n_regions = config['n_regions']
    self.n_products = config['n_products']
    self.n_temporal_features = config['n_temporal_features']
    self.adj_mtx = config['adj_mtx']
    self.model_type = config['model_type']
    self.prior = prior
    self.full_posterior = full_posterior
    self.env_model = None
    self.trace = None
    self.posterior_samples = posterior_samples
    self.max_rollouts = 3  # 90 day rollouts
    self.sales = []
    self.seed()
    self.viewer = None
    self.state = None
    # One null action plus two actions per (region, product) pair
    self.n_actions = 1 + self.n_regions * self.n_products * 2
    self.cost = config["cost"]
    self.log_linear = config["log_linear"]
    self.verbose = verbose

    self._load_data(config['model_path'], config['train_data'], load_model)
    self.sample_index = np.arange(self.feature_shape[0])
    self.cnt_reward_not_reduce_round = 0
    self.time_step_cntr = 0
    self.max_cnt_reward_not_reduce_round = self.metadata['max_cnt_reward_not_reduce_round']

    observation_shape = list(self.feature_shape)
    observation_shape[-1] = observation_shape[-1] + 1
    observation_shape = tuple(observation_shape)

    # Discrete action space; dict observation space (day-of-week vector,
    # board configuration, previous sales)
    self.action_space = spaces.Discrete(self.n_actions)
    self.observation_space = spaces.Dict({
        "day_vec": gym.spaces.MultiBinary(7),
        "board_config": spaces.Box(low=-2, high=1,
                                   shape=(self.n_regions, self.n_products),
                                   dtype=np.int8),
        "prev_sales": spaces.Box(low=0, high=5, shape=(1, 6), dtype=np.int8)
    })
    self.action_map = self.build_action_map()
    self.sales_quantiles = self.get_sales_quantiles(get_store_id(config["train_data"]))
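# Illustrative usage sketch (not part of the class): build the environment and take one
# random step, assuming the standard gym reset()/step() interface. The import paths for
# AllocationEnv and Prior below are assumptions based on the surrounding scripts, not
# confirmed by the source.
import config.config as cfg
from envs.allocation_env import AllocationEnv  # assumed module path
from envs.prior import Prior                   # assumed module path

prior = Prior(config=cfg.vals)
env = AllocationEnv(config=cfg.vals, prior=prior, load_model=True)

obs = env.reset()  # dict with "day_vec", "board_config", and "prev_sales" keys
obs, reward, done, info = env.step(env.action_space.sample())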
def main(args):
    store_id = get_store_id(cfg.vals["train_data"])
    hyp = {
        "epochs": args.epochs,
        "rollout batch size": args.rollout_batch_size,
        "parameter updates": args.epochs * args.rollout_batch_size,
        "rollouts": args.rollouts,
        "lambda": args.lmbda,
        "batch size": args.batch_size,
        "posterior samples": args.posterior_samples,
        "episode length": cfg.vals["episode_len"],
        "n simulations": args.eval_eps,
        "store": store_id,
        "eps": args.eps
    }
    logger = Logger(hyp, "./results/", "pc_mopo")
    logger.write()

    prior = Prior(config=cfg.vals)
    env_model = AllocationEnv(config=cfg.vals, prior=prior, load_model=True,
                              full_posterior=True,
                              posterior_samples=args.posterior_samples,
                              verbose=False)
    policy = DQN(MlpPolicy, env_model, batch_size=args.batch_size)

    mopo_dqn = Mopo(policy=policy,
                    env_model=env_model,
                    rollout_batch_size=args.rollout_batch_size,
                    epochs=args.epochs,
                    rollout=args.rollouts,
                    n_actions=env_model.n_actions,
                    lmbda=args.lmbda,
                    buffer_path=f"../data/{store_id}-buffer-d-trn.p",
                    # buffer_path=None,
                    eps=args.eps)

    mopo_dqn.learn()

    if os.path.exists(f"./models/{store_id}-{args.file_name}"):
        os.remove(f"./models/{store_id}-{args.file_name}")
    mopo_dqn.policy.save(f"./models/{store_id}-{args.file_name}")
def main():
    dir = "../data/"
    files = ["store-1-train.csv", "store-2-train.csv"]

    for f in files:
        store_id = get_store_id(f)
        data = pd.read_csv(dir + f)
        sales = data[["date", "sales"]].groupby("date").sum()
        quantiles = np.quantile(sales["sales"].values, [.2, .4, .6, .8, 1.0])

        with open(dir + f"{store_id}-quantiles.txt", "w") as fp:
            for q in quantiles:
                fp.write("{:.4f}\n".format(q))
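# Minimal sketch of reading a per-store quantile file back, i.e. the inverse of the
# writer above (one "{:.4f}" value per line). The helper name load_sales_quantiles is
# hypothetical and not taken from the source; AllocationEnv.get_sales_quantiles
# presumably does something similar.
def load_sales_quantiles(store_id, data_dir="../data/"):
    with open(data_dir + f"{store_id}-quantiles.txt") as fp:
        return [float(line.strip()) for line in fp if line.strip()]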
def main(args):
    store_id = get_store_id(cfg.vals["train_data"])
    hyp = {
        "learning timesteps": args.epochs,
        "episode length": cfg.vals["episode_len"],
        "store": store_id
    }
    logger = Logger(hyp, "./results/", "off_policy_dqn")

    with open(f"../data/{store_id}-buffer-d-trn.p", 'rb') as f:
        buffer_env = pickle.load(f)

    simulator = get_simple_simulator(cfg.vals)
    model = DQN(MlpPolicy, simulator, verbose=2)
    model.learn_off_policy(total_timesteps=args.epochs, buffer=buffer_env)
    model.save(f"./models/{store_id}-{args.file_name}")

    logger.write()
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import config.config as cfg
import numpy as np
import argparse
from evaluators.rejection_sampler import PSRS, MLPPolicy, BCQPolicy
from evaluators.mlp_env_model import MLPClassifer
from utils import get_store_id, get_action_space
from policies.deepq.dqn import DQN
from offpolicy.bcq import BCQ

store_id = get_store_id(cfg.vals["train_data"])
buffer_path = f"../data/{store_id}-buffer-d-test.p"


def eval_dqn(policy, args):
    action_space = get_action_space(cfg.vals["n_regions"], cfg.vals["n_products"])
    model_path = f"../data/{store_id}-env_policy.pt"
    env_policy = MLPPolicy(action_space, buffer_path, model_path)

    # Rejection-sampling estimate of the policy's value on the logged test buffer
    psrs = PSRS(buffer_path, policy, env_policy, action_space,
                cfg.vals["n_regions"], cfg.vals["n_products"], args.epochs)
    r = psrs.evaluate()
    mean = np.mean(r)
    sigma = np.std(r)
    return mean, sigma
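# Sketch of a CLI entry point for the evaluator above. The flag names and defaults are
# assumptions inferred from the attributes these scripts read (args.epochs,
# args.file_name), not confirmed by the source.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=100)
    parser.add_argument("--file-name", type=str, default="off-policy-dqn.p")
    args = parser.parse_args()

    dqn_policy = DQN.load(f"./models/{store_id}-{args.file_name}")
    mean, sigma = eval_dqn(dqn_policy, args)
    print(f"PSRS estimate: {mean:.4f} +/- {sigma:.4f}")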
def main(args):
    store_id = get_store_id(cfg.vals["train_data"])
    policy = DQN.load(f"./models/{store_id}-off-policy-dqn.p")
    simulator = get_simple_simulator(cfg.vals)
    reward, sigma = evaluate_policy(policy, simulator, args.eval_eps)
def main(args):
    store_id = get_store_id(cfg.vals["train_data"])
    hyp = {
        "episode length": cfg.vals["episode_len"],
        "n simulations": args.eval_eps,
        "store": store_id,
        "iterations": args.iterations,
        "batch size": args.batch_size,
        "discount": args.discount,
        "tau": args.tau,
        "actor_lr": args.actor_lr,
        "critic_lr": args.critic_lr,
        "vae_lr": args.vae_lr,
        "actor_hs": args.actor_hs,
        "critic_hs": args.critic_hs,
        "dqda_clip": args.dqda_clip,
        "clip_norm": args.clip_norm
    }
    logger = Logger(hyp, "./results/", "bcq")

    # Default to two 64-unit hidden layers when sizes are not specified
    if args.actor_hs <= 0:
        actor_hs_list = [64, 64]
    else:
        actor_hs_list = [args.actor_hs] * 2
    if args.critic_hs <= 0:
        critic_hs_list = [64, 64]
    else:
        critic_hs_list = [args.critic_hs] * 2

    file_name = "BCQ_%s_%s" % (args.env_name, str(args.seed))
    buffer_name = "%s_%s_%s" % (args.buffer_type, args.env_name, str(args.seed))
    print("---------------------------------------")
    print("Settings: " + file_name)
    print("---------------------------------------")

    if not os.path.exists("./results"):
        os.makedirs("./results")

    # prior = Prior(config=cfg.vals)
    # env = AllocationEnv(config=cfg.vals, prior=prior, load_model=True)
    env = get_simple_simulator(cfg.vals)
    n_actions = env.n_actions
    # env = DummyVecEnv([lambda: env])  # the algorithms require a vectorized environment to run

    tf.set_random_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = observation_input(env.observation_space, batch_size=None,
                                  name='Ob', scale=False,
                                  reuse=tf.AUTO_REUSE)[0].shape[1].value
    action_dim = n_actions

    with tf.Session() as sess:
        # Initialize policy
        policy = BCQ(state_dim, action_dim, sess, args.tau,
                     actor_hs=actor_hs_list, actor_lr=args.actor_lr,
                     critic_hs=critic_hs_list, critic_lr=args.critic_lr,
                     dqda_clipping=args.dqda_clip,
                     clip_norm=bool(args.clip_norm),
                     vae_lr=args.vae_lr)

        # Load buffer
        with open(f"../data/{store_id}-buffer-d-trn.p", 'rb') as f:
            replay_buffer = pickle.load(f)

        # evaluations = []
        # episode_num = 0
        # done = True

        stats_loss = policy.train(replay_buffer, iterations=args.iterations,
                                  batch_size=args.batch_size,
                                  discount=args.discount)
        print("Training iterations: " + str(args.iterations))
        # print(stats_loss)

        # Save final policy
        if os.path.exists(f"./models/{store_id}-{args.file_name}"):
            os.remove(f"./models/{store_id}-{args.file_name}")
        policy.save(f"{store_id}-{args.file_name}", directory="./models")

    logger.write()