            verbose=2, learning_starts=LEARNING_START, gamma=.2,
            exploration_fraction=0.35, exploration_final_eps=0.2)
model.learn(total_timesteps=TIME_STEPS, learning_curve=False, test_t=TEST_T)

# persist the replay buffer collected during training for later analysis
with open(f"../data/{store_id}-buffer-d-test.p", 'wb') as f:
    pickle.dump(model.replay_buffer, f)

results = {'rewards': [0.0]}
buffer = ReplayBuffer(size=50000)

# evaluate the trained policy and log the resulting transitions
for j in range(100):
    obs = env.reset()
    for i in range(TEST_T):
        # mask out actions that are infeasible for the current board configuration
        feasible_actions = AllocationEnv.get_feasible_actions(obs["board_config"])
        action_mask = AllocationEnv.get_action_mask(feasible_actions, n_actions)

        action, _states = model.predict(obs, mask=action_mask)
        action = AllocationEnv.check_action(obs['board_config'], action)
        new_obs, r, dones, info = env.step([action])

        # track cumulative reward over the test episode
        results['rewards'].append(r[0] + results['rewards'][-1])

        # add (s, a, r, s') to buffer
        buffer.add(obs_t=State.get_vec_observation(obs),
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from envs.prior import Prior
from envs.allocation_env import AllocationEnv
from envs.state import State
from envs.features import Features
import config.config as cfg

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib as mpl

# build the allocation environment from the learned prior
prior = Prior(config=cfg.vals)
env = AllocationEnv(config=cfg.vals, prior=prior, load_model=True)
env.reset()
print("n comp: {}".format(env.state.board_config.sum()))

# sample a state seen during training
ts_train = env.time_stamps.container.data
ts_unique = np.unique(ts_train)
ts = np.random.choice(ts_unique)
idx = np.where(ts_train == ts)[0]

# product and region features for the sampled timestamp
p_train = env.X_product.container.data
r_train = env.X_region.container.data
p_state = p_train[idx, :]