Example #1
# NOTE: this example begins mid-script. The imports and the constructor head
# below are an assumed reconstruction; the project appears to use a customized
# DQN implementation (`model.learn` and `model.predict` take non-standard
# arguments), so the stable-baselines imports are only an approximation.
# `env`, `LEARNING_START`, `TIME_STEPS`, `TEST_T`, `store_id`, and `n_actions`
# are assumed to be defined earlier in the original file.
import pickle

from stable_baselines import DQN
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines.deepq.replay_buffer import ReplayBuffer

from envs.allocation_env import AllocationEnv
from envs.state import State

model = DQN(MlpPolicy,
            env,
            verbose=2,
            learning_starts=LEARNING_START,
            gamma=.2,
            exploration_fraction=0.35,
            exploration_final_eps=0.2)
model.learn(total_timesteps=TIME_STEPS, learning_curve=False, test_t=TEST_T)

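# save the replay buffer collected during training to disk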
with open(f"../data/{store_id}-buffer-d-test.p", 'wb') as f:
    pickle.dump(model.replay_buffer, f)

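# evaluate the trained policy over 100 test episodes, tracking cumulative reward
# and collecting (s, a, r, s') transitions in a fresh replay buffer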
results = {'rewards': [0.0]}
buffer = ReplayBuffer(size=50000)

for j in range(100):

    obs = env.reset()

    for i in range(TEST_T):
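        # restrict the policy to actions that are feasible for the current board configuration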
        feasible_actions = AllocationEnv.get_feasible_actions(
            obs["board_config"])
        action_mask = AllocationEnv.get_action_mask(feasible_actions,
                                                    n_actions)
        action, _states = model.predict(obs, mask=action_mask)

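        # validate the chosen action against the current board configuration before stepping the environment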
        action = AllocationEnv.check_action(obs['board_config'], action)
        new_obs, r, dones, info = env.step([action])

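        # track the running cumulative reward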
        results['rewards'].append(r[0] + results['rewards'][-1])

        # add (s, a, r, s') to buffer
        # NOTE: the original call was truncated here; the remaining arguments and
        # the `obs = new_obs` update are an assumed completion based on the
        # stable-baselines ReplayBuffer.add signature
        buffer.add(obs_t=State.get_vec_observation(obs),
                   action=action,
                   reward=r[0],
                   obs_tp1=State.get_vec_observation(new_obs),
                   done=float(dones[0]))

        obs = new_obs

Example #2

import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from envs.prior import Prior
from envs.allocation_env import AllocationEnv
from envs.state import State
from envs.features import Features
import config.config as cfg
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib as mpl

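# build the allocation environment from the configured prior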
prior = Prior(config=cfg.vals)
env = AllocationEnv(config=cfg.vals, prior=prior, load_model=True)
env.reset()

print("n comp: {}".format(env.state.board_config.sum()))

# get seen state

ts_train = env.time_stamps.container.data
ts_unique = np.unique(ts_train)
ts = np.random.choice(ts_unique)

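# indices of training rows recorded at the sampled time stamp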
idx = np.where(ts_train == ts)[0]

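# product- and region-level feature matrices from the training data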
p_train = env.X_product.container.data
r_train = env.X_region.container.data

p_state = p_train[idx, :]