def setup_experiment(conf, name, extype='dqn', use_portfolio=False, experiment_name=None, seed=1):
    if use_portfolio:
        n_acc = 50
        portfolio = generate_portfolio(n_acc, seed=seed)
    else:
        portfolio = None

    params = Parameters()
    params.rho = 0.15

    actions_bins = np.array([0., 0.2, 0.5, 0.7, 1.0, 1.5])
    n_actions = len(actions_bins)

    # rep_dist = BetaRepayment(params, 0.9, 0.5, 10, MAX_ACCOUNT_BALANCE)
    rep_dist = UniformRepayment(params)

    c_env = CollectionsEnv(params=params, repayment_dist=rep_dist, reward_shaping='continuous',
                           randomize_start=True, max_lambda=None,
                           starting_state=np.array([3, 200], dtype=np.float32))
    environment = DiscretizedActionWrapper(c_env, actions_bins)

    if extype == 'dqn':
        dqn = DQNAgent(environment, name, training=True, config=conf, initialize=False,
                       portfolio=portfolio, experiment_name=experiment_name)
    elif extype == 'bspline':
        dqn = DQNAgentPoly(environment, name, training=True, config=conf,
                           portfolio=portfolio, experiment_name=experiment_name)
    elif extype == 'dqnpenal':
        dqn = DQNAgentPenalized(environment, name, config=conf, training=True,
                                portfolio=portfolio, experiment_name=experiment_name)
    else:
        raise ValueError('Unsupported experiment type.')
    return dqn
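# Example usage (a minimal sketch; DefaultConfig and run() are taken from the
# __main__ blocks elsewhere in this repo, and the experiment name is hypothetical):
#
#   conf = DefaultConfig()
#   agent = setup_experiment(conf, 'dqn_baseline', extype='dqn',
#                            use_portfolio=True, experiment_name=None, seed=42)
#   agent.run()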
@classmethod
def load(cls, model_path, load_buffer=False):
    """Loads a trained model, its environment, and (optionally) the replay buffer."""
    loaded_config = TrainConfigBase.load(os.path.join(model_path, 'train_config.pkl'))
    loaded_env = CollectionsEnv.load(os.path.join(model_path, 'env.pkl'))
    loaded_instance = cls(loaded_env, model_path, loaded_config, initialize=False, training=False)
    loaded_instance.main_net = tf.keras.models.load_model(os.path.join(model_path, 'main_net.h5'))
    loaded_instance.main_net.compile()
    # For evaluation the target net can simply alias the main net.
    loaded_instance.target_net = loaded_instance.main_net
    if load_buffer:
        try:
            buffer_path = os.path.join(model_path, 'buffer.pkl')
            with open(buffer_path, 'rb') as f:
                buffer = pickle.load(f)
            loaded_instance.memory.buffer = buffer
        except (FileNotFoundError, IOError):
            print('No buffer found.')
    return loaded_instance
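# Example usage (sketch; the model directory and `obs` are hypothetical):
#
#   agent = DQNAgent.load('runs/dqn_baseline', load_buffer=False)
#   q_values = agent.main_net.predict_on_batch(obs[None, :]).numpy().flatten()
#   best_action = np.argmax(q_values)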
        tf.summary.scalar('stddev', stddev)
        tf.summary.histogram('histogram', var)

    def save(self):
        # current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        # Persist the trained network, the replay buffer, and the action discretization.
        path_model = os.path.join(self.model_dir, 'main_net.h5')
        path_actions = os.path.join(self.model_dir, 'action_bins.npy')
        path_memory_buffer = os.path.join(self.model_dir, 'buffer.pkl')
        self.main_net.save(path_model)
        self.memory.save(path_memory_buffer)
        np.save(path_actions, self.env.action_bins)

    def load(self, model_path):
        # self.main_net = tf.keras.models.load_model(os.path.join(model_path, 'main_net.h5'))
        # self.target_net = tf.keras.models.load_model(os.path.join(model_path, 'main_net.h5'))
        # Restore weights into the already-built networks; main and target start identical.
        self.main_net.load_weights(os.path.join(model_path, 'main_net.h5'))
        self.target_net.load_weights(os.path.join(model_path, 'main_net.h5'))
        self.action_bins = np.load(os.path.join(model_path, 'action_bins.npy'))


if __name__ == '__main__':
    actions_bins = np.array([0, 1.0])
    n_actions = len(actions_bins)

    c_env = CollectionsEnv(reward_shaping='continuous', randomize_start=True, max_lambda=None)
    environment = DiscretizedActionWrapper(c_env, actions_bins)
    # environment = StateNormalization(environment)

    dqn = DQNAgentLattice(environment, 'DDQNLattice', training=True, config=DefaultConfig(),
                          initialize=False)
    dqn.run()
from learning.collections_env import CollectionsEnv
from learning.utils.wrappers import StateNormalization, DiscretizedActionWrapper
import numpy as np
from copy import deepcopy

action_bins = np.array([0, 1.0])
environment = CollectionsEnv(reward_shaping=False)
environment = DiscretizedActionWrapper(environment, action_bins)
# environment = StateNormalization(environment)

print(f"Resetting: {environment.reset()}")
print(f"Step: {environment.step(0)}")
print(f"Resetting: {environment.reset()}")

# Mutate the starting state on the wrapped env and check that reset() picks it up.
environment.env.starting_state = 2  # np.array([10, 100])
print(f"Setting start: {environment.env.starting_state}")
print(f"Resetting: {environment.reset()}")

# class ImuT:
#     def __init__(self, a):
#         self.a = np.array([0])
#         self.starting_a = self.a.copy()
#
#     def reset(self):
#         self.a = self.starting_a.copy()
#         return self.a
#
#     def step(self):
    w = np.linspace(0, 200, w_points)
    ww, ll = np.meshgrid(w, l)
    z = np.zeros_like(ww)

    # Evaluate the greedy action index at each grid point of the state space.
    for i, xp in enumerate(w):
        for j, yp in enumerate(l):
            fixed_obs = np.array([yp, xp])  # observation is (lambda, balance)
            z[j, i] = np.argmax(model.predict_on_batch(fixed_obs[None, :]).numpy().flatten())

    fig, ax = plt.subplots(nrows=1, ncols=2)

    cdict = {
        'red': ((0.0, 0.25, .25), (0.02, .59, .59), (1., 1., 1.)),
        'green': ((0.0, 0.0, 0.0), (0.02, .45, .45), (1., .97, .97)),
        'blue': ((0.0, 1.0, 1.0), (0.02, .75, .75), (1., 0.45, 0.45))
    }
    cm = m.colors.LinearSegmentedColormap('my_colormap', cdict, 1024)
    im = ax[0].pcolor(ww, ll, z, cmap=cm)
    fig.colorbar(im)
    fig.show()
    return model


if __name__ == '__main__':
    from learning.collections_env import CollectionsEnv

    environ = CollectionsEnv()
    model = construct_lattice(environ)
if __name__ == '__main__':
    from dcc import Parameters

    params = Parameters()
    params.rho = 0.15

    actions_bins = np.array([0, 0.2, 0.5, 1.0])
    n_actions = len(actions_bins)

    # rep_dist = BetaRepayment(params, 0.9, 0.5, 10, MAX_ACCOUNT_BALANCE)
    rep_dist = UniformRepayment(params)

    c_env = CollectionsEnv(params=params, repayment_dist=rep_dist, reward_shaping='continuous',
                           randomize_start=True, max_lambda=None,
                           starting_state=np.array([3, 200], dtype=np.float32))
    environment = DiscretizedActionWrapper(c_env, actions_bins)

    portfolio_acc = generate_portfolio(50)
    experiment_name = None
    dqn = DQNAgent(environment, 'test_constr_log_every_pricer', training=True,
                   config=DefaultConfig(), initialize=False, portfolio=portfolio_acc,
                   experiment_name=experiment_name)
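    # Presumably training is then launched as in the other __main__ blocks in
    # this repo (a sketch, not part of the original excerpt):
    # dqn.run()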
from collections import defaultdict

from learning.collections_env import DiscretizedObservationWrapper, DiscretizedActionWrapper
from learning.collections_env import CollectionsEnv
import numpy as np

Q = defaultdict(float)
gamma = 0.99  # discount factor
alpha = 0.5  # learning rate (step size of the Q-value update)
epsilon = 0.1  # exploration rate for epsilon-greedy (assumed; not in the original excerpt)

env = CollectionsEnv()
print(f'Action space: {env.action_space}')
print(f'State space: {env.observation_space}')

env = DiscretizedActionWrapper(env)
env = DiscretizedObservationWrapper(env)
actions = range(env.action_space.n)
print(f'Discretized action space: {env.action_space}')
print(f'Discretized state space: {env.observation_space}')


def update_Q(s, r, a, s_next, done):
    max_q_next = max([Q[s_next, a] for a in actions])
    # Do not include the next state's value if currently at the terminal state.
    Q[s, a] += alpha * (r + gamma * max_q_next * (1.0 - done) - Q[s, a])


def act(ob):
    if np.random.random() < epsilon:
        # action_space.sample() is a convenient function to get a random action
        return env.action_space.sample()
    # Otherwise act greedily with respect to the current Q estimates.
    qvals = {a: Q[ob, a] for a in actions}
    return max(qvals, key=qvals.get)
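# A minimal training-loop sketch for the tabular agent above (assumed; the
# original excerpt ends at act(). The episode count is arbitrary and the
# (obs, reward, done, info) step signature follows the classic gym API used
# elsewhere in this repo):
n_episodes = 1000
for episode in range(n_episodes):
    ob = env.reset()
    done = False
    while not done:
        a = act(ob)
        ob_next, reward, done, info = env.step(a)
        update_Q(ob, reward, a, ob_next, done)
        ob = ob_next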