Example No. 1
    def run(self):

        actions_bins = np.array([0, 0.1, 0.5, 1])
        n_actions = len(actions_bins)

        c_env = CollectionsEnv()
        env = DiscretizedActionWrapper(c_env, actions_bins)

        n_episodes = 1000
        warmup_episodes = 300
        epsilon = 1.0
        epsilon_final = 0.05
        # Linearly anneal epsilon down to epsilon_final over the warm-up episodes.
        eps_drop = (epsilon - epsilon_final) / warmup_episodes

        total_rewards = np.empty(n_episodes)

        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

        log_dir = 'logs/dqn/' + current_time
        summary_writer = tf.summary.create_file_writer(log_dir)

        for i in range(n_episodes):
            state = env.reset()
            done = False

            score = 0
            while not done:
                action, q_value = self.get_action(state, epsilon)
                next_state, reward, done, info = env.step(action)
                self.append_sample(state, action, reward, next_state, done)
                score += reward

                state = next_state

                # Train once the replay buffer holds more than one batch.
                if len(self.memory) > self.batch_size:
                    self.update()
                    # Sync the target network (on every step of every 20th episode).
                    if i % 20 == 0:
                        self.update_target()

            if epsilon > epsilon_final:
                epsilon = max(epsilon_final, epsilon - eps_drop)

            total_rewards[i] = score
            avg_rewards = total_rewards[max(0, i - 100):(i + 1)].mean()

            with summary_writer.as_default():
                tf.summary.scalar('episode reward', score, step=i)
                tf.summary.scalar('running avg reward(100)',
                                  avg_rewards,
                                  step=i)

            if i % 10 == 0:
                print("episode:", i, "episode reward:", score, "eps:", epsilon,
                      "avg reward (last 100):", avg_rewards)
def setup_experiment(conf,
                     name,
                     extype='dqn',
                     use_portfolio=False,
                     experiment_name=None,
                     seed=1):
    if use_portfolio:
        n_acc = 50
        portfolio = generate_portfolio(n_acc, seed=seed)
    else:
        portfolio = None

    params = Parameters()
    params.rho = 0.15

    actions_bins = np.array([0., 0.2, 0.5, 0.7, 1.0, 1.5])
    n_actions = len(actions_bins)

    # rep_dist = BetaRepayment(params, 0.9, 0.5, 10, MAX_ACCOUNT_BALANCE)
    rep_dist = UniformRepayment(params)

    c_env = CollectionsEnv(params=params,
                           repayment_dist=rep_dist,
                           reward_shaping='continuous',
                           randomize_start=True,
                           max_lambda=None,
                           starting_state=np.array([3, 200], dtype=np.float32))
    environment = DiscretizedActionWrapper(c_env, actions_bins)

    if extype == 'dqn':
        dqn = DQNAgent(environment,
                       name,
                       training=True,
                       config=conf,
                       initialize=False,
                       portfolio=portfolio,
                       experiment_name=experiment_name)
    elif extype == 'bspline':
        dqn = DQNAgentPoly(environment,
                           name,
                           training=True,
                           config=conf,
                           portfolio=portfolio,
                           experiment_name=experiment_name)
    elif extype == 'dqnpenal':
        dqn = DQNAgentPenalized(environment,
                                name,
                                config=conf,
                                training=True,
                                portfolio=portfolio,
                                experiment_name=experiment_name)
    else:
        raise ValueError('Unsupported experiment type.')

    return dqn
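
A hedged usage sketch for setup_experiment: the config class (DefaultConfig, borrowed from Example No. 3), the run name, and the experiment name below are illustrative assumptions; only the keyword arguments mirror the signature above. The returned agent would then be trained with the run() loop from Example No. 1.

# Hypothetical driver; argument values are placeholders.
conf = DefaultConfig()
agent = setup_experiment(conf,
                         name='dqn_uniform',          # illustrative run name
                         extype='dqn',
                         use_portfolio=False,
                         experiment_name='baseline',  # illustrative
                         seed=1)
agent.run()
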
Example No. 3
            tf.summary.scalar('stddev', stddev)
            tf.summary.histogram('histogram', var)

    def save(self):
        # current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        path_model = os.path.join(self.model_dir, 'main_net.h5')
        path_actions = os.path.join(self.model_dir, 'action_bins.npy')
        path_memory_buffer = os.path.join(self.model_dir, 'buffer.pkl')

        self.main_net.save(path_model)
        self.memory.save(path_memory_buffer)
        np.save(path_actions, self.env.action_bins)

    def load(self, model_path):

        # self.main_net = tf.keras.models.load_model(os.path.join(model_path, 'main_net.h5'))
        # self.target_net = tf.keras.models.load_model(os.path.join(model_path, 'main_net.h5'))
        self.main_net.load_weights(os.path.join(model_path, 'main_net.h5'))
        self.target_net.load_weights(os.path.join(model_path, 'main_net.h5'))
        self.action_bins = np.load(os.path.join(model_path, 'action_bins.npy'))

if __name__ == '__main__':
    actions_bins = np.array([0, 1.0])
    n_actions = len(actions_bins)
    c_env = CollectionsEnv(reward_shaping='continuous', randomize_start=True, max_lambda=None)
    environment = DiscretizedActionWrapper(c_env, actions_bins)
    # environment = StateNormalization(environment)

    dqn = DQNAgentLattice(environment, 'DDQNLattice', training=True, config=DefaultConfig(), initialize=False)
    dqn.run()
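
A hedged sketch of how the save()/load() pair above might be used once the training run in this __main__ block has finished; the model_dir attribute is implied by save() itself, while the training=False flag and the reuse of the constructor are illustrative assumptions:

# Hypothetical follow-up to the script above.
dqn.save()                               # writes main_net.h5, action_bins.npy, buffer.pkl

restored = DQNAgentLattice(environment, 'DDQNLattice', training=False,
                           config=DefaultConfig(), initialize=False)
restored.load(dqn.model_dir)             # reloads main/target weights and the action grid
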
Example No. 4
from learning.collections_env import CollectionsEnv
from learning.utils.wrappers import StateNormalization, DiscretizedActionWrapper

import numpy as np
from copy import deepcopy

action_bins = np.array([0, 1.0])

environment = CollectionsEnv(reward_shaping=False)
environment = DiscretizedActionWrapper(environment, action_bins)
# environment = StateNormalization(environment)

print(f"Reseting: {environment.reset()}")
print(f"Step: {environment.step(0)}")
print(f"Reseting: {environment.reset()}")

environment.env.starting_state = 2  #np.array([10, 100])
print(f"Setting start: {environment.env.starting_state}")
print(f"Reseting: {environment.reset()}")

#
# class ImuT:
#     def __init__(self, a):
#         self.a = np.array([0])
#         self.starting_a = self.a.copy()
#
#     def reset(self):
#         self.a = self.starting_a.copy()
#         return self.a
#
#     def step(self):