def __init__(self, env: gym.Env, logger=Logger()):
    nb_actions = env.action_space.shape[0]
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=100000, window_length=1)
    agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                     nb_steps_warmup=10, target_model_update=1e-2, policy=policy)
    agent.compile(Adam(lr=1e-3), metrics=['mae'])
    self.agent = agent
    self.env = env
    super().__init__(env, logger)
def run():
    env = game_env.MeleeEnv()
    nb_actions = env.action_space.shape[0]
    actor = build_network(env, nb_actions)
    critic, action_input = build_critic(env, nb_actions)
    # SequentialMemory requires window_length; 1 keeps single observations.
    memory = SequentialMemory(limit=25000, window_length=1)
    # random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3, size=nb_actions)
    agent = DQNAgent(batch_size=1000,
                     nb_actions=nb_actions,
                     model=actor,
                     # processor=Process(),
                     # critic_action_input=action_input,
                     memory=memory,
                     nb_steps_warmup=100)
    # Leftover DDPG settings from an earlier version of this function:
    # nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
    # random_process=random_process, gamma=.95, target_model_update=1e-1,
    # delta_range=(-10., 10.)
    agent.compile(RMSprop(lr=.0005), metrics=['mae'])
    agent.fit(env, nb_steps=100000, visualize=True, verbose=1,
              nb_max_start_steps=100,
              start_step_policy=lambda x: np.random.randint(nb_actions))
    # After training is done, we save the final weights.
    agent.save_weights('ddpg_{}_weights.h5f'.format(random.randrange(0, 100000)),
                       overwrite=True)
def agent(self):
    nb_actions = self.env.action_space.n
    model = self.build()
    print(model.summary())
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=32, enable_dueling_network=True,
                   target_model_update=1e-2,
                   policy=InformedBoltzmannGumbelQPolicy(self.env),
                   test_policy=InformedGreedyQPolicy(self.env),
                   batch_size=32, train_interval=32)
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])
    if self.initial_weights_file is not None:
        try:
            dqn.load_weights(self.initial_weights_file)
        except Exception:
            # Initial weights are optional; start from scratch if loading fails.
            pass
    return dqn
def build_agent(model, actions):
    """Build a dueling DQN agent with a linearly annealed eps-greedy policy."""
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
                                  value_max=1., value_min=.1, value_test=.2,
                                  nb_steps=10000)
    memory = SequentialMemory(limit=1000000, window_length=3)
    dqn_agent = DQNAgent(model=model, memory=memory, policy=policy,
                         enable_dueling_network=True, dueling_type='avg',
                         nb_actions=actions, nb_steps_warmup=1000)
    dqn_agent.compile(optimizer=Adam(lr=0.00025), metrics=['mae', 'accuracy'])
    return dqn_agent
def _build_dqn(nb_actions, nb_states):
    # Build the network.
    model = Sequential()
    model.add(Flatten(input_shape=(1, nb_states)))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions, activation='linear'))

    # Build the agent.
    memory = SequentialMemory(limit=10240, window_length=1)
    policy = BoltzmannQPolicy()
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=10, enable_dueling_network=True,
                   dueling_type='avg', target_model_update=1e-2, policy=policy)
    dqn.compile(Adam(), metrics=['mae'])
    return dqn
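# A minimal usage sketch for _build_dqn (not from the source): it assumes a
# standard Gym environment with a discrete action space and a flat Box
# observation space; CartPole-v1 is an illustrative choice.
if __name__ == '__main__':
    import gym
    env = gym.make('CartPole-v1')
    dqn = _build_dqn(nb_actions=env.action_space.n,
                     nb_states=env.observation_space.shape[0])
    dqn.fit(env, nb_steps=10000, visualize=False, verbose=1)
    dqn.test(env, nb_episodes=5, visualize=False)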
def agent(self):
    nb_actions = self.env.action_space.n
    obs_dim = self.env.observation_space.shape
    model = Sequential()
    # obs_dim is already a shape tuple, so prepend the window dimension
    # rather than nesting the tuple.
    model.add(Flatten(input_shape=(1,) + obs_dim))
    model.add(Dense(nb_actions, activation='linear'))
    print(model.summary())
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=256, enable_dueling_network=True,
                   target_model_update=1e-2,
                   policy=InformedBoltzmannGumbelQPolicy(self.env),
                   test_policy=InformedGreedyQPolicy(self.env),
                   batch_size=128, train_interval=128)
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])
    if self.initial_weights_file is not None:
        dqn.load_weights(self.initial_weights_file)
    self.train_episodes = 0
    return dqn
def test_double_dqn():
    env = TwoRoundDeterministicRewardEnv()
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.n

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Dense(16, input_shape=(1,)))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    memory = SequentialMemory(limit=1000, window_length=1)
    policy = EpsGreedyQPolicy(eps=.1)
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=50, target_model_update=1e-1, policy=policy,
                   enable_double_dqn=True)
    dqn.compile(Adam(lr=1e-3))

    dqn.fit(env, nb_steps=2000, visualize=False, verbose=0)
    policy.eps = 0.
    h = dqn.test(env, nb_episodes=20, visualize=False)
    assert_allclose(np.mean(h.history['episode_reward']), 3.)
def build_agent(model, nb_actions):
    """Build an agent."""
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
                                  value_max=MAX_EPSILON, value_min=MIN_EPSILON,
                                  value_test=TEST_EPSILON, nb_steps=MAX_STEPS)
    memory = SequentialMemory(limit=MAX_STEPS, window_length=WINDOW_WIDTH)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                   enable_dueling_network=True, dueling_type='avg',
                   nb_actions=nb_actions, nb_steps_warmup=WARMUP_STEPS)
    dqn.compile(Adam(learning_rate=LEARNING_RATE), metrics=['mae'])
    return dqn
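# build_agent above reads module-level constants; the values below are an
# assumed, plausible configuration, not taken from the source.
MAX_EPSILON = 1.0       # initial exploration rate
MIN_EPSILON = 0.1       # final exploration rate after annealing
TEST_EPSILON = 0.05     # exploration rate used during dqn.test()
MAX_STEPS = 1000000     # annealing horizon, also used as replay-memory capacity
WINDOW_WIDTH = 4        # number of stacked observations per state
WARMUP_STEPS = 1000     # steps collected before learning starts
LEARNING_RATE = 0.00025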
def init_dqn(env, nb_actions):
    """
    Initialize the DQN agent using the keras-rl package.

    :param env: the environment to be played, required to determine the input size
    :param nb_actions: number of actions
    :return: DQN agent
    """
    # Next, we build a very simple model.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    print(model.summary())

    # Compile the agent.
    memory = SequentialMemory(limit=50000, window_length=1)
    policy = BoltzmannQPolicy()
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=10, target_model_update=1e-2, policy=policy)
    dqn.model_name = "DQN"  # plain string; the f-prefix had no placeholders
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])
    return dqn
def make_dqn_rl_agent(processor: Processor_56x5,
                      nbr_layers=2,
                      enable_dueling_network: bool = False,
                      enable_double_dqn: bool = True):
    """
    Build and compile a DQN agent.

    :param processor: observation/action processor that also builds the model
    :param nbr_layers: number of hidden layers in the network
    :param enable_dueling_network: use the dueling-network architecture
    :param enable_double_dqn: use double-DQN targets
    :return: compiled DQNAgent
    """
    model = processor.create_model(nbr_layers=nbr_layers)
    test_policy = GreedyQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn_agent = DQNAgent(model=model, nb_actions=NBR_TICHU_ACTIONS, memory=memory,
                         nb_steps_warmup=100, target_model_update=1e-2,
                         test_policy=test_policy, processor=processor,
                         enable_dueling_network=enable_dueling_network,
                         enable_double_dqn=enable_double_dqn)
    dqn_agent.compile(Adam(lr=1e-3), metrics=['mae'])
    return dqn_agent
def build_model(env, num_actions):
    inputs = Input(shape=(1, env.observation_space.shape[0]))
    x = Flatten()(inputs)
    x = Dense(128, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)
    output = Dense(num_actions, activation='linear')(x)
    model = Model(inputs=inputs, outputs=output)
    print(model.summary())

    memory = SequentialMemory(limit=50000, window_length=1)
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
                                  value_max=1., value_min=.1, value_test=.05,
                                  nb_steps=10000)
    # policy = BoltzmannQPolicy()
    dqn = DQNAgent(model=model, nb_actions=num_actions, memory=memory,
                   nb_steps_warmup=100, target_model_update=1e-2, policy=policy)
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])
    return dqn
class DeepAgent:
    """
    This algorithm tries to use a DQN agent that learns by itself, given only a gym.
    After quite some trouble with various error messages, this now at least runs and trains.
    It does not yet achieve good results.

    Best result: ???
    """

    def __init__(self, shape, action_count: int):
        super().__init__()
        inp = Input(shape=shape)
        flat = Flatten()(inp)
        # Activation: relu, sigmoid, ...
        hidden1 = Dense(256, activation='relu')(flat)
        hidden2 = Dense(64, activation='relu')(hidden1)
        hidden3 = Dense(16, activation='relu')(hidden2)
        # Note: 'linear' is the conventional output activation for Q-values;
        # softmax squashes them and may contribute to the weak results.
        output = Dense(action_count, activation='softmax')(hidden3)
        self.model = Model(inputs=inp, outputs=output)
        print(self.model.summary())
        self.memory = SequentialMemory(limit=50000, window_length=WINDOW_LENGTH)
        self.policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
                                           value_max=1., value_min=.1,
                                           value_test=.05, nb_steps=1000)
        self.callbacks = self.build_callbacks("msnake")
        self.dqn = DQNAgent(model=self.model, nb_actions=action_count,
                            memory=self.memory, nb_steps_warmup=50,
                            target_model_update=1e-2, policy=self.policy)
        # Workaround for https://github.com/keras-rl/keras-rl/issues/345
        Adam._name = "fix_bug"
        # Metrics: mae, mse, accuracy; lr: learning rate
        self.dqn.compile(Adam(lr=1e-5), metrics=['mse'])

    def build_callbacks(self, env_name):
        callbacks = []
        checkpoint_weights_filename = 'dqn_' + env_name + '_weights_{step}.h5f'
        callbacks += [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=5000)]
        log_filename = 'dqn_{}_log.json'.format(env_name)
        callbacks += [FileLogger(log_filename, interval=100)]
        return callbacks
class QLearningAgent(Agent):
    def __init__(self, state_dim, action_space, epsilon, lr):
        self._model = self._get_model(state_dim, action_space)
        # DQNAgent also requires nb_actions and a replay memory; without them
        # the original call raised a TypeError.
        memory = SequentialMemory(limit=50000, window_length=1)
        self.agent = DQNAgent(model=self._model,
                              nb_actions=action_space.n,
                              memory=memory,
                              policy=EpsGreedyQPolicy(eps=epsilon),
                              test_policy=EpsGreedyQPolicy(eps=0.01))
        self.agent.compile(Adam(lr))

    def model_summary(self):
        print(self._model.summary())
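# QLearningAgent delegates network construction to _get_model, which is not
# shown above. A minimal sketch of such a method (an assumption, not the
# original helper) that pairs with SequentialMemory(window_length=1):
def _get_model(self, state_dim, action_space):
    model = Sequential()
    model.add(Flatten(input_shape=(1, state_dim)))  # leading 1 = window_length
    model.add(Dense(64, activation='relu'))
    model.add(Dense(action_space.n, activation='linear'))  # one Q-value per action
    return model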
def train(learn_rate, model_update_interval, steps):
    # Relies on module-level model, nb_actions, memory, policy, env,
    # VISUALIZE and SAVEFILE_FOLDER.
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=50000, target_model_update=model_update_interval,
                   policy=policy, gamma=.99, train_interval=4)
    dqn.compile(Adam(lr=learn_rate), metrics=['mae'])
    dqn.fit(env, nb_steps=steps, verbose=2, visualize=VISUALIZE)
    dqn.save_weights(SAVEFILE_FOLDER + "/dqn_pong_params.h5f", overwrite=True)
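# One plausible module-level setup for train() (assumed, not from the source;
# the environment choice is illustrative, matching the "pong" save file name):
import gym
VISUALIZE = False
SAVEFILE_FOLDER = 'weights'
env = gym.make('PongDeterministic-v4')
nb_actions = env.action_space.n
memory = SequentialMemory(limit=1000000, window_length=4)
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1.,
                              value_min=.1, value_test=.05, nb_steps=1000000)
# `model` would be a Keras network mapping stacked frames to nb_actions
# Q-values, e.g. a convolutional stack like the one in setupDQN further down.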
class DqnAgent(Agent):
    def __init__(self,
                 env: gym.Env,
                 memory=SequentialMemory(limit=50000, window_length=1),
                 logger=Logger(),
                 boxes_resolution=10,
                 nb_steps_warmup=20,
                 hidden_layers=[16, 16, 16],
                 policy=BoltzmannQPolicy(),
                 target_model_update=1e-2,
                 optimizer=Adam(lr=1e-3)):
        # Note: the mutable/stateful defaults (memory, policy, optimizer) are
        # shared across instances.
        self.env = env
        if isinstance(boxes_resolution, int):
            boxes_resolution = (boxes_resolution,) * len(env.action_space.shape)
        self.boxes_resolution = boxes_resolution
        # One discrete action per cell of the discretization grid.
        self.nb_actions = np.zeros(boxes_resolution).size

        model = Sequential()
        model.add(Flatten(input_shape=(1,) + env.observation_space.shape))  # TODO check this
        for l in hidden_layers:
            model.add(Dense(l, activation='relu'))
        model.add(Dense(self.nb_actions, activation='linear'))
        # TODO move this to a util file?
        self.model = model
        print("dqn model summary: {0}".format(model.summary()))

        self.dqn = DQNAgent(model=model,
                            nb_actions=self.nb_actions,
                            memory=memory,
                            nb_steps_warmup=nb_steps_warmup,
                            target_model_update=target_model_update,
                            policy=policy,
                            processor=DqnProcessor(self.boxes_resolution,
                                                   env.action_space.low,
                                                   env.action_space.high))
        self.dqn.compile(optimizer=optimizer, metrics=['mae'])
        super().__init__(env, logger)

    def act(self, state, explore):
        action = self.dqn.processor.process_action(self.dqn.forward(state))
        return action

    def train(self, nb_episodes=1000, verbose=2, visualize=True):
        self.dqn.fit(env=self.env, nb_steps=nb_episodes, verbose=verbose,
                     visualize=visualize)
def setupDQN(cfg, nb_actions, processor):
    image_in = Input(shape=cfg.input_shape, name='main_input')
    # Permute from (window, height, width) to (height, width, window) so the
    # stacked frames become the channel dimension.
    input_perm = Permute((2, 3, 1), input_shape=cfg.input_shape)(image_in)
    conv1 = Conv2D(32, (8, 8), activation="relu", strides=(4, 4), name='conv1')(input_perm)
    conv2 = Conv2D(64, (4, 4), activation="relu", strides=(2, 2), name='conv2')(conv1)
    conv3 = Conv2D(64, (3, 3), activation="relu", strides=(1, 1), name='conv3')(conv2)
    conv_out = Flatten(name='flat_feat')(conv3)
    dense_out = Dense(512, activation='relu')(conv_out)
    q_out = Dense(nb_actions, activation='linear')(dense_out)
    model = Model(inputs=[image_in], outputs=[q_out])
    print(model.summary())
    # hstate_size = int(np.prod(conv3.shape[1:]))

    # Finally, we configure and compile our agent. You can use every built-in
    # Keras optimizer and even the metrics!
    memory = SequentialMemory(limit=cfg.memory_limit, window_length=cfg.WINDOW_LENGTH)

    # Select a policy. We use eps-greedy action selection, which means that a
    # random action is selected with probability eps. We anneal eps from 1.0 to
    # 0.1 so that the agent initially explores the environment (high eps) and
    # then gradually sticks to what it knows (low eps). We also set a dedicated
    # eps value of 0.05 for testing, so the agent still performs some random
    # actions and cannot get stuck.
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1.,
                                  value_min=.1, value_test=.05,
                                  nb_steps=cfg.nb_steps_annealed_policy)

    # The trade-off between exploration and exploitation is difficult and an
    # ongoing research topic. If you want, you can experiment with the
    # parameters or use a different policy. Another popular one is
    # Boltzmann-style exploration:
    # policy = BoltzmannQPolicy(tau=1.)
    # Feel free to give it a try!

    dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy,
                   memory=memory, processor=processor,
                   nb_steps_warmup=cfg.nb_steps_warmup_dqn_agent, gamma=.99,
                   target_model_update=cfg.target_model_update_dqn_agent,
                   train_interval=4, delta_clip=1.)
    dqn.compile(Adam(lr=.00025), metrics=['mae'])
    return dqn
class DeepAgentConvolution:
    """
    This algorithm tries to use a DQN agent that learns by itself, given only a gym.
    At the moment, it cannot successfully work with convolution:
    Error when checking input: expected input_1 to have 4 dimensions,
    but got array with shape (1, 1, 20, 10, 3)

    Best result: ???
    """

    def __init__(self, shape, action_count: int):
        super().__init__()
        inp = Input(shape=shape)
        # Convolution part (image recognition / feature extraction)
        conv = Conv2D(16, kernel_size=2, padding="same")(inp)
        conv = Conv2D(8, kernel_size=2)(conv)
        # Classification (decision making)
        flat = Flatten()(conv)
        # Activation: relu, sigmoid, ...
        hidden = Dense(256, activation='relu')(flat)
        hidden = Dense(64, activation='relu')(hidden)
        hidden = Dense(16, activation='relu')(hidden)
        output = Dense(action_count, activation='softmax')(hidden)
        self.model = Model(inputs=inp, outputs=output)
        print(self.model.summary())
        self.memory = SequentialMemory(limit=50000, window_length=WINDOW_LENGTH)
        self.policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
                                           value_max=1., value_min=.1,
                                           value_test=.05, nb_steps=1000)
        self.callbacks = self.build_callbacks("msnake")
        self.dqn = DQNAgent(model=self.model, nb_actions=action_count,
                            memory=self.memory, nb_steps_warmup=20,
                            target_model_update=1e-2, policy=self.policy)
        # Workaround for https://github.com/keras-rl/keras-rl/issues/345
        Adam._name = "fix_bug"
        # Metrics: mae, mse, accuracy; lr: learning rate
        self.dqn.compile(Adam(lr=1e-5), metrics=['mse'])

    def build_callbacks(self, env_name):
        callbacks = []
        checkpoint_weights_filename = 'dqn_' + env_name + '_weights_{step}.h5f'
        callbacks += [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=5000)]
        log_filename = 'dqn_{}_log.json'.format(env_name)
        callbacks += [FileLogger(log_filename, interval=100)]
        return callbacks
def build_agent(self, mem_file=None, w_file=None):
    # Create a dummy env to get the size of input/output.
    # Makes it simpler if we ever choose to update env shapes.
    env = TradingEnv([], "", [])
    np.random.seed(314)
    env.seed(314)
    nb_actions = env.action_space.n
    obs_dim = env.observation_space.shape[0]

    model = Sequential()
    model.add(LSTM(5, input_shape=(7, 4), return_sequences=True))  # 4 features + 1 bias term, 5 units
    model.add(Activation('tanh'))
    model.add(LSTM(4))
    model.add(Activation('tanh'))
    model.add(Dropout(0.2))
    model.add(Dense(4))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))  # best output activation for BoltzmannQPolicy

    # policy = EpsGreedyQPolicy(eps=EPS_VAL)  # off-policy
    policy = BoltzmannQPolicy()          # off-policy
    test_policy = MaxBoltzmannQPolicy()  # on-policy

    if mem_file is None:
        memory = SequentialMemory(limit=50000, window_length=7)  # returns observations of shape (7,)
    else:
        (memory, memory.actions, memory.rewards, memory.terminals,
         memory.observations) = pickle.load(open(mem_file, "rb"))

    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   gamma=GAMMA_VAL, nb_steps_warmup=100, policy=policy,
                   test_policy=test_policy)
    dqn.compile("adam", metrics=['mse'])
    if w_file is not None:
        model.load_weights(w_file)
    return dqn, env, memory
class KerasDQNAgent(Agent):
    """Wrapper around the keras-rl DQN agent."""

    _internal_agent: DQNAgent

    def __init__(self) -> None:
        super().__init__()

    def set_num_states(self, state_dimension: int, num_actions: int) -> None:
        model = self._build_model(state_dimension, num_actions)
        memory = SequentialMemory(limit=10000, window_length=1)
        self._internal_agent = DQNAgent(model=model, nb_actions=num_actions,
                                        memory=memory, nb_steps_warmup=1000,
                                        target_model_update=1000, gamma=0.99,
                                        delta_clip=1)
        self._internal_agent.compile(Adam(lr=0.0001), metrics=['mae'])

    def act(self, state: StateT) -> int:
        return self._internal_agent.forward(state)

    def update(self, state: StateT, action: int, reward: float,
               new_state: StateT) -> None:
        # keras-rl's backward() requires the reward and a terminal flag for the
        # transition cached by the preceding forward() call; the original call
        # passed no arguments. terminal=False is an assumption, since this
        # update signature carries no terminal information.
        self._internal_agent.backward(reward, terminal=False)

    def _build_model(self, state_dimension: int,
                     num_actions: int) -> Sequential:
        model = Sequential()
        model.add(Dense(units=64, input_shape=(1, state_dimension),
                        activation='relu'))
        model.add(Dense(units=64, activation='relu'))
        model.add(Flatten())
        # Note: 'linear' is the usual output activation for Q-values.
        model.add(Dense(num_actions, activation='softmax'))
        return model
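# A driver sketch for KerasDQNAgent (assumed, not from the source). In
# keras-rl, Agent.forward(observation) picks an action and caches the
# transition, and Agent.backward(reward, terminal) performs the learning step;
# act()/update() above wrap exactly that pair.
import numpy as np

agent = KerasDQNAgent()
agent.set_num_states(state_dimension=4, num_actions=2)  # hypothetical sizes
state = np.zeros(4, dtype=np.float32)
action = agent.act(state)
agent.update(state, action, reward=0.0, new_state=state)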
def test_duel_dqn():
    env = TwoRoundDeterministicRewardEnv()
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.n

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Dense(16, input_shape=(1,)))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions, activation='linear'))

    memory = SequentialMemory(limit=1000, window_length=1)
    policy = EpsGreedyQPolicy(eps=.1)
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=50, target_model_update=1e-1, policy=policy,
                   enable_double_dqn=False, enable_dueling_network=True)
    dqn.compile(Adam(lr=1e-3))

    dqn.fit(env, nb_steps=2000, visualize=False, verbose=0)
    policy.eps = 0.
    h = dqn.test(env, nb_episodes=20, visualize=False)
    assert_allclose(np.mean(h.history['episode_reward']), 3.)
def create(env):
    nb_actions = env.action_space.n

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(config.current.agent_vfn_complexity))
    model.add(Activation('relu'))
    model.add(Dense(config.current.agent_vfn_complexity))
    model.add(Activation('relu'))
    model.add(Dense(config.current.agent_vfn_complexity))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    # print(model.summary())

    # Finally, we configure and compile our agent. You can use every built-in
    # Keras optimizer and even the metrics!
    memory = SequentialMemory(limit=50000, window_length=1)
    policy = BoltzmannQPolicy()
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=1000, target_model_update=1e-2, policy=policy)
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])
    return dqn
model.add(Dropout(0.5))
model.add(Dense(1000, activation="relu"))
model.summary()

# %%
# Finally, we configure and compile our agent. You can use every built-in
# tensorflow.keras optimizer and even the metrics!
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=5000, window_length=1)
agent = DQNAgent(model=model, memory=memory, policy=policy,
                 nb_actions=nb_actions, nb_steps_warmup=500,
                 target_model_update=1e-2)
agent.compile(Adam(lr=1e-3), metrics=['mse'])

# %%
# Okay, now it's time to learn something! Visualizing the training slows it
# down quite a lot, so it is disabled here. You can always safely abort the
# training prematurely using Ctrl + C.
agent.fit(env, nb_steps=50000, visualize=False, verbose=1,
          nb_max_episode_steps=1000)

# %%
# After training is done, we save the final weights.
# (The "ddpg" file name is kept from the example this was adapted from.)
agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)
# Policies and callbacks.
class EpsDecayCallback(Callback):
    """Multiplies the policy's epsilon by decay_rate at every episode start."""

    def __init__(self, eps_policy, decay_rate=0.95):
        self.eps_policy = eps_policy
        self.decay_rate = decay_rate

    def on_episode_begin(self, episode, logs={}):
        self.eps_policy.eps *= self.decay_rate


policy = EpsGreedyQPolicy(eps=1.0)
memory = SequentialMemory(limit=500000, window_length=1)
agent = DQNAgent(model=Network(), policy=policy, memory=memory,
                 enable_double_dqn=False, nb_actions=env.action_space.n,
                 nb_steps_warmup=10, target_model_update=1e-2)
agent.compile(optimizer=Adam(lr=0.002, decay=2.25e-05), metrics=['mse'])
agent.fit(env=env, callbacks=[EpsDecayCallback(eps_policy=policy, decay_rate=0.975)],
          verbose=2, nb_steps=300000)
agent.save_weights('model.hdf5')
agent.test(env=env, nb_episodes=100, visualize=True)
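# With decay_rate=0.975, epsilon decays exponentially per episode:
# eps_n = 1.0 * 0.975**n, so eps ≈ 0.60 after 20 episodes and ≈ 0.08 after
# 100. The callback applies no floor, so long runs drive eps toward zero.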
def build_model(state_size, num_actions):
    # inputs = layers.Input(shape=(84, 84, 4,))
    inputs = layers.Input(shape=(4,) + state_size)
    layer1 = layers.Conv2D(32, 8, strides=4, activation="relu")(inputs)
    layer2 = layers.Conv2D(64, 4, strides=2, activation="relu")(layer1)
    layer3 = layers.Conv2D(64, 3, strides=1, activation="relu")(layer2)
    layer4 = layers.Flatten()(layer3)
    layer5 = layers.Dense(512, activation="relu")(layer4)
    action = layers.Dense(num_actions, activation="linear")(layer5)
    return k.Model(inputs=inputs, outputs=action)


model = build_model(state_size, num_actions)
model.summary()

policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1.,
                              value_min=.1, value_test=0.1, nb_steps=1000000)
memory = SequentialMemory(limit=1000000, window_length=4)
agent = DQNAgent(model=model, policy=policy, nb_actions=num_actions,
                 memory=memory, nb_steps_warmup=50000)
agent.compile(k.optimizers.Adam(learning_rate=.00025), metrics=['mae'])

agent.fit(env, nb_steps=100000, log_interval=1000, visualize=False, verbose=2)
agent.save_weights('policy.h5', overwrite=True)
def build_arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--reset_weights',
                        action='store_true',
                        help='reset weights on current model')
    parser.add_argument('--train',
                        action='store_true',
                        help='train with existing model')
    parser.add_argument('--visualize',
                        action='store_true',
                        help='visualize model')
    return parser.parse_args()


if __name__ == '__main__':
    args = build_arg_parser()
    env = gym.make('MountainCar-v0')
    model = build_model(env, args.reset_weights)
    dqn = DQNAgent(model=model,
                   nb_actions=env.action_space.n,
                   memory=SequentialMemory(limit=50000, window_length=1),
                   nb_steps_warmup=10,
                   target_model_update=1e-2,
                   policy=BoltzmannQPolicy())
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])
    if args.train:
        dqn.fit(env, nb_steps=150000, visualize=False, verbose=2)
        dqn.save_weights('model.mdl', overwrite=True)
    if args.visualize:
        dqn.test(env, nb_episodes=5, visualize=True)
# Finally, we configure and compile our agent. You can use every built-in
# Keras optimizer and even the metrics!
memory = SequentialMemory(limit=1000000, window_length=10)
processor = AtariProcessor()

# Select a policy. We use eps-greedy action selection, which means that a
# random action is selected with probability eps. We anneal eps from 1.0 to 0.1
# over the course of 1M steps. This is done so that the agent initially
# explores the environment (high eps) and then gradually sticks to what it
# knows (low eps). We also set a dedicated eps value that is used during
# testing. Note that we set it to 0.05 so that the agent still performs some
# random actions. This ensures that the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1.,
                              value_min=.1, value_test=.05, nb_steps=1000000)

# The trade-off between exploration and exploitation is difficult and an
# ongoing research topic. If you want, you can experiment with the parameters
# or use a different policy. Another popular one is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!

dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               processor=processor, nb_steps_warmup=50, gamma=.99,
               target_model_update=10000, train_interval=4, delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

dqn.fit(env, nb_steps=1750000, log_interval=10000, nb_max_episode_steps=50)
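# For reference, BoltzmannQPolicy samples each action with probability
# proportional to exp(q_a / tau): larger tau flattens the distribution toward
# uniform exploration, smaller tau approaches greedy selection.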
class Player:
    """Mandatory class with the player methods"""

    def __init__(self, name='DQN', load_model=None, env=None):
        """Initialization of an agent"""
        self.equity_alive = 0
        self.actions = []
        self.last_action_in_stage = ''
        self.temp_stack = []
        self.name = name
        self.autoplay = True

        self.dqn = None
        self.model = None
        self.env = env

        # if load_model:
        #     self.model = self.load_model(load_model)

    def initiate_agent(self, env, model_name=None, load_memory=None, load_model=None,
                       load_optimizer=None, load_dqn=None, batch_size=500, learn_rate=1e-3):
        """Initiate a deep Q agent"""
        # tf.compat.v1.disable_eager_execution()
        self.env = env
        nb_actions = self.env.action_space.n

        # The network is always built here; if load_model is set, its weights
        # are loaded after the agent has been compiled (see below).
        # self.model, trainable_model, target_model = self.load_model(load_model)
        self.model = Sequential()
        self.model.add(Dense(512, activation='relu', input_shape=env.observation_space))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(512, activation='relu'))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(512, activation='relu'))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(nb_actions, activation='linear'))

        # Finally, we configure and compile our agent. You can use every
        # built-in Keras optimizer and even the metrics!
        if load_memory:
            try:
                memory = self.load_memory(load_memory)
            except Exception:
                # Fall back to a fresh memory if the pickle cannot be read;
                # the original code left `memory` undefined in this branch.
                memory = SequentialMemory(limit=memory_limit, window_length=window_length)
        else:
            memory = SequentialMemory(limit=memory_limit, window_length=window_length)
        self.batch_size = batch_size

        self.policy = CustomEpsGreedyQPolicy()
        self.policy.env = self.env
        self.test_policy = CustomEpsGreedyQPolicy()
        self.test_policy.eps = 0.05
        self.test_policy.env = self.env
        self.reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=5, min_lr=1e-4)

        self.dqn = DQNAgent(model=self.model, nb_actions=nb_actions, memory=memory,
                            nb_steps_warmup=nb_steps_warmup, target_model_update=1e-2,
                            policy=self.policy, test_policy=self.test_policy,
                            processor=CustomProcessor(), batch_size=self.batch_size,
                            train_interval=train_interval, enable_double_dqn=enable_double_dqn)
        # timestr = time.strftime("%Y%m%d-%H%M%S") + "_" + str(model_name)
        # self.tensorboard = MyTensorBoard(log_dir='./Graph/{}'.format(timestr), player=self)
        self.dqn.compile(Adam(lr=learn_rate), metrics=['mae'])

        if load_model:
            self.load_model(load_model)
            # self.dqn.trainable_model = trainable_model
            # self.dqn.target_model = target_model
        if load_optimizer:
            self.load_optimizer_weights(load_optimizer)

    def start_step_policy(self, observation):
        """Custom policy for random decisions during warm-up."""
        log.info("Random action")
        _ = observation
        legal_moves_limit = [move.value for move in self.env.info['legal_moves']]
        action = np.random.choice(legal_moves_limit)
        return action

    def train(self, env_name, batch_size=500, policy_epsilon=0.2):
        """Train a model"""
        # initiate training loop
        train_vars = {'batch_size': batch_size, 'policy_epsilon': policy_epsilon}
        timestr = time.strftime("%Y%m%d-%H%M%S") + "_" + str(env_name)
        tensorboard = TensorBoard(log_dir='./Graph/{}'.format(timestr), histogram_freq=0,
                                  write_graph=True, write_images=False)

        self.dqn.fit(self.env, nb_max_start_steps=nb_max_start_steps, nb_steps=nb_steps,
                     visualize=False, verbose=2, start_step_policy=self.start_step_policy,
                     callbacks=[tensorboard])
        self.policy.eps = policy_epsilon

        self.dqn.save_weights("dqn_{}_model.h5".format(env_name), overwrite=True)

        # Save memory
        pickle.dump(self.dqn.memory, open("train_memory_{}.p".format(env_name), "wb"))

        # Save optimizer weights
        symbolic_weights = getattr(self.dqn.trainable_model.optimizer, 'weights')
        optim_weight_values = K.batch_get_value(symbolic_weights)
        pickle.dump(optim_weight_values,
                    open('optimizer_weights_{}.p'.format(env_name), "wb"))

        # # Dump dqn
        # pickle.dump(self.dqn, open("dqn_{}.p".format(env_name), "wb"))

        # Finally, evaluate our algorithm for 5 episodes.
        self.dqn.test(self.env, nb_episodes=5, visualize=False)

    def load_model(self, env_name):
        """Load a model"""
        # Load the architecture
        # with open('dqn_{}_json.json'.format(env_name), 'r') as architecture_json:
        #     dqn_json = json.load(architecture_json)
        self.dqn.load_weights("dqn_{}_model.h5".format(env_name))
        # model = keras.models.load_model("dqn_{}_model.h5".format(env_name))
        # trainable_model = keras.models.load_model("dqn_{}_trainable_model.h5".format(env_name))
        # target_model = keras.models.load_model("dqn_{}_target_model.h5".format(env_name), overwrite=True)
        # return model, trainable_model, target_model

    def load_memory(self, model_name):
        memory = pickle.load(open('train_memory_{}.p'.format(model_name), "rb"))
        return memory

    def load_optimizer_weights(self, env_name):
        optim_weights = pickle.load(open('optimizer_weights_{}.p'.format(env_name), "rb"))
        self.dqn.trainable_model.optimizer.set_weights(optim_weights)

    def play(self, nb_episodes=5, render=False):
        """Let the agent play"""
        memory = SequentialMemory(limit=memory_limit, window_length=window_length)
        policy = CustomEpsGreedyQPolicy()

        class CustomProcessor(Processor):  # pylint: disable=redefined-outer-name
            """The agent and the environment"""

            def process_state_batch(self, batch):
                """
                Given a state batch, remove the second dimension, because it is
                useless and prevents the tensor from being fed into the CNN.
                """
                return np.squeeze(batch, axis=1)

            def process_info(self, info):
                processed_info = info['player_data']
                if 'stack' in processed_info:
                    processed_info = {'x': 1}
                return processed_info

        nb_actions = self.env.action_space.n

        self.dqn = DQNAgent(model=self.model, nb_actions=nb_actions, memory=memory,
                            nb_steps_warmup=nb_steps_warmup, target_model_update=1e-2,
                            policy=policy, processor=CustomProcessor(),
                            batch_size=batch_size, train_interval=train_interval,
                            enable_double_dqn=enable_double_dqn)
        self.dqn.compile(Adam(lr=1e-3), metrics=['mae'])  # pylint: disable=no-member

        self.dqn.test(self.env, nb_episodes=nb_episodes, visualize=render)

    def action(self, action_space, observation, info):  # pylint: disable=no-self-use
        """Mandatory method that calculates the move based on the observation
        array and the action space."""
        _ = observation  # not using the observation for random decision
        _ = info
        this_player_action_space = {Action.FOLD, Action.CHECK, Action.CALL,
                                    Action.RAISE_POT, Action.RAISE_HALF_POT,
                                    Action.RAISE_2POT}
        _ = this_player_action_space.intersection(set(action_space))
        action = None
        return action
model = Sequential()
model.add(Flatten(input_shape=(1,) + state_size))
model.add(Dense(params.SIZE_HIDDEN_LAYER, activation='relu'))
model.add(Dense(params.SIZE_HIDDEN_LAYER, activation='relu'))
model.add(Dense(params.SIZE_HIDDEN_LAYER, activation='relu'))
model.add(Dense(action_size))
model.add(Activation(params.ACTIVATION_OUTPUT))

## Set up the agent for training ##
memory = SequentialMemory(limit=params.REPLAY_BUFFER_SIZE, window_length=1)
agent = DQNAgent(model=model, policy=BoltzmannQPolicy(), memory=memory,
                 nb_actions=action_size)
agent.compile(Adam(lr=params.LR_MODEL), metrics=[params.METRICS])

## Train ##
if args.train:
    check_overwrite('DQN', params.ENV, args.model)
    history = agent.fit(env, nb_steps=params.N_STEPS_TRAIN, visualize=args.visualize,
                        verbose=1, nb_max_episode_steps=env._max_episode_steps,
                        log_interval=params.LOG_INTERVAL)
    agent.save_weights(WEIGHTS_FILES, overwrite=True)
    save_plot_reward('DQN', params.ENV, history, args.model, params.PARAMS)

## Test ##
if not args.train:
def deep_q_learning():
    """Implementation of keras-rl deep Q-learning."""
    env_name = 'neuron_poker-v0'
    stack = 100
    env = gym.make(env_name, num_of_players=5, initial_stacks=stack)

    np.random.seed(123)
    env.seed(123)
    env.add_player(EquityPlayer(name='equity/50/50', min_call_equity=.5, min_bet_equity=-.5))
    env.add_player(EquityPlayer(name='equity/50/80', min_call_equity=.8, min_bet_equity=-.8))
    env.add_player(EquityPlayer(name='equity/70/70', min_call_equity=.7, min_bet_equity=-.7))
    env.add_player(EquityPlayer(name='equity/20/30', min_call_equity=.2, min_bet_equity=-.3))
    env.add_player(RandomPlayer())
    env.add_player(PlayerShell(name='keras-rl', stack_size=stack))  # shell is used for callback to keras-rl
    env.reset()

    nb_actions = len(env.action_space)

    # Next, we build a very simple model.
    from keras import Sequential
    from keras.optimizers import Adam
    from keras.layers import Dense, Dropout
    from rl.memory import SequentialMemory
    from rl.agents import DQNAgent
    from rl.policy import BoltzmannQPolicy

    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=env.observation_space))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(nb_actions, activation='linear'))
    print(model.summary())

    # Finally, we configure and compile our agent. You can use every built-in
    # Keras optimizer and even the metrics!
    memory = SequentialMemory(limit=50000, window_length=1)
    policy = BoltzmannQPolicy()
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=10, target_model_update=1e-2, policy=policy)
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])

    # Okay, now it's time to learn something! We visualize the training here
    # for show, but this slows down training quite a lot. You can always safely
    # abort the training prematurely using Ctrl + C.
    dqn.fit(env, nb_steps=50000, visualize=True, verbose=2)

    # After training is done, we save the final weights.
    dqn.save_weights('dqn_{}_weights.h5f'.format(env_name), overwrite=True)

    # Finally, evaluate our algorithm for 5 episodes.
    dqn.test(env, nb_episodes=5, visualize=True)
    # print(model.summary())
    print(model.output._keras_shape)
    return model


if __name__ == '__main__':
    env = myTGym(episode_type='0', percent_goal_profit=2, percent_stop_loss=5)
    # s1, s2, s3 = env.reset()
    # state = aggregate_state(s1, s2, s3)

    memory = SequentialMemory(limit=50000, window_length=1)
    policy = BoltzmannQPolicy()
    model = build_network()
    dqn = DQNAgent(model=model, nb_actions=2, memory=memory, nb_steps_warmup=10,
                   target_model_update=1e-2, policy=policy)
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])

    # Okay, now it's time to learn something! Visualizing the training slows it
    # down quite a lot, so it is disabled here. You can always safely abort the
    # training prematurely using Ctrl + C.
    dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

    # After training is done, we save the final weights.
    dqn.save_weights('dqn_{}_weights.h5f'.format('trading'), overwrite=True)

    # Finally, evaluate our algorithm for 5 episodes.
    dqn.test(env, nb_episodes=5, visualize=True)
class Player:
    """Mandatory class with the player methods"""

    def __init__(self, name='DQN', load_model=None, env=None):
        """Initialization of an agent"""
        self.equity_alive = 0
        self.actions = []
        self.last_action_in_stage = ''
        self.temp_stack = []
        self.name = name
        self.autoplay = True

        self.dqn = None
        self.model = None
        self.env = env

        if load_model:
            self.load(load_model)

    def initiate_agent(self, env):
        """Initiate a deep Q agent"""
        tf.compat.v1.disable_eager_execution()

        self.env = env
        nb_actions = self.env.action_space.n

        self.model = Sequential()
        self.model.add(Dense(512, activation='relu', input_shape=env.observation_space))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(512, activation='relu'))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(512, activation='relu'))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(nb_actions, activation='linear'))

        # Finally, we configure and compile our agent. You can use every
        # built-in Keras optimizer and even the metrics!
        memory = SequentialMemory(limit=memory_limit, window_length=window_length)
        policy = TrumpPolicy()

        self.dqn = DQNAgent(model=self.model, nb_actions=nb_actions, memory=memory,
                            nb_steps_warmup=nb_steps_warmup, target_model_update=1e-2,
                            policy=policy, processor=CustomProcessor(),
                            batch_size=batch_size, train_interval=train_interval,
                            enable_double_dqn=enable_double_dqn)
        self.dqn.compile(Adam(lr=1e-3), metrics=['mae'])

    def start_step_policy(self, observation):
        """Custom policy for random decisions during warm-up."""
        log.info("Random action")
        _ = observation
        action = self.env.action_space.sample()
        return action

    def train(self, env_name):
        """Train a model"""
        # initiate training loop
        timestr = time.strftime("%Y%m%d-%H%M%S") + "_" + str(env_name)
        tensorboard = TensorBoard(log_dir='./Graph/{}'.format(timestr), histogram_freq=0,
                                  write_graph=True, write_images=False)

        self.dqn.fit(self.env, nb_max_start_steps=nb_max_start_steps, nb_steps=nb_steps,
                     visualize=False, verbose=2, start_step_policy=self.start_step_policy,
                     callbacks=[tensorboard])

        # Save the architecture
        dqn_json = self.model.to_json()
        Path("dqn_results").mkdir(parents=True, exist_ok=True)
        with open("dqn_results/dqn_{}_json.json".format(env_name), "w") as json_file:
            json.dump(dqn_json, json_file)

        # After training is done, we save the final weights.
        self.dqn.save_weights('dqn_results/dqn_{}_weights.h5'.format(env_name), overwrite=True)

        # Finally, evaluate our algorithm for 5 episodes.
        self.dqn.test(self.env, nb_episodes=5, visualize=False)

    def load(self, env_name):
        """Load a model"""
        # Load the architecture
        with open('dqn_results/dqn_{}_json.json'.format(env_name), 'r') as architecture_json:
            dqn_json = json.load(architecture_json)
        self.model = model_from_json(dqn_json)
        self.model.load_weights('dqn_results/dqn_{}_weights.h5'.format(env_name))

    def play(self, nb_episodes=5, render=False):
        """Let the agent play"""
        memory = SequentialMemory(limit=memory_limit, window_length=window_length)
        policy = TrumpPolicy()

        class CustomProcessor(Processor):  # pylint: disable=redefined-outer-name
            """The agent and the environment"""

            def process_state_batch(self, batch):
                """
                Given a state batch, remove the second dimension, because it is
                useless and prevents the tensor from being fed into the CNN.
                """
                return np.squeeze(batch, axis=1)

            def process_info(self, info):
                processed_info = info['player_data']
                if 'stack' in processed_info:
                    processed_info = {'x': 1}
                return processed_info

        nb_actions = self.env.action_space.n

        self.dqn = DQNAgent(model=self.model, nb_actions=nb_actions, memory=memory,
                            nb_steps_warmup=nb_steps_warmup, target_model_update=1e-2,
                            policy=policy, processor=CustomProcessor(),
                            batch_size=batch_size, train_interval=train_interval,
                            enable_double_dqn=enable_double_dqn)
        self.dqn.compile(Adam(lr=1e-3), metrics=['mae'])  # pylint: disable=no-member

        self.dqn.test(self.env, nb_episodes=nb_episodes, visualize=render)

    def action(self, action_space, observation, info):  # pylint: disable=no-self-use
        """Mandatory method that calculates the move based on the observation
        array and the action space."""
        _ = observation  # not using the observation for random decision
        _ = info
        this_player_action_space = {Action.FOLD, Action.CHECK, Action.CALL,
                                    Action.RAISE_POT, Action.RAISE_HALF_POT,
                                    Action.RAISE_2POT}
        _ = this_player_action_space.intersection(set(action_space))
        action = None
        return action
class DeepAgent:
    """
    This algorithm tries to use a DQN agent that learns by itself, given only a gym.
    After quite some trouble with various error messages, this now at least runs and trains.
    It does not yet achieve good results.

    Best result: ???
    """

    def __init__(self, shape, initial_randomness: float, action_count: int):
        super().__init__()
        model = Sequential()
        model.add(Input(shape=shape))
        # input_shape on the Conv2D layers is redundant once an Input layer exists.
        model.add(Conv2D(8, (3, 3), activation='relu'))
        model.add(Conv2D(16, (3, 3), activation='relu'))
        model.add(Conv2D(32, (3, 3), activation='relu'))
        model.add(Flatten())
        model.add(Dense(64, activation='relu'))
        model.add(Dense(512, activation='relu'))
        model.add(Dense(action_count, activation='softmax'))
        print(model.summary())
        self.model = model
        self.callbacks = self.build_callbacks("msnake")
        self.processor = RemoveDimensionProcessor()
        self.memory = SequentialMemory(limit=50000, window_length=1)
        self.policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
                                           value_max=1., value_min=.1,
                                           value_test=.05, nb_steps=1000)
        self.dqn = DQNAgent(model=self.model, nb_actions=action_count,
                            memory=self.memory, nb_steps_warmup=10,
                            target_model_update=1e-2, policy=self.policy,
                            batch_size=1, processor=self.processor)
        # Workaround for https://github.com/keras-rl/keras-rl/issues/345
        Adam._name = "fix_bug"
        # Metrics: mae, mse, accuracy; lr: learning rate
        self.dqn.compile(Adam(lr=1e-3), metrics=['mse'])
        self.initial_randomness = initial_randomness

    def build_callbacks(self, env_name):
        callbacks = []
        checkpoint_weights_filename = 'dqn_' + env_name + '_weights_{step}.h5f'
        callbacks += [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=5000)]
        log_filename = 'dqn_{}_log.json'.format(env_name)
        callbacks += [FileLogger(log_filename, interval=100)]
        return callbacks