def test_sarsa():
    env = TwoRoundDeterministicRewardEnv()
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.n

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Dense(16, input_shape=(1,)))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions, activation='linear'))

    policy = EpsGreedyQPolicy(eps=.1)
    sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=50, policy=policy)
    sarsa.compile(Adam(lr=1e-3))

    sarsa.fit(env, nb_steps=20000, visualize=False, verbose=0)
    policy.eps = 0.
    h = sarsa.test(env, nb_episodes=20, visualize=False)
    assert_allclose(np.mean(h.history['episode_reward']), 3.)
def test_duel_dqn():
    env = TwoRoundDeterministicRewardEnv()
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.n

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Dense(16, input_shape=(1,)))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions, activation='linear'))

    memory = SequentialMemory(limit=1000, window_length=1)
    policy = EpsGreedyQPolicy(eps=.1)
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=50,
                   target_model_update=1e-1, policy=policy, enable_double_dqn=False,
                   enable_dueling_network=True)
    dqn.compile(Adam(lr=1e-3))

    dqn.fit(env, nb_steps=2000, visualize=False, verbose=0)
    policy.eps = 0.
    h = dqn.test(env, nb_episodes=20, visualize=False)
    assert_allclose(np.mean(h.history['episode_reward']), 3.)
def get_dqn_agent(side=1.0):
    '''Prepare a fresh agent.'''
    processor = DummyProcessor()
    input_shape = (3, 3)
    input_x = Input(shape=(1, ) + input_shape)

    # When instantiating the agent network, multiply the board
    # by -1 or +1 depending on which side the agent is playing.
    # This allows the agent to otherwise be agnostic to side.
    input_x_sidenorm = Lambda(lambda x: x * side)(input_x)
    input_x_flat = Flatten()(input_x_sidenorm)
    x = Dense(200)(input_x_flat)
    x = Activation('relu')(x)
    x = Dense(40)(x)
    x = Activation('relu')(x)
    x = keras.layers.concatenate([x, input_x_flat, input_x_flat])  # highway
    x = Dense(env.action_space.n)(x)
    predictions = Activation('linear')(x)
    model = keras.models.Model(inputs=input_x, outputs=predictions)
    print(model.summary())

    # See https://github.com/keras-rl/keras-rl/blob/master/examples/duel_dqn_cartpole.py
    memory = SequentialMemory(limit=50000, window_length=1)
    policy = EpsGreedyQPolicy(0.005)
    dqn = DQNAgent(model=model, nb_actions=env.action_space.n, memory=memory,
                   nb_steps_warmup=1000, target_model_update=1000, policy=policy,
                   enable_double_dqn=True)
    dqn.compile(Adam(lr=1e-4), metrics=['mae'])
    return dqn
def build_dqn(hps, input_dim):
    """Create a DQN agent to be used on lund inputs."""
    print('[+] Constructing DQN agent, model setup:')
    pprint.pprint(hps)

    # set up the DQN agent
    model = build_model(hps, input_dim)
    memory = SequentialMemory(limit=500000, window_length=1)
    if hps["policy"] == "boltzmann":
        policy = BoltzmannQPolicy()
    elif hps["policy"] == "epsgreedyq":
        policy = EpsGreedyQPolicy()
    else:
        raise ValueError("Invalid policy: %s" % hps["policy"])
    duelnet = hps["enable_dueling_network"]
    doubdqn = hps["enable_double_dqn"]
    agent = DQNAgentGroom(model=model, nb_actions=2,
                          enable_dueling_network=duelnet,
                          enable_double_dqn=doubdqn,
                          memory=memory, nb_steps_warmup=500,
                          target_model_update=1e-2, policy=policy)

    if hps['optimizer'] == 'Adam':
        opt = Adam(lr=hps['learning_rate'])
    elif hps['optimizer'] == 'SGD':
        opt = SGD(lr=hps['learning_rate'])
    elif hps['optimizer'] == 'RMSprop':
        opt = RMSprop(lr=hps['learning_rate'])
    elif hps['optimizer'] == 'Adagrad':
        opt = Adagrad(lr=hps['learning_rate'])
    else:
        # Mirror the policy check above so an unknown optimizer fails loudly
        # instead of raising an UnboundLocalError at compile time.
        raise ValueError("Invalid optimizer: %s" % hps['optimizer'])
    agent.compile(opt, metrics=['mae'])
    return agent
def main():
    ENV_NAME = 'BreakoutDeterministic-v4'
    INPUT_SHAPE = (84, 84)
    WINDOW_LENGTH = 4

    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    np.random.seed(42)
    env.seed(42)
    num_actions = env.action_space.n

    model = build_model(INPUT_SHAPE, num_actions)
    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
    processor = AtariProcessor()
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                                  value_test=.05, nb_steps=1000000)
    dqn = DQNAgent(model=model, nb_actions=num_actions, policy=policy, memory=memory,
                   processor=processor, nb_steps_warmup=50000, gamma=.99,
                   target_model_update=10000, train_interval=4, delta_clip=1.)
    dqn.compile(Adam(lr=.00025), metrics=['mae'])
    callbacks = build_callbacks(ENV_NAME)

    # Load the weights that were saved at the end of training.
    dqn.load_weights('dqn_BreakoutDeterministic-v4_weights_1750000.h5f')

    # Finally, evaluate our algorithm for 10 episodes.
    dqn.test(env, nb_episodes=10, visualize=True)
def __init__(self, observation_shape, nb_actions, eps_steps):
    # First, we build a very simple NN model.
    model = Sequential()
    model.add(Flatten(input_shape=(1, ) + observation_shape))
    model.add(Dense(16))
    model.add(Activation("relu"))
    model.add(Dense(16))
    model.add(Activation("relu"))
    model.add(Dense(16))
    model.add(Activation("relu"))
    model.add(Dense(nb_actions))
    model.add(Activation("linear"))
    print(model.summary())

    # Next, we configure and compile our agent. You can use every
    # built-in Keras optimizer and even the metrics!
    memory = SequentialMemory(limit=50000, window_length=1)
    # policy = BoltzmannQPolicy()
    policy = LinearAnnealedPolicy(
        EpsGreedyQPolicy(),
        attr="eps",
        value_max=1.0,
        value_min=0.1,
        value_test=0.05,
        nb_steps=eps_steps,
    )
    self.dqn = DQNAgent(
        model=model,
        nb_actions=nb_actions,
        memory=memory,
        nb_steps_warmup=1000,
        target_model_update=1e-2,
        policy=policy,
    )
    self.dqn.compile(Adam(lr=1e-3), metrics=["mae"])
def initiate_agent(self, env):
    """Initiate a deep Q agent."""
    tf.compat.v1.disable_eager_execution()

    self.env = env
    nb_actions = self.env.action_space.n

    self.model = Sequential()
    self.model.add(Dense(512, activation='relu', input_shape=env.observation_space))
    self.model.add(Dropout(0.2))
    self.model.add(Dense(512, activation='relu'))
    self.model.add(Dropout(0.2))
    self.model.add(Dense(512, activation='relu'))
    self.model.add(Dropout(0.2))
    self.model.add(Dense(nb_actions, activation='linear'))

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics! The remaining hyperparameters (memory_limit, window_length,
    # nb_steps_warmup, batch_size, train_interval, enable_double_dqn,
    # enable_dueling_network) are module-level constants.
    memory = SequentialMemory(limit=memory_limit, window_length=window_length)
    policy = EpsGreedyQPolicy()
    self.dqn = DQNAgent(model=self.model, nb_actions=nb_actions, memory=memory,
                        nb_steps_warmup=nb_steps_warmup, target_model_update=1e-2,
                        policy=policy, processor=CustomProcessor(), batch_size=batch_size,
                        train_interval=train_interval, enable_double_dqn=enable_double_dqn,
                        enable_dueling_network=enable_dueling_network)
    self.dqn.compile(Adam(lr=1e-3), metrics=['mae'])
def bootstrapped_train(env):
    model = default_model(env)
    # model = lstm_model(env)
    policy = EpsGreedyQPolicy(eps=0.1)
    memory = SequentialMemory(limit=100000, window_length=1)
    dqn = DQNAgent(
        model=model,
        nb_actions=env.action_space.n,
        memory=memory,
        nb_steps_warmup=10,
        target_model_update=1e-2,
        policy=policy,
    )
    dqn.compile(Adam(lr=1e-3), metrics=["mae"])
    print(model.summary())

    dqn.fit(env, nb_steps=5000, visualize=False, verbose=1)
    env.reset()
    dqn.test(env, nb_episodes=5, visualize=True)
    env.close()
def dqn_learn(env, n_episodes, alpha, verbose=False, render=False, save_model=False, **kwargs):
    n_actions = env.action_space.n

    # defines NN architecture
    model = Sequential()
    model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    model.add(Dense(500))
    model.add(Activation('relu'))
    model.add(Dense(200))
    model.add(Activation('relu'))
    model.add(Dense(100))
    model.add(Activation('relu'))
    model.add(Dense(n_actions))
    model.add(Activation('linear'))

    memory = SequentialMemory(limit=1000000, window_length=1)
    policy = EpsGreedyQPolicy()
    dqn = DQNAgent(model=model, nb_actions=n_actions, policy=policy, memory=memory,
                   nb_steps_warmup=30000, target_model_update=1e-2)
    dqn.compile(Adam(lr=alpha), metrics=['mae'])

    dqn.fit(env, nb_steps=n_episodes, log_interval=1e5)
    dqn.save_weights(SAVED_MODEL_PATH, overwrite=save_model)
def make_deep_q_network(env, args):
    model = KerasModelBuilder(input_shape=args["input_shape"],
                              input_window_length=args["input_window_length"],
                              action_number=env.action_space.n,
                              hidden_layer_size=args["hidden_layer_size"],
                              random_seed=args["random_seed"]).build()
    memory = SequentialMemory(limit=args["replay_memory_size"],
                              window_length=args["input_window_length"])
    processor = AtariProcessor(args["input_shape"])
    policy = CheckpointAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
                                      value_max1=args["starting_epslon"],
                                      value_min1=args["annealed_epslon1"],
                                      value_max2=args["annealed_epslon1"],
                                      value_min2=args["annealed_epslon2"],
                                      value_test=args["annealed_epslon2"],
                                      nb_steps1=args["annealed_steps1"],
                                      nb_steps2=args["annealed_steps2"],
                                      starting_step=args["starting_step"])
    dqn = DQNAgent(model=model, nb_actions=env.action_space.n, policy=policy, memory=memory,
                   processor=processor, nb_steps_warmup=args["replay_memory_starting_size"],
                   gamma=args["discount_factor"],
                   target_model_update=args["target_update_frequency"],
                   enable_dueling_network=args["dueling"], enable_double_dqn=args["double_dqn"],
                   train_interval=args["gradient_update_frequency"], delta_clip=1.)
    dqn.compile(Adam(lr=args["learning_rate"]), metrics=['mae'])
    return dqn
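# CheckpointAnnealedPolicy is a custom policy that is not shown in this snippet. A minimal
# sketch of the two-phase schedule its arguments suggest -- the parameter names are taken
# from the call above, but the piecewise-linear interpretation is an assumption, not the
# project's actual implementation:
def two_phase_epsilon(step, value_max1, value_min1, value_max2, value_min2,
                      nb_steps1, nb_steps2, starting_step=0):
    """Piecewise-linear epsilon: anneal value_max1 -> value_min1 over nb_steps1 steps,
    then value_max2 -> value_min2 over the following nb_steps2 steps."""
    step = max(0, step - starting_step)
    if step < nb_steps1:
        frac = step / float(nb_steps1)
        return value_max1 + frac * (value_min1 - value_max1)
    step -= nb_steps1
    if step < nb_steps2:
        frac = step / float(nb_steps2)
        return value_max2 + frac * (value_min2 - value_max2)
    return value_min2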
def dyna_train(cfg, nb_actions, ml_model, model_truncated, sequence_length, hstate_size,
               processor):
    env2 = SynthEnv(ml_model, model_truncated, cfg.env, processor, sequence_length,
                    cfg.WINDOW_LENGTH)

    hidden_in = Input(shape=(1, hstate_size), name='hidden_input')
    hidden_in_f = Flatten(name='flat_hidden')(hidden_in)
    dense_out = Dense(512, activation='relu')(hidden_in_f)
    q_out = Dense(nb_actions, activation='linear')(dense_out)
    model2 = Model(inputs=[hidden_in], outputs=[q_out])
    print(model2.summary())

    memory2 = SequentialMemory(limit=cfg.memory_limit, window_length=1)
    policy2 = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                                   value_test=.05, nb_steps=cfg.nb_steps_annealed_policy)
    dqn2 = DQNAgent(model=model2, nb_actions=nb_actions, policy=policy2, memory=memory2,
                    nb_steps_warmup=cfg.nb_steps_warmup_dqn_agent, gamma=.99,
                    target_model_update=cfg.target_model_update_dqn_agent,
                    train_interval=4, delta_clip=1.)
    dqn2.compile(Adam(lr=.00025), metrics=['mae'])

    '''dyna_weights_filename = 'dyna_dqn_{}_weights.h5f'.format(env_name)
    dyna_checkpoint_weights_filename = 'dyna_dqn_' + env_name + '_weights_{step}.h5f'
    dyna_log_filename = 'dyna_dqn_{}_log.json'.format(env_name)
    dyna_callbacks = [ModelIntervalCheckpoint(dyna_checkpoint_weights_filename, interval=250000)]
    dyna_callbacks += [FileLogger(dyna_log_filename, interval=100)]'''
    dqn2.fit(env2, nb_steps=cfg.nb_steps_dqn_fit, log_interval=10000)  # callbacks=dyna_callbacks,
    return dqn2
def __init__(self, stock: str):
    self.env = gym.make('stockenv-v0', df=read_daily_data(stock))
    print(self.env)
    print(self.env.action_space)
    print(self.env.observation_space)
    self.env.seed(123)
    self.stock = stock

    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                                  value_test=.05, nb_steps=1000000)
    processor = StockProcessor(stock)

    model = self.create_model(30)
    print("output:", model.output.shape)
    print("output2:", self.env.action_space.shape)
    print(list(model.output.shape))
    print(list((None, self.env.action_space.shape)))

    self.dqn = DQNAgent(model=model, nb_actions=self.env.action_space.n, policy=policy,
                        memory=memory, processor=processor, nb_steps_warmup=50000, gamma=.99,
                        target_model_update=10000, train_interval=4, delta_clip=1.)
    self.dqn.compile(Adam(lr=.00025), metrics=['mae'])
def train():
    env = gym.make('CartPole-v0')
    model = model_gen(env)

    memory = SequentialMemory(limit=50000, window_length=1)  # replay memory
    # epsilon-greedy exploration
    policy = EpsGreedyQPolicy(eps=0.001)
    dqn = DQNAgent(model=model, nb_actions=env.action_space.n, gamma=0.99, memory=memory,
                   nb_steps_warmup=10, target_model_update=1e-2, policy=policy)
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])

    history = dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)
    with open('data/cartpole_history.json', 'w') as f:
        json.dump(history.history, f)
    dqn.save_weights('data/cartpole_dqn.hdf5')
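# The history saved above can be inspected offline. A minimal sketch of loading and
# plotting it -- 'episode_reward' is the per-episode key keras-rl's fit() records;
# matplotlib is assumed to be available:
import json

import matplotlib.pyplot as plt

with open('data/cartpole_history.json') as f:
    history = json.load(f)

plt.plot(history['episode_reward'])
plt.xlabel('episode')
plt.ylabel('episode reward')
plt.show()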
def __init__(self, model, policy=EpsGreedyQPolicy(), enable_double_dqn=True, target_model=None,
             policy_model=None, nb_max_steps_recurrent_unrolling=100, *args, **kwargs):
    super(RecurrentDQNAgent, self).__init__(*args, **kwargs)

    # Validate (important) input.
    if hasattr(model.output, '__len__') and len(model.output) > 1:
        raise ValueError('Model "{}" has more than one output. DQN expects a model that has a single output.'.format(model))
    if model.output._keras_shape[-1] != self.nb_actions:
        raise ValueError('Model output "{}" has invalid shape. DQN expects a model that has one dimension for each action, in this case {}.'.format(model.output, self.nb_actions))

    # Validate settings for recurrent DQN.
    self.is_recurrent = True
    if self.is_recurrent:
        if enable_double_dqn:
            raise ValueError('DoubleDQN (`enable_double_dqn = True`) is currently not supported for recurrent Q learning.')
        memory = kwargs['memory']
        if not memory.is_episodic:
            raise ValueError('Recurrent Q learning requires an episodic memory. You are trying to use it with memory={} instead.'.format(memory))
        if nb_max_steps_recurrent_unrolling and not model.stateful:
            raise ValueError('Recurrent Q learning with max. unrolling requires a stateful model.')
        if policy_model is None or not policy_model.stateful:
            raise ValueError('Recurrent Q learning requires a separate stateful policy model with batch_size=1. Please refer to an example to see how to properly set it up.')

    # Parameters.
    self.enable_double_dqn = enable_double_dqn
    self.nb_max_steps_recurrent_unrolling = nb_max_steps_recurrent_unrolling

    # Related objects.
    self.model = model
    self.target_model = target_model
    self.policy_model = policy_model if policy_model is not None else model
    self.policy = policy

    # State.
    self.reset_states()
def __init__(self, model, nb_actions, policy=None, test_policy=None, gamma=.99,
             nb_steps_warmup=10, train_interval=1, delta_clip=np.inf, *args, **kwargs):
    super(SarsaAgent, self).__init__(*args, **kwargs)

    # Do not use defaults in the constructor because that would mean that each instance
    # shares the same policy.
    if policy is None:
        policy = EpsGreedyQPolicy()
    if test_policy is None:
        test_policy = GreedyQPolicy()

    self.model = model
    self.nb_actions = nb_actions
    self.policy = policy
    self.test_policy = test_policy  # was `policy`, which silently discarded the test policy
    self.gamma = gamma
    self.nb_steps_warmup = nb_steps_warmup
    self.train_interval = train_interval
    self.delta_clip = delta_clip

    self.compiled = False
    self.actions = None
    self.observations = None
    self.rewards = None
def main(env_name, nb_steps):
    # Get the environment and extract the number of actions.
    env = gym.make(env_name)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n

    input_shape = (1,) + env.observation_space.shape
    model = create_nn_model(input_shape, nb_actions)

    # Finally, we configure and compile our agent.
    # DQN needs a replay memory it can sample transitions from;
    # EpisodeParameterMemory is meant for CEM, so use SequentialMemory here.
    memory = SequentialMemory(limit=10000, window_length=1)
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                                  value_test=.05, nb_steps=1000000)
    agent = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
                     nb_steps_warmup=1000000, gamma=.99, target_model_update=1000,
                     train_interval=4, delta_clip=1.)
    agent.compile(Adam(lr=.00025), metrics=['mae'])
    agent.fit(env, nb_steps=nb_steps, visualize=False, verbose=1)

    # After training is done, we save the final weights.
    agent.save_weights(f'dqn_{env_name}_params.h5f', overwrite=True)

    # Finally, evaluate the agent.
    history = agent.test(env, nb_episodes=100, visualize=False)
    rewards = np.array(history.history['episode_reward'])
    print(("Test rewards (#episodes={}): mean={:>5.2f}, std={:>5.2f}, "
           "min={:>5.2f}, max={:>5.2f}")
          .format(len(rewards), rewards.mean(), rewards.std(), rewards.min(), rewards.max()))
def main():
    ENV_NAME = 'LunarLander-v2'

    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    np.random.seed(42)
    env.seed(42)
    num_actions = env.action_space.n
    state_space = env.observation_space.shape[0]
    print(num_actions)

    model = build_model(state_space, num_actions)
    memory = SequentialMemory(limit=50000, window_length=1)
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                                  value_test=.05, nb_steps=10000)
    dqn = DQNAgent(model=model, nb_actions=num_actions, memory=memory, nb_steps_warmup=10,
                   target_model_update=1e-2, policy=policy)
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])
    callbacks = build_callbacks(ENV_NAME)

    # Load the weights that were saved at the end of training.
    dqn.load_weights('dqn_LunarLander-v2_weights_510000.h5f')

    # Finally, evaluate our algorithm for 10 episodes.
    dqn.test(env, nb_episodes=10, visualize=True)
def main():
    # Get the environment and extract the number of actions.
    environment_name = "FlappyBird-v0"
    environment = gym.make(environment_name)
    np.random.seed(666)
    nb_actions = environment.action_space.n

    # Build the model.
    model = build_model((WINDOW_LENGTH, ) + INPUT_SHAPE, nb_actions)
    print(model.summary())

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
    processor = FlappyBirdProcessor()

    # Select a policy. We use eps-greedy action selection, which means that a random action is selected
    # with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that
    # the agent initially explores the environment (high eps) and then gradually sticks to what it knows
    # (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05
    # so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                                  value_test=.05, nb_steps=1000000)

    # The trade-off between exploration and exploitation is difficult and an on-going research topic.
    # If you want, you can experiment with the parameters or use a different policy. Another popular one
    # is Boltzmann-style exploration:
    # policy = BoltzmannQPolicy(tau=1.)
    # Feel free to give it a try!

    dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
                   processor=processor, nb_steps_warmup=50000, gamma=.99,
                   target_model_update=10000, train_interval=4, delta_clip=1.)
    dqn.compile(optimizers.Adam(lr=.00025), metrics=['mae'])

    weights_filename = 'dqn_{}_weights.h5f'.format(environment_name)

    # Okay, now it's time to learn something! We capture the interrupt exception so that training
    # can be prematurely aborted. Notice that now you can use the built-in Keras callbacks!
    checkpoint_weights_filename = 'dqn_' + environment_name + '_weights_{step}.h5f'
    log_filename = 'dqn_{}_log.json'.format(environment_name)
    callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)]
    callbacks += [TensorboardCallback()]
    callbacks += [FileLogger(log_filename, interval=100)]
    dqn.fit(environment, callbacks=callbacks, nb_steps=1750000, log_interval=10000)

    # After training is done, we save the final weights one more time.
    dqn.save_weights(weights_filename, overwrite=True)

    # Finally, evaluate our algorithm for 10 episodes.
    dqn.test(environment, nb_episodes=10, visualize=False)
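# The linear annealing described in the policy comment reduces eps by a fixed amount per
# step. A minimal sketch of the schedule that LinearAnnealedPolicy's parameters describe
# (formula reconstructed from the arguments; not keras-rl's actual implementation):
def annealed_eps(step, value_max=1.0, value_min=0.1, nb_steps=1000000):
    """Linearly interpolate eps from value_max down to value_min over nb_steps."""
    frac = min(float(step) / nb_steps, 1.0)
    return value_max + frac * (value_min - value_max)

# e.g. annealed_eps(0) == 1.0, annealed_eps(500000) == 0.55, annealed_eps(10**6) == 0.1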
def main(model_name, options):
    # Initialize maze environments.
    env = gym.make('Pong-v0')
    # env = gym.make('Taxi-v2')
    envs = [env]

    # Setting hyperparameters.
    nb_actions = env.action_space.n
    maze_dim = (6400, 1)
    h_size = 64  # For DQN
    e_t_size = 64  # For MQN / RMQN
    context_size = 64
    nb_steps_warmup = int(1e5)
    nb_steps = int(4e5)
    buffer_size = 8e4
    learning_rate = 0.003
    target_model_update = 0.999
    clipnorm = 10.
    switch_rate = 50
    window_length = 12
    memory_size = None

    # Callbacks
    log = TrainEpisodeLogger()
    # tensorboard = TensorBoard(log_dir="./logs/{}".format(model_name))
    rl_tensorboard = RLTensorBoard(log_dir="./logs/{}".format(model_name), histogram_freq=100)
    callbacks = [log, rl_tensorboard]

    ### Models ###
    model = None
    target_model = None

    # MQN model.
    if "MQN" in options:
        memory_size = 12
        model = MQNmodel(e_t_size, context_size, memory_size, window_length, nb_actions, maze_dim)
        target_model = MQNmodel(e_t_size, context_size, memory_size, window_length, nb_actions, maze_dim)

    # RMQN model.
    if "RMQN" in options:
        memory_size = 12
        model = RMQNmodel(e_t_size, context_size, memory_size, window_length, nb_actions, maze_dim)
        target_model = RMQNmodel(e_t_size, context_size, memory_size, window_length, nb_actions, maze_dim)

    # Distributional MQN model.
    nb_atoms = 51
    v_min = -2.
    v_max = 2.
    # model = DistributionalMQNModel(e_t_size, context_size, window_length, nb_actions, nb_atoms, obs_dimensions)
    # target_model = DistributionalMQNModel(e_t_size, context_size, window_length, nb_actions, nb_atoms, obs_dimensions)

    # DQN model.
    if "DQN" in options:
        model = DQNmodel(nb_actions, window_length, h_size, maze_dim)
        target_model = DQNmodel(nb_actions, window_length, h_size, maze_dim)

    # Initialize our target model with the same weights as our model.
    target_model.set_weights(model.get_weights())

    # Initialize memory buffer for DQN algorithm.
    experience = [
        SequentialMemory(limit=int(buffer_size / len(envs)), window_length=window_length)
        for i in range(len(envs))
    ]

    # Learning policy where we initially begin training our agent by making random moves
    # with a probability of 1, and linearly decrease that probability down to 0.1 over the
    # course of some arbitrary number of steps (nb_steps).
    policy = LinearAnnealedPolicy(inner_policy=EpsGreedyQPolicy(), attr="eps", value_max=1.0,
                                  value_min=0.1, value_test=0., nb_steps=1e5)

    # Optional processor.
    processor = PongProcessor()
    # processor = MazeProcessor()

    # Initialize and compile the DQN agent.
    dqn = DQNAgent(model=model, target_model=target_model, nb_actions=nb_actions,
                   memory=experience, nb_steps_warmup=nb_steps_warmup,
                   target_model_update=target_model_update, policy=policy,
                   processor=processor, batch_size=8)

    # Initialize experimental Distributional DQN Agent.
    '''
    dqn = DistributionalDQNAgent(model=model, target_model=target_model, num_atoms=nb_atoms,
                                 v_min=v_min, v_max=v_max, nb_actions=nb_actions,
                                 memory=experience, nb_steps_warmup=nb_steps_warmup,
                                 target_model_update=target_model_update, policy=policy,
                                 # processor=processor,
                                 batch_size=32)
    '''

    # Compile the agent to check for validity, build tensorflow graph, etc.
    dqn.compile(RMSprop(lr=learning_rate, clipnorm=clipnorm), metrics=["mae"])

    # Weights will be loaded if the weight file exists.
    if os.path.exists("data/{}/{}".format(model_name, model_name + ".h5")):
        dqn.load_weights("data/{}/{}".format(model_name, model_name + ".h5"))

    # Train DQN in environment.
if "train" in options: dqn.fit(env, nb_steps=nb_steps, verbose=0, callbacks=callbacks) # Visualization / Logging Tools logmetrics(log, model_name) logHyperparameters(model_name, e_t_size=e_t_size, context_size=context_size, h_size=h_size, memory_size=memory_size, learning_rate=learning_rate, target_model_update=target_model_update, clipnorm=clipnorm, window_length=window_length, nb_atoms=nb_atoms, v_min=v_min, v_max=v_max) # Save weights. dqn.save_weights("data/{}/{}".format(model_name, model_name + ".h5")) # Test DQN in environment. if "test" in options: dqn.test(env, nb_episodes=100, visualize=True) #Debugging if "debug" in options: observation = env.reset() outputLayer(dqn.model, np.array(experience[0].sample(32)[0].state0)) #visualizeLayer(dqn.model, dqn.layers[1], observation) return
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
processor = AtariProcessor()

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that
# the agent initially explores the environment (high eps) and then gradually sticks to what it knows
# (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05
# so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                              value_test=.05, nb_steps=1000000)

# The trade-off between exploration and exploitation is difficult and an on-going research topic.
# If you want, you can experiment with the parameters or use a different policy. Another popular one
# is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!

dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               processor=processor, nb_steps_warmup=50000, gamma=.99,
               target_model_update=10000, train_interval=4, delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

if args.mode == 'train':
    # Okay, now it's time to learn something! We capture the interrupt exception so that training
model.add(Dense(7))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# log all metrics to TensorBoard
tb_callback = TensorBoard(log_dir='./logs/s4a11r5/run3')
test_callback = TestCallback(tb_callback)
callbacks = [tb_callback, test_callback]

memory = SequentialMemory(limit=50000, window_length=1)
policy = EpsGreedyQPolicy(eps=0.2)
# also tried LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.01,
#                                 value_test=.05, nb_steps=100000)
dqn = OriginalDQNAgent(model=model, nb_actions=nb_actions, memory=memory, policy=policy,
                       target_model_update=1e-2, gamma=0.999)
# other params ever tried: target_model_update = 10000 (steps), gamma = 0.9
dqn.compile(Adam(lr=.008), metrics=['mae'])  # also tried lr = 0.01, 0.001, 0.0001

dqn.fit(env, nb_steps=840000, visualize=True, verbose=2, callbacks=callbacks)  # main training call
"""
This is the main process of the dqn.fit function:

while step < nb_steps:
    call env.reset() to get the initial state
    compute the action in dqn.forward()
    call env.step() to execute the action and get the tuple (s', a, r, done, info)
    call dqn.backward() to train the NN and get the metrics
"""
# otherwise create a fresh replay memory
except:
    memory = SequentialMemory(limit=MEMORY_SIZE, window_length=STATES)

# try to load a saved agent
try:
    agent_name = fnmatch.filter(os.listdir(daten_pfad), '*_agent.pkl')[-1]
    nb_training_steps = int(agent_name.split("_")[0])
    NUM_STEPS = NUM_STEPS - nb_training_steps  # number of remaining steps
    agent_filepath = daten_pfad + '/' + agent_name
    dqn = pickle.load(open(agent_filepath, "rb"))

# otherwise create a new agent
except:
    TRAIN_POLICY = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=0.05,
                                        value_min=0.05, value_test=0.01,
                                        nb_steps=NUM_STEPS_ANNEALED)
    TEST_POLICY = EpsGreedyQPolicy(eps=.01)
    dqn = DQNAgent(model=model, nb_actions=NUM_ACTIONS_PRO_AGENT, test_policy=TEST_POLICY,
                   policy=TRAIN_POLICY, memory=memory, processor=processor,
                   nb_steps_warmup=NUM_STEPS_WARMUP, gamma=.99,
                   target_model_update=TARGET_MODEL_UPDATE,
data_format="channels_last")(layer0) layer2 = layers.Conv2D(64, 4, strides=2, activation="relu", data_format="channels_last")(layer1) layer3 = layers.Conv2D(64, 3, strides=1, activation="relu", data_format="channels_last")(layer2) layer4 = layers.Flatten()(layer3) layer5 = layers.Dense(512, activation="relu")(layer4) action = layers.Dense(actions, activation="linear")(layer5) return K.Model(inputs=inputs, outputs=action) if __name__ == '__main__': env = gym.make('BreakoutNoFrameskip-v4') state = env.reset() actions = env.action_space.n model = create_q_model(actions) memory = SequentialMemory(limit=1000000, window_length=4) policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05, nb_steps=850000) process = AtariProcessor() dqn = DQNAgent(model=model, nb_actions=actions, memory=memory, nb_steps_warmup=50000, target_model_update=10000, policy=policy, processor=process, train_interval=4, gamma=.99, delta_clip=1.) dqn.compile(optimizer=Adam(lr=0.00025), metrics=['mae', 'accuracy']) callback = [ModelIntervalCheck('policy.h5', 1000, 1, model)] dqn.fit(env, nb_steps=1750000, callbacks=callback, visualize=True) model.save("policy.h5")
def train(
        self,
        env,
        input_fn,
        max_steps=10000,
        policy=EpsGreedyQPolicy(),
        memory=SequentialMemory(limit=1000, window_length=1),
        target_model_update=10000,
        gamma=0.99,
        warmup_steps=None,
        batch_size=64,
        summary_steps=100,
        save_steps=10000,
        visualize=False,
        seed=None,
):
    min_memory = max(warmup_steps, batch_size) if warmup_steps is not None else batch_size

    with tf.Graph().as_default() as graph:
        #####################
        # config
        #####################
        if seed is not None:
            tf.set_random_seed(seed)

        global_step = tf.train.get_or_create_global_step()

        #####################
        # inputs
        #####################
        inputs = input_fn()
        state0_t, reward_t, terminal_t, action_t, state1_t = [
            inputs[x] for x in ["state0", "reward", "terminal", "action", "state1"]
        ]
        print(inputs)

        #####################
        # model_fn
        #####################
        with tf.variable_scope("Model") as model_scope:
            model_inputs = dict(state=inputs["state0"])
            model_q_values = self.model_fn(model_inputs, tf.estimator.ModeKeys.TRAIN, self.params)

        with tf.variable_scope("Model", reuse=True) as predict_scope:
            model_inputs = dict(state=inputs["state0"])
            predict_q_values = self.model_fn(model_inputs, tf.estimator.ModeKeys.PREDICT, self.params)

        with tf.variable_scope("TargetModel") as target_scope:
            target_model_inputs = dict(state=inputs["state1"])
            target_q_values = self.model_fn(target_model_inputs, tf.estimator.ModeKeys.PREDICT, self.params)

        # get variables
        model_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=model_scope.name)
        target_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=target_scope.name)

        not_terminal = 1.0 - tf.cast(inputs["terminal"], tf.float32)
        action_values = tf.reduce_max(target_q_values, axis=1)
        target_values = inputs["reward"] + gamma * action_values * not_terminal

        assert action_values.get_shape().as_list() == inputs["reward"].get_shape().as_list()
        assert action_values.get_shape().as_list() == not_terminal.get_shape().as_list()

        model_action_values = utils.select_columns(model_q_values, inputs["action"])

        # huber_loss registers itself in the losses collection, which get_total_loss reads.
        tf.losses.huber_loss(target_values, model_action_values, delta=100.0)
        # loss = tf.reduce_mean(loss)
        loss = tf.losses.get_total_loss()

        optimizer = tf.train.AdamOptimizer(learning_rate=self.params.learning_rate)

        with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            train_op = optimizer.minimize(
                loss,
                global_step=tf.train.get_global_step(),
                var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           scope=model_scope.name),
            )

        tf.summary.scalar("target", tf.reduce_mean(target_values))

        #####################
        # train stuff
        #####################
        tf.summary.scalar("loss", loss)
        train_summaries = tf.summary.merge_all()

        if target_model_update >= 1:
            update_target_op = tf.cond(
                # global_step % target_model_update == 0
                tf.equal(tf.mod(tf.train.get_global_step(), target_model_update), 0),
                lambda: update_target_weights_hard(target_variables, model_variables),
                lambda: tf.no_op(),
            )
        else:
            update_target_op = update_target_weights_soft(target_variables, model_variables,
                                                          target_model_update)

        final_train_op = tf.group(train_op, update_target_op)

        #####################
        # episode stuff
        #####################
        episode_length_t = tf.placeholder(tf.int32, name="episode_length")
        episode_reward_t = tf.placeholder(tf.int32, name="episode_reward")

        episode_length_summary = tf.summary.scalar("episode_length", episode_length_t)
        episode_reward_summary = tf.summary.scalar("episode_reward", episode_reward_t)

        final_episode_op = tf.group()
        episode_summaries = tf.summary.merge([episode_length_summary,
                                              episode_reward_summary])

        #####################
        # initializers
        #####################
        global_variables_initializer = tf.global_variables_initializer()
        saver = tf.train.Saver()
        writer = tf.summary.FileWriter(self.model_dir)

    with graph.as_default(), tf.Session(graph=graph) as sess:
        utils.initialize_or_restore(sess, self.model_dir, global_variables_initializer)
        graph.finalize()

        current_step = sess.run(global_step)
        state0 = env.reset()
        _episode_length = 0
        _episode_reward = 0.0

        for step in range(current_step, max_steps):
            step_feed = {state0_t: [state0]}
            predictions = sess.run(predict_q_values, step_feed)
            action = policy.select_action(q_values=predictions[0])
            state1, reward, terminal, _info = env.step(action)

            if visualize:
                env.render()

            _episode_length += 1
            _episode_reward += reward

            memory.append(state0, action, reward, terminal)

            train_fetches = {}
            train_feed = {}

            if memory.nb_entries > min_memory:
                experiences = memory.sample(batch_size)
                experiences = [list(x) for x in zip(*experiences)]
                state0_a, action_a, reward_a, state1_a, terminal_a = experiences
                state0_a = np.squeeze(state0_a)
                state1_a = np.squeeze(state1_a)

                train_feed.update({
                    state0_t: state0_a,
                    action_t: action_a,
                    reward_t: reward_a,
                    state1_t: state1_a,
                    terminal_t: terminal_a,
                })
                train_fetches["train_op"] = final_train_op

                if step % summary_steps == 0:
                    train_fetches["train_summaries"] = train_summaries

                if step % save_steps == 0:
                    checkpoint_path = os.path.join(self.model_dir, "model.ckpt")
                    saver.save(sess, checkpoint_path, global_step=step)

            if terminal:
                train_feed[episode_length_t] = _episode_length
                train_feed[episode_reward_t] = _episode_reward
                train_fetches["episode_op"] = final_episode_op
                train_fetches["episode_summaries"] = episode_summaries

            # do training
            results = sess.run(train_fetches, train_feed)

            if "train_summaries" in results:
                writer.add_summary(results["train_summaries"], step)
            if "episode_summaries" in results:
                writer.add_summary(results["episode_summaries"], step)

            # end step
            if terminal:
                state0 = env.reset()
                _episode_length = 0
                _episode_reward = 0.0
            else:
                state0 = state1
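# update_target_weights_hard / update_target_weights_soft are referenced above but not
# defined in this snippet. A minimal sketch of what they might look like in TF1-style
# code -- the names and signatures follow the call sites, so this is an assumption, not
# the project's actual helpers:
import tensorflow as tf

def update_target_weights_hard(target_variables, model_variables):
    # Copy the online network's weights into the target network verbatim.
    return tf.group(*[t.assign(m) for t, m in zip(target_variables, model_variables)])

def update_target_weights_soft(target_variables, model_variables, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * model.
    return tf.group(*[t.assign((1.0 - tau) * t + tau * m)
                      for t, m in zip(target_variables, model_variables)])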
    # Pass a class with extra parameters.
    reference_generator=WienerProcessReferenceGenerator(reference_state='i',
                                                        sigma_range=(5e-3, 5e-1)))

nb_actions = env.action_space.n
env = FlattenObservation(env)

model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(4))
model.add(LeakyReLU(alpha=0.05))
model.add(Dense(4))
model.add(LeakyReLU(alpha=0.05))
model.add(Dense(nb_actions))
model.add(Activation('linear'))

memory = SequentialMemory(limit=15000, window_length=1)
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(eps=0.5), 'eps', 0.5, 0.01, 0, 20000)
dqn = DQNAgent(model=model, policy=policy, nb_actions=nb_actions, memory=memory, gamma=0.5,
               batch_size=128, train_interval=1, memory_interval=1)
dqn.compile(Adam(), metrics=['mse'])

dqn.fit(env, nb_steps=200000, action_repetition=5, verbose=1,
def __init__(self, model, policy=None, test_policy=None, enable_double_dqn=False,
             enable_dueling_network=False, dueling_type='avg', *args, **kwargs):
    super(DQNAgent, self).__init__(*args, **kwargs)

    # Validate (important) input.
    if hasattr(model.output, '__len__') and len(model.output) > 1:
        raise ValueError(
            'Model "{}" has more than one output. DQN expects a model that has a single output.'
            .format(model))
    if model.output._keras_shape != (None, self.nb_actions):
        raise ValueError(
            'Model output "{}" has invalid shape. DQN expects a model that has one dimension for each action, in this case {}.'
            .format(model.output, self.nb_actions))

    # Parameters.
    self.enable_double_dqn = enable_double_dqn
    self.enable_dueling_network = enable_dueling_network
    self.dueling_type = dueling_type
    if self.enable_dueling_network:
        # Get the second-to-last layer of the model and abandon the last layer.
        layer = model.layers[-2]
        nb_action = model.output._keras_shape[-1]
        # Layer y has shape (nb_action + 1,):
        # y[:, 0] represents V(s; theta)
        # y[:, 1:] represents A(s, a; theta)
        y = Dense(nb_action + 1, activation='linear')(layer.output)
        # Calculate Q(s, a; theta):
        # dueling_type == 'avg'
        #   Q(s, a; theta) = V(s; theta) + (A(s, a; theta) - Avg_a(A(s, a; theta)))
        # dueling_type == 'max'
        #   Q(s, a; theta) = V(s; theta) + (A(s, a; theta) - max_a(A(s, a; theta)))
        # dueling_type == 'naive'
        #   Q(s, a; theta) = V(s; theta) + A(s, a; theta)
        if self.dueling_type == 'avg':
            outputlayer = Lambda(
                lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.mean(a[:, 1:], axis=1, keepdims=True),
                output_shape=(nb_action,))(y)
        elif self.dueling_type == 'max':
            outputlayer = Lambda(
                lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.max(a[:, 1:], axis=1, keepdims=True),
                output_shape=(nb_action,))(y)
        elif self.dueling_type == 'naive':
            outputlayer = Lambda(
                lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:],
                output_shape=(nb_action,))(y)
        else:
            assert False, "dueling_type must be one of {'avg','max','naive'}"
        model = Model(inputs=model.input, outputs=outputlayer)

    # Related objects.
    self.model = model
    if policy is None:
        policy = EpsGreedyQPolicy()
    if test_policy is None:
        test_policy = GreedyQPolicy()
    self.policy = policy
    self.test_policy = test_policy

    # State.
    self.reset_states()
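# The 'avg' dueling aggregation above can be checked numerically. A minimal sketch with
# NumPy standing in for the Keras backend ops (the values are illustrative only):
import numpy as np

y = np.array([[2.0, 1.0, -1.0, 0.0]])     # y[:, 0] = V(s), y[:, 1:] = A(s, a)
v = y[:, :1]                               # state value, shape (1, 1)
a = y[:, 1:]                               # advantages, shape (1, 3)
q = v + a - a.mean(axis=1, keepdims=True)  # Q(s, a) = V + (A - mean(A))
print(q)  # [[3. 1. 2.]] -- subtracting the mean makes the Q-values average to V(s)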
def run():
    """Construct and start the environment."""
    env = JacoEnv(64, 64, 100, 0.1, 0.8, True)
    nb_actions = env.real_num_actions  # all possible actions, where each action is a unit in this vector

    new_floor_color = list((0.55 - 0.45) * np.random.random(3) + 0.45) + [1.]
    new_cube_color = list(np.random.random(3)) + [1.]
    env.change_floor_color(new_floor_color)
    env.change_cube_color(new_cube_color)

    encoder = load_model(WEIGHTS_FILE)
    print("#########################")

    nb_observation_space = (64, 64, 3)
    original_input = Input(shape=(WINDOW_LENGTH,) + nb_observation_space)
    in_layer = [Lambda(lambda x: x[:, i, :, :])(original_input) for i in range(WINDOW_LENGTH)]
    for layer in encoder.layers:
        layer.trainable = False
    print(encoder.summary())

    encoder_output = [encoder(x) for x in in_layer]
    x = Concatenate()(encoder_output)
    x = Dense(512, activation='relu')(x)
    x = Dense(512, activation='relu')(x)
    x = Dense(nb_actions, activation='linear')(x)
    model = Model(original_input, [x])
    print(model.summary())

    if MULTI_GPU:
        model = multi_gpu_model(model, gpus=2)
        print(model.summary())

    num_warmup = 50000
    # num_simulated_annealing = 500000 + num_warmup
    # num_warmup = 0
    num_simulated_annealing = 220000 + num_warmup

    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                                  value_test=.05, nb_steps=num_simulated_annealing)
    dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
                   nb_steps_warmup=num_warmup, gamma=.99, target_model_update=10000,
                   train_interval=4, delta_clip=1.)
    dqn.compile(Adam(lr=.00025), metrics=['mae'])

    if False:  # training branch, currently disabled; flip to True to retrain
        dqn.load_weights("stylegan_dqn_weights")
        checkpoint_callback = ModelCheckpoint("stylegan_dqn_checkpoint", monitor='episode_reward',
                                              verbose=0, save_best_only=True,
                                              save_weights_only=True, mode='max', period=10)
        history = dqn.fit(env, nb_steps=num_simulated_annealing + 450000, visualize=False,
                          verbose=1, callbacks=[checkpoint_callback])
        dqn.save_weights("stylegan_dqn_weights")
        np.savez_compressed("stylegan_dqn_history",
                            episode_reward=np.asarray(history.history['episode_reward']))
    else:
        dqn.load_weights("stylegan_dqn_weights")

        print("original domain")
        source_test_losses = dqn.test(env, nb_episodes=100, visualize=True)
        np.savez_compressed("myvae_dqn_source_test",
                            episode_reward=np.asarray(source_test_losses.history['episode_reward']),
                            nb_steps=np.asarray(source_test_losses.history['nb_steps']))

        print("target domain")
        new_floor_color = [0.4, 0.6, 0.4, 1.]
        new_cube_color = [1.0, 0.0, 0.0, 1.]
        env.change_floor_color(new_floor_color)
        env.change_cube_color(new_cube_color)
        target_test_losses = dqn.test(env, nb_episodes=100, visualize=True)
        np.savez_compressed("myvae_dqn_target_test",
                            episode_reward=np.asarray(target_test_losses.history['episode_reward']),
                            nb_steps=np.asarray(target_test_losses.history['nb_steps']))
model = Sequential()
model.add(Dense(128, activation="elu", input_shape=(1, 10)))

# Our embedding has shape (1, 10), which affects our hidden layer
# dimension and output dimension.
# Flattening resolves potential issues that would arise otherwise.
model.add(Flatten())
model.add(Dense(64, activation="elu"))
model.add(Dense(n_action, activation="linear"))

memory = SequentialMemory(limit=10000, window_length=1)

# Simple epsilon-greedy policy with linear annealing
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr="eps",
    value_max=1.0,
    value_min=0.05,
    value_test=0,
    nb_steps=10000,
)

# Defining our DQN
dqn = DQNAgent(
    model=model,
    nb_actions=len(env_player.action_space),
    policy=policy,
    memory=memory,
    nb_steps_warmup=1000,
    gamma=0.5,
model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=300000, window_length=1)
policy = EpsGreedyQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# After training is done, we load the saved final weights.
dqn.load_weights('dqn_{}_weights_model3.h5f'.format(ENV_NAME))

# Redirect stdout to capture test results.
old_stdout = sys.stdout
sys.stdout = mystdout = io.StringIO()
def init_policy(self, policy_dict):
    self.policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), **policy_dict)
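# LinearAnnealedPolicy takes its annealing schedule as keyword arguments, so policy_dict
# must supply them. A usage sketch (the values are illustrative, and `agent` is assumed
# to be an instance of the class that defines init_policy above):
policy_dict = dict(attr='eps', value_max=1.0, value_min=0.1, value_test=0.05, nb_steps=100000)
agent.init_policy(policy_dict)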
def main():
    """Initialization of all parameters, neural net, agent, training, validation and testing."""
    write_model_info()  # save the parameters you are using for this model to a file

    # Set up the environment and variables.
    if METHOD == trailing:
        env = TrailEnv(FOLDER, STEPS, train_data, test_data, TEST_POINTS, val_data=VAL_DATA,
                       val_starts=VAL_STARTS, limit_data=DATA_SIZE, one_hot=ONE_HOT, cost=COST,
                       margin=MARGIN, turn=TURN, ce=CE, dp=DP, normalize_in=NORMALIZE_IN,
                       reset_margin=RESET_FROM_MARGIN)
    else:
        env = DengEnv(FOLDER, STEPS, train_data, test_data, TEST_POINTS, val_data=VAL_DATA,
                      val_starts=VAL_STARTS, window=WINDOW_LENGTH, limit_data=DATA_SIZE,
                      one_hot=ONE_HOT, cost=COST_D)

    # Set up the model.
    model = set_model(env)
    memory = SequentialMemory(limit=MEM_SIZE, window_length=WINDOW_LENGTH)

    # Exploration policy.
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1.0, value_min=0.1,
                                  value_test=0.05, nb_steps=EXPLORE_STEPS)

    nb_actions = env.action_space.n  # number of actions (outputs)

    # Set up the keras-rl agent.
    dqn = DQNAgent(model=model, gamma=GAMMA, nb_actions=nb_actions, memory=memory,
                   batch_size=BATCH_SIZE, nb_steps_warmup=1000, target_model_update=TAR_MOD_UP,
                   policy=policy, delta_clip=DELTA_CLIP)
    dqn.compile(Adam(lr=LR, decay=LR_DEC), metrics=['mse'])

    if START_FROM_TRAINED:
        dqn.load_weights(TRAINED_WEIGHTS)

    if VALIDATE:
        train_w_validation(env, dqn)
    else:
        train(env, dqn)

    fin_stats(env, STEPS)
    test(env, dqn)