def test_fit_observations():
    memory = SequentialMemory(100, window_length=2, ignore_episode_boundaries=False)
    agent = TestAgent(memory)
    env = TestEnv()
    agent.compile()
    agent.fit(env, 20, verbose=0)

    # Inspect memory to see if observations are correct.
    experiences = memory.sample(batch_size=6, batch_idxs=range(2, 8))

    assert experiences[0].reward == .4
    assert experiences[0].action == 3
    assert_allclose(experiences[0].state0, np.array([2, 3]))
    assert_allclose(experiences[0].state1, np.array([3, 4]))
    assert experiences[0].terminal1 is False

    assert experiences[1].reward == .5
    assert experiences[1].action == 4
    assert_allclose(experiences[1].state0, np.array([3, 4]))
    assert_allclose(experiences[1].state1, np.array([4, 5]))
    assert experiences[1].terminal1 is False

    assert experiences[2].reward == .6
    assert experiences[2].action == 5
    assert_allclose(experiences[2].state0, np.array([4, 5]))
    assert_allclose(experiences[2].state1, np.array([5, 6]))
    assert experiences[2].terminal1 is True

    # Experience 3 has been re-sampled since state0 would be terminal, in which case we
    # cannot really have a meaningful transition because the environment gets reset. We thus
    # just ensure that state0 is not terminal.
    assert not np.all(experiences[3].state0 == np.array([5, 6]))

    assert experiences[4].reward == .2
    assert experiences[4].action == 1
    assert_allclose(experiences[4].state0, np.array([0, 1]))
    assert_allclose(experiences[4].state1, np.array([1, 2]))
    assert experiences[4].terminal1 is False

    assert experiences[5].reward == .3
    assert experiences[5].action == 2
    assert_allclose(experiences[5].state0, np.array([1, 2]))
    assert_allclose(experiences[5].state1, np.array([2, 3]))
    assert experiences[5].terminal1 is False
if args.agent_type == 'conv':
    model = get_conv_model(model)
elif args.agent_type == 'rnn':
    model = get_rnn_model(model)
elif args.agent_type == 'drnn':
    model = get_double_rnn_model(model)
elif args.agent_type == 'ntm':
    model = get_ntm_model()
else:
    raise ValueError('unknown model type: {}'.format(args.agent_type))

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=WINDOW_LENGTH)
processor = AtariProcessor()

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that
# the agent initially explores the environment (high eps) and then gradually sticks to what it knows
# (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.02
# so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.02,
                              nb_steps=1000000)
# The trade-off between exploration and exploitation is difficult and an on-going research topic.
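# A minimal sketch (not part of the original script) of how the memory, processor and annealed
# policy above are typically wired into a DQNAgent, mirroring the standard keras-rl Atari setup;
# `env`, `model` and `nb_actions` are assumed to be defined elsewhere in the script.
dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               processor=processor, nb_steps_warmup=50000, gamma=.99,
               target_model_update=10000, train_interval=4, delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])
dqn.fit(env, nb_steps=1750000, log_interval=10000)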
env = Environment(dataSetPath=DATA_SET_PATH)

model = Sequential()
model.add(Flatten(input_shape=(WINDOW_SIZE, ) + env.observation_space.shape))
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dense(8))
model.add(Activation('relu'))
model.add(Dense(env.action_space.n))
model.add(Activation('linear'))
# model.summary()

memory = SequentialMemory(limit=env.dataLength, window_length=WINDOW_SIZE)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=env.action_space.n, memory=memory, nb_steps_warmup=100,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=0.001), metrics=['mae'])

if os.path.exists(WEIGHTS_NAME):
    dqn.load_weights(WEIGHTS_NAME)
    print("saved weight loaded")


def getPredictionAt(index=0):
if sys.argv[2] == 'train':
    input_shape = (1, ) + env.observation_space.shape

    # DQN network definition
    # Options are left at their defaults for now
    model = Sequential()
    model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    model.add(Dense(512))
    model.add(Dense(512))
    model.add(Dense(nb_actions))
    print(model.summary())

    # Memory for experience replay
    # Instead of learning from each step in order, transitions are first stored in memory and
    # then sampled at random for training (honestly, I don't fully understand it yet)
    memory = SequentialMemory(limit=40000, window_length=1)

    # The behavior policy is the orthodox epsilon-greedy.
    policy = EpsGreedyQPolicy(eps=0.1)

    # warmup = literally a warm-up: don't start learning right away, first fill the memory a bit
    # update = update rate: smaller values take longer, larger ones overfit more easily
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100,
                   target_model_update=1e-2, policy=policy)
    dqn.compile(Adam(lr=0.001))

    # nb_steps = how many steps to train; you can also just set a huge number and stop it with
    # Ctrl+C after leaving it running overnight
def get_agent(env, agent_id, model=1): global observation_size # Count number of actions if not ingy: nb_actions = env.action_space['action_movement'][0].shape[0] + 2 # Count number of observations for input if observation_size == 0: observation_size += env.observation_space[ 'observation_self'].shape[0] observation_size += env.observation_space['agent_qpos_qvel'].shape[0] * \ env.observation_space['agent_qpos_qvel'].shape[1] observation_size += env.observation_space['box_obs'].shape[ 0] * env.observation_space['box_obs'].shape[1] observation_size += env.observation_space['ramp_obs'].shape[ 0] * env.observation_space['ramp_obs'].shape[1] # TODO: Not sure whether to include mask_a*_obs and mask_ab_obs_spoof in this observation input -AH else: nb_actions = env.action_space.spaces['action_movement'].spaces[ 0].shape[0][0] + 2 # Count number of observations for input if observation_size == 0: observation_size += env.observation_space.spaces[ 'observation_self'].shape[0] if 'lidar' in env.observation_space.spaces: observation_size += env.observation_space.spaces[ 'lidar'].shape[0] observation_size += env.observation_space.spaces['agent_qpos_qvel'].shape[0] * \ env.observation_space.spaces['agent_qpos_qvel'].shape[1] observation_size += env.observation_space.spaces['box_obs'].shape[0] * \ env.observation_space.spaces['box_obs'].shape[1] observation_size += env.observation_space.spaces['ramp_obs'].shape[0] * \ env.observation_space.spaces['ramp_obs'].shape[1] if model == 1: # Build the actor model actor = Sequential() actor.add(Flatten(input_shape=( 1, observation_size, ))) actor.add(Dense(400)) actor.add(Activation('relu')) actor.add(Dense(300)) actor.add(Activation('relu')) actor.add(Dense(nb_actions)) actor.add(Activation('sigmoid')) # Return values from 0 to 1 # print(actor.summary()) # Build the critic model action_input = Input(shape=(nb_actions, ), name='action_input') observation_input = Input(shape=( 1, observation_size, ), name='observation_input') flattened_observation = Flatten()(observation_input) x = Dense(400)(flattened_observation) x = Activation('relu')(x) x = Concatenate()([x, action_input]) x = Dense(300)(x) x = Activation('relu')(x) x = Dense(1)(x) x = Activation('linear')(x) critic = Model(inputs=[action_input, observation_input], outputs=x) # print(critic.summary()) # Build the agent memory = SequentialMemory(limit=100000, window_length=1) random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=2.15, mu=0, sigma=3) agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=4000, nb_steps_warmup_actor=4000, random_process=random_process, gamma=.9, target_model_update=1e-3, processor=MujocoProcessor(agent_id)) agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=['mae']) elif model == 2: # Build the actor model actor = Sequential() actor.add(Flatten(input_shape=( 1, observation_size, ))) actor.add(Dense(400)) actor.add(Activation('relu')) actor.add(Dense(300)) actor.add(Dropout(0.3)) actor.add(Activation('relu')) actor.add(Dense(100)) actor.add(Dropout(0.2)) actor.add(Activation('elu')) actor.add(Dense(50)) actor.add(Dropout(0.2)) actor.add(Activation('elu')) actor.add(Dense(nb_actions)) actor.add(Activation('softmax')) # Return values from 0 to 1 # print(actor.summary()) # Build the critic model action_input = Input(shape=(nb_actions, ), name='action_input') observation_input = Input(shape=( 1, observation_size, ), name='observation_input') flattened_observation = 
Flatten()(observation_input) x = Dense(400)(flattened_observation) x = Activation('relu')(x) x = Concatenate()([x, action_input]) x = Dense(300)(x) x = Activation('relu')(x) x = Dropout(0.3)(x) x = Dense(100)(x) x = Activation('elu')(x) x = Dropout(0.2)(x) x = Dense(50)(x) x = Activation('elu')(x) x = Dropout(0.2)(x) x = Dense(1)(x) x = Activation('tanh')(x) critic = Model(inputs=[action_input, observation_input], outputs=x) # print(critic.summary()) # Build the agent memory = SequentialMemory(limit=100000, window_length=1) random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=2.8, mu=0, sigma=3.5) agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=500, nb_steps_warmup_actor=500, random_process=random_process, gamma=.9, target_model_update=5e-2, processor=MujocoProcessor(agent_id)) agent.compile([Adam(lr=5e-1, decay=0.9), Adam(lr=5e-1, decay=0.9)], metrics=['mae']) return agent
def training_game():
    env = Environment()

    input_shape = (FLAGS.screen_size, FLAGS.screen_size, 1)
    nb_actions = 12  # Number of actions

    model = neural_network_model(input_shape, nb_actions)
    memory = SequentialMemory(limit=5000, window_length=_WINDOW_LENGTH)

    processor = SC2Proc()

    # Policy
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr="eps", value_max=1, value_min=0.7,
                                  value_test=.0, nb_steps=1e6)

    # Agent
    dqn = DQNAgent(model=model,
                   nb_actions=nb_actions,
                   memory=memory,
                   enable_double_dqn=False,
                   nb_steps_warmup=500,
                   # nb_steps_warmup=1,
                   target_model_update=1e-2,
                   policy=policy,
                   batch_size=150,
                   processor=processor)

    dqn.compile(Adam(lr=.001), metrics=["mae"])

    # Tensorboard callback
    callbacks = keras.callbacks.TensorBoard(log_dir='./Graph', histogram_freq=0,
                                            write_graph=True, write_images=False)

    # Save the parameters and upload them when needed
    name = FLAGS.mini_game
    w_file = "dqn_{}_weights.h5f".format(name)
    check_w_file = "train_w" + name + "_weights.h5f"

    if SAVE_MODEL:
        check_w_file = "train_w" + name + "_weights_{step}.h5f"

    log_file = "training_w_{}_log.json".format(name)

    if LOAD_MODEL:
        dqn.load_weights(w_file)

    # keras-rl expects a list of callbacks, so the single TensorBoard callback is wrapped here.
    dqn.fit(env, callbacks=[callbacks], nb_steps=1e7, action_repetition=2, log_interval=1e4,
            verbose=2)

    dqn.save_weights(w_file, overwrite=True)
    dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)
model.add(Dense(32)) model.add(Activation('relu')) model.add(Dense(nb_actions)) model.add(Activation('linear')) print(model.summary()) nb_steps = 25000 nb_max_episode_steps_06_05_20_49 = 150 nb_max_episode_steps_06_06_16_24 = 100 nb_max_episode_steps_06_06_16_07 = 200 nb_max_episode_steps = nb_max_episode_steps_06_06_16_24 env.opt_reward = nb_max_episode_steps * 2 memory = SequentialMemory(limit=nb_steps, window_length=1) # 0.1 : 4k, 0.25 : 4k, 0.5 : 7k-inf policy_06_14_16_20 = BoltzmannGumbelQPolicy(C=20.0) # more stable # 0.1 : 4k, 0.25 : 4-5k, 0.5 : 10k-inf policy_06_13_19_00 = BoltzmannQPolicy(tau=1.0) policy_06_14_16_15 = MaxBoltzmannQPolicy(eps=0.1) policy = policy_06_14_16_20 target_model_update_06_05_20_49 = 1e-2 target_model_update_06_05_22_18 = 1e-1 target_model_update_06_13_19_07 = 1e-3 target_model_update = target_model_update_06_05_20_49 bactch_size_06_05_22_18 = 32 bactch_size_07_05_16_07 = 64
model.add(Dense(16)) model.add(Activation('relu')) model.add(Dense(16)) model.add(Activation('relu')) model.add(Dense(16)) model.add(Activation('relu')) model.add(Dense(nb_actions)) model.add(Activation('linear')) logger.info(model.summary()) nb_episode_steps = 60 nb_episodes = 400 # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=nb_episode_steps * nb_episodes, window_length=1) #policy = BoltzmannQPolicy() policy = EpsGreedyQPolicy() dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, target_model_update=1e-2, policy=policy) dqn.compile(Adam(lr=1e-3), metrics=['mae']) # Okay, now it's time to learn something! We visualize the training here for show, but this # slows down training quite a lot. You can always safely abort the training prematurely using # Ctrl + C. dqn.fit(env, nb_max_episode_steps=nb_episode_steps,
conv_1 = Conv2D(64, 3, padding='same')(rehsape_layer) conv1_a = LeakyReLU()(conv_1) flat_layer = Flatten()(conv1_a) dense_1 = Dense(512)(flat_layer) dense_1_a = LeakyReLU()(dense_1) output_layer = Dense(nb_actions, activation='linear')(dense_1_a) masked_layer = multiply([output_layer, mask]) model = Model([input_layer, mask], masked_layer) model.summary() train_mode = len(sys.argv) > 1 and sys.argv[1] == 'train' # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=MEMORY, window_length=WINDOW_LENGTH) policy = BoltzmannQPolicy() processor = SalpakanProcessor(env) dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=WARM_UP, target_model_update=1e-2, policy=policy, processor=processor) dqn.compile(Adam(lr=1e-3), metrics=['mae']) if os.path.isfile(WEIGHTS_PATH) and os.access(WEIGHTS_PATH, os.R_OK): dqn.load_weights(WEIGHTS_PATH)
def main(): env = PikaEnv() nb_actions = env.action_space.n model = Sequential() model.add(Flatten(input_shape=(4, ) + env.observation_space.shape)) model.add(Dense(512)) model.add(Activation("relu")) model.add(Dense(512)) model.add(Activation("relu")) model.add(Dense(512)) model.add(Activation("relu")) model.add(Dense(nb_actions)) model.add(Activation("linear")) print(model.summary()) memory = SequentialMemory(limit=1_000_000, window_length=4) policy = LinearAnnealedPolicy( EpsGreedyQPolicy(), attr="eps", value_max=1.0, value_min=0.05, value_test=0.05, nb_steps=nb_steps // 4, ) dqn = DQNAgent( model=model, nb_actions=nb_actions, policy=policy, memory=memory, enable_dueling_network=True, enable_double_dqn=False, ) dqn.compile(Adam(lr=0.00025), metrics=["mae"]) # dqn.load_weights(log_dir + "load.h5f") weights_filename = log_dir + "dqn_weights.h5f" checkpoint_weights_filename = log_dir + "dqn_weights_{step}.h5f" log_filename = log_dir + "dqn_log.json" callbacks = [ ModelIntervalCheckpoint(checkpoint_weights_filename, interval=10000) ] callbacks += [FileLogger(log_filename, interval=100)] tbCallBack = TensorBoard( log_dir=tb_dir, histogram_freq=0, write_graph=True, write_grads=True, write_images=True, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None, ) callbacks += [tbCallBack] dqn.fit( env, callbacks=callbacks, nb_steps=nb_steps, log_interval=10, visualize=True, verbose=2, ) # After training is done, we save the final weights one more time. dqn.save_weights(weights_filename, overwrite=True)
def build_agent(spec): """Defines a Keras-rl agent, ready for training. :param spec: a Namespace of agent specification options. :return: the rl agent """ env = gym.make(spec.env) n_actions = env.action_space.n # Define network for Atari games if spec.rb_address is None: atari_agent = models.AtariAgent( env_name=spec.env, training=spec.training, one_life=not spec.no_onelife, ) # Define network for Atari games + Restraining bolt else: atari_agent = models.RestrainedAtariAgent( env_name=spec.env, training=spec.training, one_life=not spec.no_onelife, frames_sender=streaming.AtariFramesSender(spec.env), rb_receiver=streaming.StateRewardReceiver(spec.rb_address), ) # Samples are extracted from memory, not observed directly memory = SequentialMemory(limit=spec.memory_limit, window_length=atari_agent.window_length) # Linear dicrease of greedy actions train_policy = LinearAnnealedPolicy( EpsGreedyQPolicy(), attr="eps", value_max=spec.random_max, value_min=spec.random_min, value_test=spec.random_test, nb_steps=spec.random_decay_steps, ) # Test policy: constant eps or per-episode test_policy = (EpsGreedyQPolicy( eps=spec.random_test) if not spec.random_epsilon else models.EpisodeRandomEpsPolicy(min_eps=0.0, max_eps=spec.random_test)) # RL agent dqn = DQNAgent( model=atari_agent.model, enable_double_dqn=True, enable_dueling_network=False, nb_actions=n_actions, policy=train_policy, test_policy=test_policy, memory=memory, processor=atari_agent.processor, nb_steps_warmup=spec.steps_warmup, gamma=spec.gamma, batch_size=spec.batch_size, train_interval=spec.train_interval, target_model_update=spec.target_update, delta_clip=1.0, custom_model_objects=atari_agent.custom_layers, ) dqn.compile(optimizer=Adam(lr=spec.learning_rate), metrics=["mae"]) return dqn, atari_agent
                     kernel_initializer='zeros', activation='linear')(denses)

model = Model(inputs=[
    rgbimage_input, dimage_input, velocity_input, distance_input, geofence_input
], outputs=predictions)

train = True
tb = TensorBoard(log_dir='logs')

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)  # reduce memory
processor = MultiInputProcessor(nb_inputs=5)

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. We anneal eps from 1.0 to 0.1 over the course of 100k steps. This is done so
# that the agent initially explores the environment (high eps) and then gradually sticks to what it
# knows (low eps). We also set a dedicated eps value that is used during testing; here it is 0.0, so
# the agent acts fully greedily at test time.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                              value_test=0.0, nb_steps=100000)
def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=100000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, nb_actions=actions,
                   nb_steps_warmup=20, target_model_update=1e-2)
    return dqn
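# Hypothetical usage of the build_agent helper above (not part of the original file): it assumes
# a discrete-action Gym environment and a Keras model built by some build_model helper of your
# own; imports (gym, Adam) follow the surrounding snippets.
env = gym.make('CartPole-v1')
actions = env.action_space.n
model = build_model(env.observation_space.shape, actions)  # hypothetical helper
dqn = build_agent(model, actions)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)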
flattened_observation = Flatten()(observation_input) #? x = concatenate([action_input, flattened_observation]) x = Dense(64)(x) x = Activation('relu')(x) x = Dense(400)(x) x = Activation('relu')(x) x = Dense(300)(x) x = Activation('relu')(x) x = Dense(1)(x) x = Activation('linear')(x) # not included: L2 weight decay critic = Model(inputs=[action_input, observation_input], outputs=x) print(critic.summary()) # Set up the agent for training memory = SequentialMemory(limit=1000000, window_length=1) # not same as default random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.2, size=env.noutput) agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, random_process=random_process, gamma=.99, target_model_update=1e-3, delta_clip=1.) # warmup? delta_clip?
from patternmatching.gray.incremental.query_call import load_graph, parse_args
from patternmatching.gray.incremental.rl_model import GraphEnv

logging.basicConfig(level=logging.INFO)

policies = {
    "bqp": BoltzmannQPolicy(),  # Unstable
    "gqp": GreedyQPolicy(),
    "egqp": EpsGreedyQPolicy(eps=0.1)  # eps should be around 0.1
}

window_length = 5  # Should be less than 20 (too large a value will not converge Q-values)

memories = {
    "epm": EpisodeParameterMemory(limit=20, window_length=window_length),  # Non-episodic
    "sm": SequentialMemory(limit=20, window_length=window_length)  # should use this
}

argv = sys.argv
if len(argv) < 4:
    print("Usage: python %s [ConfFile] [Policy] [Memory]" % argv[0])
    exit(1)

policy_name = argv[2]
if not policy_name in policies:
    print("Please specify correct policy name: %s" % str(policies.keys()))
    exit(1)
policy = policies[policy_name]

memory_name = argv[3]
if not memory_name in memories:
np.random.seed(123) env.seed(123) nb_actions = env.action_space.n model = Sequential() model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) model.add(Dense(NODES)) model.add(PReLU()) model.add(Dense(NODES * 2)) model.add(PReLU()) model.add(Dense(NODES * 4)) model.add(PReLU()) model.add(Dense(NODES * 2)) model.add(PReLU()) model.add(Dense(nb_actions)) model.add(Activation('linear')) memory = SequentialMemory(limit=memoria, window_length=1) policy = EpsGreedyQPolicy() dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, batch_size=batch_size, target_model_update=1e-2, policy=policy, enable_double_dqn=True) dqn.compile(Adam(lr=learning_rate), metrics=['mae']) if not teste: dqn.fit(env, nb_steps=epocas, visualize=False, verbose=1) dqn.save_weights('dqn_weights.h5f', overwrite=True) else: dqn.load_weights('dqn_weights_1.h5f') dqn.test(env, nb_episodes=50, visualize=False)
def main(args): # Supress waring message for CPU os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Breakout environment name default_env = 'BreakoutDeterministic-v0' #default_env = 'BreakoutDeterministic-v4' window_length = 4 nb_steps = 1750000 # learning reate, based on later DeepMind paper called # "Rainbow: Combining Improvements in Deep Reinforcement Learning" # by Hessel et al. 2017 RMSProp was substituted for Adam # with a learning rate of 0.0000625 lr_rate = 0.0000625 # Application mode: train, test # train: Training breakout deep qlearning network # test: Test breakout deep qlearning network with pre-trained model # default_app_mode = 'train' default_app_mode = 'test' # Whether check life lost for new episode default_life_lost_check = False if len(args) == 0: app_mode = default_app_mode else: app_mode = args[0] if len(args) > 1: if args[1] == 'v4': ENV_NAME = 'BreakoutDeterministic-v4' else: ENV_NAME = 'BreakoutDeterministic-v0' else: ENV_NAME = default_env if len(args) > 2: if args[2] == 'check_life_lost': life_check = True else: life_check = False else: life_check = default_life_lost_check INPUT_SHAPE = (84, 84) try: env = gym.make(ENV_NAME) np.random.seed(123) env.seed(123) nb_actions = env.action_space.n input_frame = Input(shape=(window_length,) + INPUT_SHAPE) dqn_out = Permute((2, 3, 1))(input_frame) # Set he initializer for relu activation function dqn_out = Conv2D(32, (8, 8), strides=(4, 4), activation='relu', kernel_initializer=he_normal())(dqn_out) dqn_out = Conv2D(64, (4, 4), strides=(2, 2), activation='relu', kernel_initializer=he_normal())(dqn_out) dqn_out = Conv2D(64, (3, 3), strides=(1, 1), activation='relu', kernel_initializer=he_normal())(dqn_out) dqn_out = Flatten()(dqn_out) dqn_out = Dense(512)(dqn_out) dqn_out = LeakyReLU()(dqn_out) dqn_out = Dense(nb_actions)(dqn_out) dqn_out = Activation('linear')(dqn_out) model = Model(inputs=[input_frame], outputs=[dqn_out]) print(model.summary()) memory = SequentialMemory(limit=nb_steps, window_length=window_length) #policy = BoltzmannQPolicy() # policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), # attr='eps', # value_max=1., # value_min=.1, # value_test=.05, # nb_steps=1000000) # policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=.001, value_min=.0001, value_test=.00005, nb_steps=1000000) processor = AtariProcessor(input_shape=INPUT_SHAPE) dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, # Whether to enable dueling network enable_dueling_network = False, # Training starts afert warm-up steps nb_steps_warmup=50000, processor=processor, #controls how often the target network is updated target_model_update=10000, policy=policy, gamma=.99, train_interval=4, delta_clip=1., ) dqn.compile(Adam(lr=lr_rate), metrics=['mae']) weights_filename = 'dqn_{}_weights.h5f'.format(ENV_NAME) log_filename = 'dqn_{}_log.json'.format(ENV_NAME) if app_mode == 'train': # Load existing weights if exists if os.path.exists(weights_filename): dqn.load_weights(weights_filename) checkpoint_weights_filename = 'dqn_' + ENV_NAME + \ '_weights_{step}.h5f' # Create step_logger and espisode_loger to monitor training # step_filename = 'dqn_' + ENV_NAME + '_step.csv' # episode_filename = 'dqn_' + ENV_NAME + '_episode.csv' # step_logger = StepLogger(step_filename) # episode_logger = EpisodeLogger(episode_filename) # callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)] callbacks += [FileLogger(log_filename, interval=100)] # Add step_logger and espisode_logger as callbacks # callbacks += [LambdaCallback( 
# on_step_end=step_logger.on_step_end)] # callbacks += [LambdaCallback( # on_episode_end=episode_logger.on_episode_end)] # dqn.fit(env, callbacks=callbacks, log_interval=10000, nb_steps=nb_steps, visualize=False, # To avoid hungs when all lifes are lost nb_max_episode_steps=4000, verbose=2, #whether check life lost and start new episode enable_life_lost_episode=life_check) # Save weights after training completed dqn.save_weights(weights_filename, overwrite=True) env.reset() # Evaluation 5 episodes to show training results dqn.test(env, nb_episodes=5, visualize=False) elif app_mode == 'test': awards_list = [] #env.reset() csv_filename = 'dqn_' + ENV_NAME + '_test.csv' csv_logger = AwardLogger(csv_filename) def print_test_logs(batch, logs): #print(batch) #print(logs) awards_list.append(logs['episode_reward']) callbacks = [LambdaCallback(on_episode_end=print_test_logs)] callbacks += [LambdaCallback(on_step_end=csv_logger.on_step_end)] callbacks += [LambdaCallback( on_episode_end=csv_logger.on_episode_end)] dqn.load_weights(weights_filename) dqn.test(env, callbacks=callbacks, nb_episodes=5, visualize=True, nb_max_episode_steps=4000) mean_award = np.mean(awards_list) print('Average awards: {0:0.2f}'.format(mean_award)) # show step-award diagram csv_logger.plot_award() elif app_mode == 'plot-model': model_filename = 'dqn_' + ENV_NAME + '_model.pdf' dql_model = dqn.model plot_model(dql_model, to_file=model_filename, show_shapes=True, show_layer_names=False, rankdir='TB') elif app_mode == 'plot-train': json_log_file = 'dqn_' + ENV_NAME + '_log.json' records = pd.read_json(json_log_file) fig, ax = plt.subplots(2) # plt.plot(records['episode'], records['loss']) fig.suptitle("Loss Value vs Espisode Reward") ax[0].plot(records['episode'], records['loss'], label='losss') ax[1].plot(records['episode'], records['episode_reward'], label='reward') ax[0].set_ylabel('Losss') ax[1].set_ylabel('Reward') #plt.yticks([0, 0.005, 0.010, 0.050, 0.100]) #plt.title('Loss Value / Mean Q',fontsize=12) #plt.legend(loc="upper left") ax[1].set_xlabel("Episode") #ax = plt.gca() #ax.set_xticklabels([]) plt.show() else: print(f"Syntax: python sls_breakout.py " + f"<train | test | plot-model | plot-train> "+ f"[v4|v2] [check_life_lost]") finally: if env is not None: env.close()
def main(): set_gpu_option() # OPTIONS ENV_NAME = 'DDPGEnv-v0' TIME_STEP = 30 # Get the environment and extract the number of actions. PATH_TRAIN = '/home/data/training_x_150.h5' PATH_TEST = '/home/data/test_x_150.h5' """ env = OhlcvEnv(TIME_STEP, path=PATH_TRAIN) env_test = OhlcvEnv(TIME_STEP, path=PATH_TEST) """ store = pd.HDFStore(PATH_TRAIN, mode='r') varieties_list = store.keys() print('varieties_list: ', varieties_list) print('num varieties: ', len(varieties_list)) variety = 'RB' print('variety: ', variety) # get selected features SELECTED_FACTOR_PATH = '~/feature_selection/根据互信息选出的特征,根据重要性排序.csv' selected_factor_df = pd.read_csv(SELECTED_FACTOR_PATH, index_col=0) selected_factor_list = selected_factor_df[variety].to_list() env = DDPGEnv(TIME_STEP, variety=variety, path=PATH_TRAIN, selected_factor_list=selected_factor_list) #env_test = DDPGEnv(TIME_STEP, variety=variety, path=PATH_TEST, selected_factor_list=selected_factor_list) # random seed np.random.seed(123) env.seed(123) nb_actions = env.action_space.shape[0] print('nb_actions: ', nb_actions) print('env.observation_space.shape: ', env.observation_space.shape) print('env.observation_space: ', env.observation_space) # create actor actor = create_actor(input_shape=env.shape, nb_actions=nb_actions) # create critic action_input = Input(shape=(nb_actions,), name='action_input') observation_input = Input(shape=env.shape, name='observation_input') critic = create_critic(action_input, observation_input) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and even the metrics! memory = SequentialMemory(limit=50000, window_length=TIME_STEP) random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3) ddpg = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000, random_process=random_process, gamma=.99, target_model_update=1e-3, processor=DDPGProcessor()) ddpg.compile(optimizer=Adam(lr=1e-3), metrics=['mae']) log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, write_grads=True) for _ in range(3): ddpg.fit(env, nb_steps=140000, nb_max_episode_steps=140000, visualize=False, verbose=2) """
def controllerb(t, joints, links, joint2, joint3, joint4, joint5, rewardb_ros, joint1, agent, graph2, session2):
    if agent.value is None:
        # import keras-rl in NRP through virtual env
        import site, os
        site.addsitedir(
            os.path.expanduser(
                '~/.opt/tensorflow_venv/lib/python2.7/site-packages'))
        from keras.models import Model, Sequential
        from keras.layers import Dense, Activation, Flatten, Input, concatenate
        from keras.optimizers import Adam, RMSprop
        from rl.agents import DDPGAgent
        from rl.memory import SequentialMemory
        from rl.random import OrnsteinUhlenbeckProcess
        from keras import backend as K
        from tensorflow import Session, Graph
        K.clear_session()

        obs_shape = (6, )
        nb_actions = 5

        # create the nets for the rl agent
        # actor net
        graph2.value = Graph()
        with graph2.value.as_default():
            session2.value = Session()
            with session2.value.as_default():
                actor = Sequential()
                actor.add(Flatten(input_shape=(1, ) + obs_shape))
                actor.add(Dense(32))
                actor.add(Activation('relu'))
                actor.add(Dense(32))
                actor.add(Activation('relu'))
                actor.add(Dense(32))
                actor.add(Activation('relu'))
                actor.add(Dense(nb_actions))
                actor.add(Activation('sigmoid'))
                clientLogger.info('actor net init')

                # critic net
                action_input = Input(shape=(nb_actions, ), name='action_input')
                observation_input = Input(shape=(1, ) + obs_shape, name='observation_input')
                flattened_observation = Flatten()(observation_input)
                x = concatenate([action_input, flattened_observation])
                x = Dense(64)(x)
                x = Activation('relu')(x)
                x = Dense(64)(x)
                x = Activation('relu')(x)
                x = Dense(64)(x)
                x = Activation('relu')(x)
                x = Dense(1)(x)
                x = Activation('linear')(x)
                critic = Model(inputs=[action_input, observation_input], outputs=x)
                clientLogger.info('critic net init')

                # instantiate rl agent
                memory = SequentialMemory(limit=1000, window_length=1)
                random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.2, size=nb_actions)
                agent.value = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                                        critic_action_input=action_input, memory=memory,
                                        nb_steps_warmup_critic=10, nb_steps_warmup_actor=10,
                                        random_process=random_process, gamma=.99, batch_size=5,
                                        target_model_update=1e-3, delta_clip=1.)
                agent.value.training = True
                clientLogger.info('rl agent init')

                PATH = '/home/user/WORK/NRP/NRP-local/Experiments/bf_manipulation_demo/ddpg_weights.h5'
                if os.path.isfile(PATH):
                    print('loading weights')
                    agent.value.load_weights(PATH)
                    clientLogger.info('weights loaded')

                agent.value.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])
                clientLogger.info('agent compiled - ready to use')

    #### run steps
    # graph1.value = Graph()
    with graph2.value.as_default():
        # session1.value = Session()
        with session2.value.as_default():
            import math
            import numpy as np

            angle_lower = links.value.pose[5].position.x
            angle_vel_lower = links.value.pose[7].position.x
            angle_upper = links.value.pose[9].position.x
            angle_vel_upper = links.value.pose[12].position.x
            # clientLogger.info('humerus_angle ', links.value.pose[15].position.y)
            # clientLogger.info('humerus_ang_vel ', angle_vel_lower)
            # clientLogger.info('radius_angle ', angle_upper)
            # clientLogger.info('radius_ang_vel ', angle_vel_lower)

            observation = np.array([
                math.cos(angle_lower),
                math.sin(angle_lower), angle_vel_lower,
                math.cos(angle_upper),
                math.sin(angle_upper), angle_vel_upper
            ])

            # get movement action from agent and publish to robot
            action = agent.value.forward(observation)
            clientLogger.info('agent stepped forward')

            # move robot
            joint1.send_message(std_msgs.msg.Float64(action[0]))
            joint2.send_message(std_msgs.msg.Float64(-action[1]))
            joint3.send_message(std_msgs.msg.Float64(action[2]))
            joint4.send_message(std_msgs.msg.Float64(action[3]))
            joint5.send_message(std_msgs.msg.Float64(action[4]))

            import math
            reward = \
                math.sqrt(math.pow((links.value.pose[57].position.x - links.value.pose[4].position.x), 2) + \
                          math.pow((links.value.pose[57].position.x - links.value.pose[4].position.x), 2) + \
                          math.pow((links.value.pose[57].position.x - links.value.pose[4].position.x), 2))
            clientLogger.info('REWARD IS:', reward)
            rewardb_ros.send_message(reward)
            ## reward: x would have to be minimized to move down!
            # -(angle_lower**2 + 0.1*angle_vel_lower**2 +
            #   angle_upper**2 + 0.1*angle_vel_upper**2 +
            #   0.001*np.sum(np.power(action, 2)))

            # learn from the reward
            agent.value.backward(reward)
            clientLogger.info('agent stepped backward')

            agent.value.step = agent.value.step + 1
            if agent.value.step % 20 == 0:
                clientLogger.info('saving weights')
                PATH = '/home/user/Desktop/keras_learning_weights/ddpg_weights_b.h5'
                agent.value.save_weights(PATH, overwrite=True)

            clientLogger.info('-------one step done')
args = parser.parse_args() # Get the environment and extract the number of actions. env = gym.make(args.env_name) np.random.seed(123) env.seed(123) nb_actions = env.action_space.n # Next, we build our model. We use the same model that was described by Mnih et al. (2015). input_shape = (WINDOW_LENGTH, ) + INPUT_SHAPE model = models.get_model(args.model, input_shape, nb_actions) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=args.memory_size, window_length=WINDOW_LENGTH) processor = AtariProcessor() # Select a policy. We use eps-greedy action selection, which means that a random action is selected # with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that # the agent initially explores the environment (high eps) and then gradually sticks to what it knows # (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05 # so that the agent still performs some random actions. This ensures that the agent cannot get stuck. policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05, nb_steps=1000000) # The trade-off between exploration and exploitation is difficult and an on-going research topic.
def test_quantile_regression(self): nb_inputs = 10 nb_actions = 3 nb_quantiles = 32 batch_size = 64 delta_clip = 1 model = NetworkMLPDistributional(nb_inputs=nb_inputs, nb_outputs=nb_actions, nb_hidden_layers=2, nb_hidden_neurons=100, nb_quantiles=nb_quantiles, nb_cos_embeddings=64, duel=True, prior=False, activation='relu', duel_type='avg', window_length=1).model policy = LinearAnnealedPolicy(DistributionalEpsGreedyPolicy(eps=1), attr='eps', value_max=1., value_min=0.1, value_test=.0, nb_steps=10000) test_policy = DistributionalEpsGreedyPolicy(eps=0) memory = SequentialMemory(limit=10000, window_length=1) agent = IQNAgent(model=model, policy=policy, test_policy=test_policy, enable_double_dqn=True, nb_samples_policy=nb_quantiles, nb_sampled_quantiles=nb_quantiles, cvar_eta=1, nb_actions=nb_actions, memory=memory, gamma=0.99, batch_size=batch_size, nb_steps_warmup=1000, train_interval=1, memory_interval=1, target_model_update=1000, delta_clip=delta_clip) agent.compile(Adam(lr=0.0001)) plot_model(agent.trainable_model, to_file='trainable_model_2.png', show_shapes=True) # Test input states = np.random.rand(batch_size, 1, nb_inputs) actions = np.random.randint(nb_actions, size=batch_size) test_quantiles = np.linspace(0, 1, nb_quantiles) z_values = agent.model.predict_on_batch( [states, test_quantiles[None, None, :]]) # print(z_values[0]) for i in range(3000): quantiles = np.random.rand(batch_size, 1, nb_quantiles) # targets = np.random.choice([1, 2, 3], batch_size) targets = np.random.choice([10, 22, 35], batch_size) targets = np.repeat(targets[:, None], nb_quantiles, axis=1) predictions = agent.model.predict_on_batch([states, quantiles]) masks = np.zeros((batch_size, nb_actions)) masks[range(batch_size), actions] = 1 targets_expanded = np.zeros((batch_size, nb_quantiles, nb_actions)) targets_expanded[range(batch_size), :, actions] = targets[range(batch_size), :] loss = agent.trainable_model.predict_on_batch( [states, quantiles, targets_expanded, masks]) metrics = agent.trainable_model.train_on_batch( [states, quantiles, targets_expanded, masks], [targets, targets_expanded]) if np.mod(i, 100) == 0: test_quantiles = np.linspace(0, 1, nb_quantiles) z_values = agent.model.predict_on_batch( [states, test_quantiles[None, None, :]]) self.assertTrue(np.abs(np.mean(z_values[:, 1:10, :]) - 10) < 1.0) self.assertTrue(np.abs(np.mean(z_values[:, 12:20, :]) - 22) < 1.0) self.assertTrue(np.abs(np.mean(z_values[:, 23:31, :]) - 35) < 1.0)
def train_dqn_model(layers, rounds=10000):
    env = gym.make(ENV_NAME)
    env.seed(1)
    nb_actions = env.action_space.n
    window_length = 1

    print "nb_actions:"
    print nb_actions
    print "env.observation_space.shape:"
    print env.observation_space.shape

    model = generate_dense_model(
        (window_length, ) + env.observation_space.shape, layers, nb_actions)

    policy = EpsGreedyQPolicy()
    memory = SequentialMemory(limit=1000000, ignore_episode_boundaries=False,
                              window_length=window_length)

    agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=16,
                     enable_double_dqn=True, enable_dueling_network=True, dueling_type='avg',
                     target_model_update=1e-2, policy=policy, batch_size=16)

    agent.compile(RMSprop(lr=1e-3), metrics=['mae'])

    # tb_cb = TensorBoard(log_dir='/tmp/log', write_images=1, histogram_freq=1)
    # cbks = [tb_cb]

    # play the game. learn something!
    # nb_max_episode_steps: maximum number of steps within one learning episode
    agent.fit(env, nb_steps=rounds, nb_max_episode_steps=nb_max_episode_steps_train,
              visualize=False, verbose=2)

    # print "#################Start Test%################"
    # agent.test(env, nb_episodes=100)

    # test_samples=samples_test
    features_extract = Features(vocabulary_file)
    spam_checker = Spam_Check()
    # modify the current sample according to the chosen action so that it evades the spam check
    spam_manipulatorer = Spam_Manipulator()

    success = 0
    sum = 0
    shp = (1, ) + tuple(model.input_shape[1:])

    for sample in samples_test:
        # print sample
        sum += 1
        for _ in range(nb_max_episode_steps_test):
            featurevectors = features_extract.extract(sample)
            if spam_checker.check_spam(featurevectors) < 1.0:
                success += 1
                print "Bypass spam rule!:"
                print sample
                break
            f = features_extract.extract(sample).reshape(shp)
            act_values = model.predict(f)
            action = np.argmax(act_values[0])
            sample = spam_manipulatorer.modify(sample, ACTION_LOOKUP[action])

    print "Sum:{} Success:{}".format(sum, success)
    return agent, model
mode = 'predict' if len(sys.argv) < 2 else sys.argv[1] #env = gym.make(ENV_NAME) config_path = PROJECT_ROOT / "data/esquare3/config_engine.json" config = json.load(config_path.open('rt')) env = CityFlowAgent(mode='train', config_path=config_path) #np.random.seed(123) #env.seed(123) model = env.get_model() model.summary() weights_filename = env.weights_filename log_filename = 'dqn_{}_log.json'.format(ENV_NAME) memory = SequentialMemory(limit=1000000, window_length=env.config['WINDOW_LENGTH']) policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=150., value_min=0.0, value_test=.05, nb_steps=10000) policy = BoltzmannQPolicy() dqn = DQNAgent(model=model, nb_actions=env.action_space.n, policy=policy, memory=memory, nb_steps_warmup=500, gamma=.9, target_model_update=1000, train_interval=100,
# Input Layer model.add(Flatten(input_shape=(1, ) + env.observation_space.shape)) # Hidden layers for _ in range(NUM_HIDDEN_LAYERS): model.add(Dense(LAYER_SIZE)) model.add(Activation('relu')) # Output layer model.add(Dense(nb_actions)) model.add(Activation('linear')) print(model.summary()) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=NUM_STEPS, window_length=1) # train_policy = BoltzmannQPolicy(tau=0.05) train_policy = EpsGreedyQPolicy() test_policy = GreedyQPolicy() # Compile the agent based on method specified. We use .upper() to convert to # upper case for comparison if METHOD.upper() == 'DUEL_DQN': memory = SequentialMemory(limit=NUM_STEPS, window_length=1) agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100, enable_dueling_network=True, dueling_type='avg', target_model_update=1e-2,
x = Concatenate()([action_input, Flatten()(observation_input)]) x = Dense(32)(x) x = Activation('relu')(x) x = Dense(32)(x) x = Activation('relu')(x) x = Dense(32)(x) x = Activation('relu')(x) x = Dense(((nb_actions * nb_actions + nb_actions) // 2))(x) x = Activation('linear')(x) L_model = Model(inputs=[action_input, observation_input], outputs=x) print(L_model.summary()) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! processor = PendulumProcessor() memory = SequentialMemory(limit=MEMORY_LIMIT, window_length=WINDOW_LENGHT) random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3, size=nb_actions) agent = NAFAgent(nb_actions=nb_actions, V_model=V_model, L_model=L_model, mu_model=mu_model, memory=memory, nb_steps_warmup=NB_STEPS_WARMUP, random_process=random_process, gamma=GAMMA, target_model_update=TARGET_MODEL_UPDATE, processor=processor) agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])
def training_game(): env = Environment( map_name="ForceField", visualize=True, game_steps_per_episode=150, agent_interface_format=features.AgentInterfaceFormat( feature_dimensions=features.Dimensions(screen=64, minimap=32))) input_shape = (_SIZE, _SIZE, 1) nb_actions = 12 # Number of actions model = neural_network_model(input_shape, nb_actions) memory = SequentialMemory(limit=5000, window_length=_WINDOW_LENGTH) processor = SC2Proc() # Policy policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr="eps", value_max=1, value_min=0.2, value_test=.0, nb_steps=1e2) # Agent dqn = DQNAgent( model=model, nb_actions=nb_actions, memory=memory, enable_double_dqn=True, enable_dueling_network=True, # 2019-07-12 GU Zhan (Sam) when value shape problem, reduce nb_steps_warmup: # nb_steps_warmup=300, target_model_update=1e-2, policy=policy, nb_steps_warmup=500, target_model_update=1e-2, policy=policy, batch_size=150, processor=processor, delta_clip=1) dqn.compile(Adam(lr=.001), metrics=["mae", "acc"]) # Tensorboard callback timestamp = f"{datetime.datetime.now():%Y-%m-%d %I:%M%p}" # 2019-07-12 GU Zhan (Sam) folder name for Lunux: # callbacks = keras.callbacks.TensorBoard(log_dir='./Graph/'+ timestamp, histogram_freq=0, # write_graph=True, write_images=False) # 2019-07-12 GU Zhan (Sam) folder name for Windows: callbacks = keras.callbacks.TensorBoard(log_dir='.\Graph\issgz', histogram_freq=0, write_graph=True, write_images=False) # Save the parameters and upload them when needed name = "agent" w_file = "dqn_{}_weights.h5f".format(name) check_w_file = "train_w" + name + "_weights.h5f" if SAVE_MODEL: check_w_file = "train_w" + name + "_weights_{step}.h5f" log_file = "training_w_{}_log.json".format(name) if LOAD_MODEL: dqn.load_weights(w_file) class Saver(Callback): def on_episode_end(self, episode, logs={}): if episode % 200 == 0: self.model.save_weights(w_file, overwrite=True) s = Saver() logs = FileLogger('DQN_Agent_log.csv', interval=1) dqn.fit(env, callbacks=[callbacks, s, logs], nb_steps=600, action_repetition=2, log_interval=1e4, verbose=2) dqn.save_weights(w_file, overwrite=True) dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)
print(model.summary()) Agent = AGENT_DIC[args.agent] if args.agent == 'cem': memory = EpisodeParameterMemory(limit=args.memory_limit, window_length=args.batch_size) agent = Agent(model=model, nb_actions=nb_actions, memory=memory, batch_size=args.batch_size, nb_steps_warmup=args.steps_warmup, train_interval=1, elite_frac=args.elite_frac) agent.compile() elif args.agent == 'dqn': memory = SequentialMemory(limit=args.memory_limit, window_length=args.batch_size) policy = BoltzmannQPolicy() agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=args.batch_size, nb_steps_warmup=args.steps_warmup, target_model_update=1e-2, policy=policy) agent.compile(Adam(lr=1e-3), metrics=['mae']) if args.train == 1: if not args.wandb_flag: # import ipdb; ipdb.set_trace() agent.fit(env, nb_steps=args.nb_steps_train, visualize=False,
def train( self, env, input_fn, max_steps=10000, policy=EpsGreedyQPolicy(), memory=SequentialMemory(limit=1000, window_length=1), target_model_update=10000, gamma=0.99, warmup_steps=None, batch_size=64, summary_steps=100, visualize=False, ): min_memory = max( warmup_steps, batch_size) if warmup_steps is not None else batch_size with tf.Graph().as_default() as graph: inputs = input_fn() print(inputs) tf.train.get_or_create_global_step() ##################### # start model_fn with tf.variable_scope("Model") as model_scope: model_inputs = dict(state=inputs["state0"]) model_q_values = self.model_fn(model_inputs, tf.estimator.ModeKeys.TRAIN, self.params) with tf.variable_scope("Model", reuse=True) as predict_scope: model_inputs = dict(state=inputs["state0"]) predict_q_values = self.model_fn(model_inputs, tf.estimator.ModeKeys.PREDICT, self.params) with tf.variable_scope("TargetModel") as target_scope: target_model_inputs = dict(state=inputs["state1"]) target_q_values = self.model_fn(target_model_inputs, tf.estimator.ModeKeys.PREDICT, self.params) # get variables model_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=model_scope.name) target_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=target_scope.name) not_terminal = 1.0 - tf.cast(inputs["terminal"], tf.float32) action_values = tf.reduce_max(target_q_values, axis=1) target_values = inputs[ "reward"] + gamma * action_values * not_terminal assert action_values.get_shape().as_list( ) == inputs["reward"].get_shape().as_list() assert action_values.get_shape().as_list( ) == not_terminal.get_shape().as_list() model_action_values = utils.select_columns(model_q_values, inputs["action"]) tf.losses.mean_squared_error(target_values, model_action_values) # loss = tf.reduce_mean(loss) loss = tf.losses.get_total_loss() optimizer = tf.train.AdamOptimizer( learning_rate=self.params.learning_rate) with tf.control_dependencies( tf.get_collection(tf.GraphKeys.UPDATE_OPS)): train_op = optimizer.minimize( loss, global_step=tf.train.get_global_step(), var_list=tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope=model_scope.name), ) tf.summary.scalar("target", tf.reduce_mean(target_values)) # end model_fn ##################### ##################### # train stuff tf.summary.scalar("loss", loss) train_summaries = tf.summary.merge_all() if target_model_update >= 1: update_target_op = tf.cond( # global_step % target_model_update == 0 tf.equal( tf.mod(tf.train.get_global_step(), target_model_update), 0, ), lambda: update_target_weights_hard(target_variables, model_variables), lambda: tf.no_op(), ) else: update_target_op = update_target_weights_soft( target_variables, model_variables, target_model_update) final_train_op = tf.group( train_op, update_target_op, ) # train stuff ##################### ##################### # episode stuff episode_length_t = tf.placeholder(tf.int32, name="episode_length") episode_reward_t = tf.placeholder(tf.int32, name="episode_reward") episode_length_summary = tf.summary.scalar("episode_length", episode_length_t) episode_reward_summary = tf.summary.scalar("episode_reward", episode_reward_t) final_episode_op = tf.group() episode_summaries = tf.summary.merge( [episode_length_summary, episode_reward_summary]) # episode stuff ##################### state0_t, reward_t, terminal_t, action_t, state1_t = [ inputs[x] for x in ["state0", "reward", "terminal", "action", "state1"] ] global_variables_initializer = tf.global_variables_initializer() graph.finalize() writer = tf.summary.FileWriter(self.model_dir) 
with tf.Session(graph=graph) as sess: utils.initialize_or_restore(sess, self.model_dir, global_variables_initializer) current_step = sess.run(tf.train.get_global_step()) state0 = env.reset() _episode_length = 0 _episode_reward = 0.0 for step in range(current_step, max_steps): step_feed = {state0_t: [state0]} predictions = sess.run(predict_q_values, step_feed) action = policy.select_action(q_values=predictions[0]) state1, reward, terminal, _info = env.step(action) if visualize: env.render() # _episode_length += 1 _episode_reward += reward # memory.append(state0, action, reward, terminal) train_fetches = {} train_feed = {} if memory.nb_entries > min_memory: experiences = memory.sample(batch_size) experiences = [list(x) for x in zip(*experiences)] state0_a, action_a, reward_a, state1_a, terminal_a = experiences state0_a = np.squeeze(state0_a) state1_a = np.squeeze(state1_a) train_feed.update({ state0_t: state0_a, action_t: action_a, reward_t: reward_a, state1_t: state1_a, terminal_t: terminal_a, }) train_fetches["train_op"] = final_train_op if step % summary_steps == 0: train_fetches["train_summaries"] = train_summaries if terminal: train_feed[episode_length_t] = _episode_length train_feed[episode_reward_t] = _episode_reward train_fetches["episode_op"] = final_episode_op train_fetches["episode_summaries"] = episode_summaries if step % summary_steps == 0: pass # do training results = sess.run(train_fetches, train_feed) if "train_summaries" in results: writer.add_summary( results["train_summaries"], step, ) if "episode_summaries" in results: writer.add_summary( results["episode_summaries"], step, ) # end step if terminal: state0 = env.reset() # _episode_length = 0 _episode_reward = 0.0 # else: state0 = state1
nb_actions = env.action_space.n # agent network model = Sequential() model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) model.add(Dense(32*scale)) model.add(Activation('relu')) model.add(Dense(16*scale)) model.add(Activation('relu')) model.add(Dense(8*scale)) model.add(Activation('relu')) model.add(Dense(nb_actions, activation='softmax')) print(model.summary()) # spcifications for the RL agent memory = SequentialMemory(limit=replay_size, window_length=win_len) policy = EpsGreedyQPolicy() dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=nb_steps_warmup, enable_dueling_network=True, dueling_type='avg', target_model_update=target_model_update, policy=policy) # compiling the model dqn.compile(Adam(lr=lrn_rate), metrics=['mae']) # setting up callbacks for result collection and realtime visualization of the results through tensorboard tensorboard = TensorBoard(log_dir="logs/{}".format(time())) tpl = TrainEpisodeLogger() # finally perform the training----- visualize=False enables training without visualizing the game which speeds up the training process dqn.fit(env, nb_steps=nb_steps, visualize=False, verbose=2, callbacks=[tensorboard, tpl], nb_max_episode_steps=nb_max_episode_steps) # save the model weights
    model.add(Dense(50))
    model.add(Activation('relu'))
    model.add(Dense(100))
    return model


# In[13]:

model = nn_model()
model.summary()


# In[21]:

policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(model=model, nb_actions=100, memory=memory, nb_steps_warmup=100,
               target_model_update=1e-3, policy=policy)
dqn.compile(Adam(lr=1e-4), metrics=['mae'])

# Okay, now it's time to learn something! Visualizing the training slows it down quite a lot,
# so it is disabled here.
history = dqn.fit(env, nb_steps=100000, visualize=False, verbose=1)


# In[19]:

print(history)
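# A small follow-up sketch (not in the original notebook): keras-rl's fit() returns a Keras
# History object, and its .history dict typically contains per-episode statistics such as
# 'episode_reward' and 'nb_episode_steps', which can be plotted to inspect training progress.
import matplotlib.pyplot as plt

plt.plot(history.history['episode_reward'])
plt.xlabel('Episode')
plt.ylabel('Episode reward')
plt.show()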
def training_game(): env = Environment( map_name="HallucinIce", visualize=True, game_steps_per_episode=150, agent_interface_format=features.AgentInterfaceFormat( feature_dimensions=features.Dimensions(screen=64, minimap=32))) input_shape = (_SIZE, _SIZE, 1) nb_actions = 12 # Number of actions model = neural_network_model(input_shape, nb_actions) memory = SequentialMemory(limit=5000, window_length=_WINDOW_LENGTH) processor = SC2Proc() # Policy policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr="eps", value_max=1, value_min=0.2, value_test=.0, nb_steps=1e2) # Agent dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, enable_double_dqn=True, enable_dueling_network=True, nb_steps_warmup=500, target_model_update=1e-2, policy=policy, batch_size=150, processor=processor, delta_clip=1) dqn.compile(Adam(lr=.001), metrics=["mae", "acc"]) # Tensorboard callback callbacks = keras.callbacks.TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=False) # Save the parameters and upload them when needed name = "HallDebbugeed" w_file = "dqn_{}_weights.h5f".format(name) check_w_file = "train_w" + name + "_weights.h5f" if SAVE_MODEL: check_w_file = "train_w" + name + "_weights_{step}.h5f" log_file = "training_w_{}_log.json".format(name) if LOAD_MODEL: dqn.load_weights(w_file) dqn.fit(env, callbacks=[callbacks], nb_steps=1e7, action_repetition=2, log_interval=1e4, verbose=2) dqn.save_weights(w_file, overwrite=True) dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)