p_phys=all_configs["p_phys"], p_meas=all_configs["p_meas"], error_model=all_configs["error_model"], use_Y=all_configs["use_Y"], volume_depth=all_configs["volume_depth"], static_decoder=static_decoder) # ------------------------------------------------------------------------------------------- model = build_convolutional_nn(all_configs["c_layers"], all_configs["ff_layers"], env.observation_space.shape, env.num_actions) memory = SequentialMemory(limit=all_configs["buffer_size"], window_length=1) policy = LinearAnnealedPolicy( EpsGreedyQPolicy(masked_greedy=all_configs["masked_greedy"]), attr='eps', value_max=all_configs["max_eps"], value_min=all_configs["final_eps"], value_test=0.0, nb_steps=all_configs["exploration_fraction"]) test_policy = GreedyQPolicy(masked_greedy=True) # ------------------------------------------------------------------------------------------ dqn = DQNAgent(model=model, nb_actions=env.num_actions, memory=memory, nb_steps_warmup=all_configs["learning_starts"], target_model_update=all_configs["target_network_update_freq"], policy=policy, test_policy=test_policy, gamma=all_configs["gamma"],
model = Sequential() model.add(Flatten(input_shape=(WINDOW_LENGTH, 128))) model.add(Dropout(0.2)) model.add(Dense(32, activation='relu')) model.add(Dropout(0.2)) model.add(Dense(nb_actions, activation='linear')) model.summary() NUM_STEPS = 10000 memory = SequentialMemory(limit=round(0.75 * NUM_STEPS), window_length=WINDOW_LENGTH) policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=0.05, nb_steps=round(0.8 * NUM_STEPS)) test_policy = EpsGreedyQPolicy(eps=0.05) dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100, processor=ScoreProcessor(), target_model_update=1e-2, policy=policy, test_policy=test_policy) dqn.compile(Adam(lr=5e-4), metrics=['mae']) tensorboard = create_logger(ENV_NAME) checkpoint = create_model_checkpoint(ENV_NAME,
def main(params=None): """ performs training and evaluation of params :return: model """ if params is None: params = { 'model_type': 'dqn_agent', 'l1_out': 128, 'l2_out': 64, 'gamma': 0.5, 'target_model_update': 1, 'delta_clip': 0.01, 'nb_steps_warmup': 1000 } model_type = 'dqn_agent' env_player = SimpleRLPlayer(battle_format="gen8randombattle") # print('env_player',env_player) # print('help', help(env_player)) env_player2 = SimpleRLPlayer(battle_format="gen8randombattle") opponent = RandomPlayer(battle_format="gen8randombattle") second_opponent = MaxDamagePlayer(battle_format="gen8randombattle") # Output dimension n_action = len(env_player.action_space) # model_params = { # 'n_actions': n_action, # 'l1_out': 128, # 'l2_out': 64, # 'model_type': params['model_type'] # } model_params = params model_params['n_actions'] = n_action model = get_model(model_params) # print('first model summary') # print(model.summary()) # model = Sequential() # model.add(Dense(128, activation="elu", input_shape=(1, 10))) # # # Our embedding have shape (1, 10), which affects our hidden layer # # dimension and output dimension # # Flattening resolve potential issues that would arise otherwise # model.add(Flatten()) # model.add(Dense(64, activation="elu")) # model.add(Dense(n_action, activation="linear")) # elu activation is similar to relu # https://ml-cheatsheet.readthedocs.io/en/latest/activation_functions.html#elu # determine memory type if params['model_type'] in {'dqn_agent', 'sarsa_agent'}: # memory = SequentialMemory(limit=10000, window_length=1) memory = SequentialMemory(limit=NB_TRAINING_STEPS, window_length=1) else: memory = EpisodeParameterMemory(limit=10000, window_length=1) # Simple epsilon greedy # What is linear annealed policy? # - this policy gives gradually decreasing thresholds for the epsilon greedy policy # - it acts as a wrapper around epsilon greedy to feed in a custom threshold pol_steps = NB_TRAINING_STEPS policy = LinearAnnealedPolicy( EpsGreedyQPolicy(), attr="eps", value_max=1.0, value_min=0.05, value_test=0, nb_steps=pol_steps, ) # pol_steps = NB_TRAINING_STEPS policy_boltz = BoltzmannQPolicy(tau=1) # policy = LinearAnnealedPolicy( # BoltzmannQPolicy(), # attr="tau", # value_max=1.0, # value_min=0.05, # value_test=0, # nb_steps=pol_steps, # ) policy = policy_boltz # Defining our DQN # model = tf.keras.models.load_model('dqn_v_dqn') if params['model_type'] == 'dqn_agent': dqn = DQNAgent( model=model, nb_actions=len(env_player.action_space), policy=policy, memory=memory, nb_steps_warmup=params['nb_steps_warmup'], gamma=params['gamma'], target_model_update=params['target_model_update'], # delta_clip=0.01, delta_clip=params['delta_clip'], enable_double_dqn=params['enable_double_dqn__'], enable_dueling_network=params['enable_double_dqn__'], dueling_type=params['dueling_type__']) dqn.compile(Adam(lr=0.00025), metrics=["mae"]) elif params['model_type'] == 'sarsa_agent': dqn = SARSAAgent(model=model, nb_actions=len(env_player.action_space), policy=policy, nb_steps_warmup=params['nb_steps_warmup'], gamma=params['gamma'], delta_clip=params['delta_clip']) dqn.compile(Adam(lr=0.00025), metrics=["mae"]) else: # CEMAgent # https://towardsdatascience.com/cross-entropy-method-for-reinforcement-learning-2b6de2a4f3a0 dqn = CEMAgent(model=model, nb_actions=len(env_player.action_space), memory=memory, nb_steps_warmup=params['nb_steps_warmup']) # different compile function dqn.compile() # dqn.compile(Adam(lr=0.00025), metrics=["mae"]) # opponent dqn dqn_opponent = DQNAgent( model=model, 
nb_actions=len(env_player.action_space), policy=policy, memory=memory, nb_steps_warmup=params['nb_steps_warmup'], gamma=params['gamma'], target_model_update=params['target_model_update'], # delta_clip=0.01, delta_clip=params['delta_clip'], enable_double_dqn=params['enable_double_dqn__'], enable_dueling_network=params['enable_double_dqn__'], dueling_type=params['dueling_type__']) dqn_opponent.compile(Adam(lr=0.00025), metrics=["mae"]) # NB_TRAINING_STEPS = NB_TRAINING_STEPS # rl_opponent = TrainedRLPlayer(model) # Training rounds = 4 n_steps = NB_TRAINING_STEPS // rounds for k in range(rounds): env_player.play_against( env_algorithm=dqn_training, opponent=opponent, env_algorithm_kwargs={ "dqn": dqn, "nb_steps": n_steps }, ) env_player.play_against( env_algorithm=dqn_training, opponent=second_opponent, env_algorithm_kwargs={ "dqn": dqn, "nb_steps": n_steps }, ) name = params["name"] + "_model" model.save(name) # loaded_model = tf.keras.models.load_model(name) # Evaluation print("Results against random player:") env_player.play_against( env_algorithm=dqn_evaluation, opponent=opponent, env_algorithm_kwargs={ "dqn": dqn, "nb_episodes": NB_EVALUATION_EPISODES }, ) print("\nResults against max player:") env_player.play_against( env_algorithm=dqn_evaluation, opponent=second_opponent, env_algorithm_kwargs={ "dqn": dqn, "nb_episodes": NB_EVALUATION_EPISODES }, ) return model
#no duel model = Sequential() model.add(Flatten(input_shape=(1, ) + env.observation_space.shape)) model.add(Dense(20)) model.add(Activation('relu')) model.add(Dense(20)) model.add(Activation('relu')) model.add(Dense(20)) model.add(Activation('relu')) model.add(Dense(nb_actions)) model.add(Activation('linear')) print(model.summary()) policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.000002, value_test=.05, nb_steps=500000) noDuelAgent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, target_model_update=1e-2, policy=policy) noDuelAgent.compile(Adam(lr=1e-3), metrics=['mae']) hist = noDuelAgent.fit(env, nb_max_episode_steps=10000, visualize=False, verbose=2, nb_steps=500000) reward_his_noDuel = hist.history.get('episode_reward')
def training_game(): env = Environment( map_name="HallucinIce", visualize=True, game_steps_per_episode=150, agent_interface_format=features.AgentInterfaceFormat( feature_dimensions=features.Dimensions(screen=64, minimap=32))) input_shape = (_SIZE, _SIZE, 1) nb_actions = 12 # Number of actions model = neural_network_model(input_shape, nb_actions) memory = SequentialMemory(limit=5000, window_length=_WINDOW_LENGTH) processor = SC2Proc() # Policy policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr="eps", value_max=1, value_min=0.7, value_test=.0, nb_steps=1e6) # Agent dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, enable_double_dqn=False, nb_steps_warmup=500, target_model_update=1e-2, policy=policy, batch_size=150, processor=processor) dqn.compile(Adam(lr=.001), metrics=["mae"]) # Save the parameters and upload them when needed name = "HallucinIce" w_file = "dqn_{}_weights.h5f".format(name) check_w_file = "train_w" + name + "_weights.h5f" if SAVE_MODEL: check_w_file = "train_w" + name + "_weights_{step}.h5f" log_file = "training_w_{}_log.json".format(name) callbacks = [ModelIntervalCheckpoint(check_w_file, interval=1000)] callbacks += [FileLogger(log_file, interval=100)] if LOAD_MODEL: dqn.load_weights(w_file) dqn.fit(env, callbacks=callbacks, nb_steps=1e7, action_repetition=2, log_interval=1e4, verbose=2) dqn.save_weights(w_file, overwrite=True) dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)
def train(index, policy_nb_steps, fit_nb_steps): # Get the environment and extract the number of actions. print("Using environment", environment_name) environment = gym.make(environment_name) environment = CarRacingDiscreteWrapper(environment) np.random.seed(666) nb_actions = environment.action_space.n # Build the model. model = build_model((WINDOW_LENGTH, ) + INPUT_SHAPE, nb_actions) print(model.summary()) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH) processor = CarRacingProcessor() # Select a policy. We use eps-greedy action selection, which means that a random action is selected # with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that # the agent initially explores the environment (high eps) and then gradually sticks to what it knows # (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05 # so that the agent still performs some random actions. This ensures that the agent cannot get stuck. policy = LinearAnnealedPolicy( EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05, #nb_steps=1000000 nb_steps=policy_nb_steps) # The trade-off between exploration and exploitation is difficult and an on-going research topic. # If you want, you can experiment with the parameters or use a different policy. Another popular one # is Boltzmann-style exploration: # policy = BoltzmannQPolicy(tau=1.) # Feel free to give it a try! dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, processor=processor, nb_steps_warmup=50000, gamma=.99, target_model_update=10000, train_interval=4, delta_clip=1.) dqn.compile(optimizers.Adam(lr=.00025), metrics=['mae']) weights_filename = 'dqn_{}_{}_weights.h5f'.format(environment_name, index) # Okay, now it's time to learn something! We capture the interrupt exception so that training # can be prematurely aborted. Notice that now you can use the built-in Keras callbacks! checkpoint_weights_filename = 'dqn_' + environment_name + '_weights_{step}.h5f' log_filename = 'dqn_{}_log.json'.format(environment_name) callbacks = [ ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000) ] callbacks += [TensorboardCallback()] callbacks += [FileLogger(log_filename, interval=100)] dqn.fit( environment, callbacks=callbacks, #nb_steps=1750000, nb_steps=fit_nb_steps, log_interval=10000, visualize="visualize" in sys.argv) # After training is done, we save the final weights one more time. dqn.save_weights(weights_filename, overwrite=True)
model.add(Dense(nb_actions, activation="linear")) print(model.summary()) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH) processor = AtariProcessor() # Select a policy. We use eps-greedy action selection, which means that a random action is selected # with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that # the agent initially explores the environment (high eps) and then gradually sticks to what it knows # (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05 # so that the agent still performs some random actions. This ensures that the agent cannot get stuck. policy = LinearAnnealedPolicy(MaxBoltzmannQPolicy(), attr='eps', value_max=1., value_min=.05, value_test=.001, nb_steps=1000000) # The trade-off between exploration and exploitation is difficult and an on-going research topic. # If you want, you can experiment with the parameters or use a different policy. Another popular one # is Boltzmann-style exploration: # policy = BoltzmannQPolicy(tau=1.) # Feel free to give it a try! # policy = LinearAnnealedPolicy(BoltzmannQPolicy(), attr='tau', value_max=10., value_min=.1, value_test=.05, nb_steps=1000000) dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, processor=processor,
def main(shape=10, winsize=4, test=False, num_max_test=200, visualize_training=False, start_steps=0, randseed=None, human_mode_sleep=0.02): INPUT_SHAPE = (shape, shape) WINDOW_LENGTH = winsize class SnakeProcessor(Processor): def process_observation(self, observation): # assert observation.ndim == 1, str(observation.shape) # (height, width, channel) assert observation.shape == INPUT_SHAPE return observation.astype( 'uint8') # saves storage in experience memory def process_state_batch(self, batch): # We could perform this processing step in `process_observation`. In this case, however, # we would need to store a `float32` array instead, which is 4x more memory intensive than # an `uint8` array. This matters if we store 1M observations. processed_batch = batch.astype('float32') / 255. return processed_batch def process_reward(self, reward): return reward try: randseed = int(randseed) print(f"set seed to {randseed}") except Exception: print(f"failed to intify seed of {randseed}, making it None") randseed = None env = gym.make('snakenv-v0', gs=shape, seed=randseed, human_mode_sleep=human_mode_sleep) np.random.seed(123) env.seed(123) input_shape = (WINDOW_LENGTH, ) + INPUT_SHAPE model = make_model(input_shape, 5) memory = SequentialMemory(limit=100000, window_length=WINDOW_LENGTH) processor = SnakeProcessor() start_policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=0, value_min=0, value_test=0, nb_steps=500000) policy = BoltzmannQPolicy(tau=0.25) interval = 20000 dqn = DQNAgent(model=model, nb_actions=5, policy=policy, memory=memory, processor=processor, nb_steps_warmup=2000, gamma=.99, target_model_update=interval, train_interval=4, delta_clip=1.) dqn.compile(Adam(), metrics=['mae']) weights_filename = 'dqn_snake_weights.h5f' if not test: if os.path.exists('starting_weights.h5'): print('loadin!') model.load_weights('starting_weights.h5') # Okay, now it's time to learn something! We capture the interrupt exception so that training # can be prematurely aborted. Notice that now you can use the built-in Keras callbacks! weights_filename = 'dqn_{}_weights.h5f'.format('snake') checkpoint_weights_filename = 'dqn_' + 'snake' + '_weights_{step}.h5f' log_filename = 'dqn_{}_log.json'.format('snake') callbacks = [ ModelIntervalCheckpoint(checkpoint_weights_filename, interval=interval) ] callbacks += [ ModelIntervalCheckpoint(weights_filename, interval=interval) ] callbacks += [FileLogger(log_filename, interval=500)] callbacks += [WandbLogger(project="snake-rl")] dqn.fit(env, callbacks=callbacks, nb_steps=10000000, log_interval=10000, visualize=visualize_training, nb_max_start_steps=start_steps) # After training is done, we save the final weights one more time. # dqn.save_weights(weights_filename, overwrite=True) # Finally, evaluate our algorithm for 10 episodes. # dqn.test(env, nb_episodes=10, visualize=True, nb_max_episode_steps=100) else: while True: try: dqn.load_weights(weights_filename) except Exception: print("weights not found, waiting") dqn.test(env, nb_episodes=10, visualize=visualize_training, nb_max_episode_steps=num_max_test) time.sleep(3)
class BetaFlapDQN(DQNAgent): def __init__(self, inputs, buffer, sess_id, sess, **kwargs): self.util = Utility() self.sess = sess self.sess_id = sess_id game = inputs['game'] agnt = inputs['agent'] sess = agnt['session'] eps = sess['episode'] mod = inputs['model'] trn = mod['training'] sv = mod['save'] mem = inputs['memory'] '''---Environment Paramters---''' self.env_name = game['name'] self.fps = game['fps'] self.mode = game['difficulty'] self.target = game['target'] self.tick = game['tick'] '''---Episode Parameters---''' self.nb_episodes = sess['max_ep'] self.nb_max_episode_steps = game['fps'] * 60 * eps['max_time'] self.nb_steps = self.nb_max_episode_steps * self.nb_episodes self.nb_steps_warmup = trn['warmup'] self.nb_max_start_steps = trn['max_ep_observe'] self.max_start_steps = trn['warmup'] self.keep_gif_score = eps['keep_gif_score'] '''---Agent / Model Parameters---''' self.name = agnt['name'] self.nb_actions = agnt['action_size'] self.delta_clip = agnt['delta_clip'] self.training = trn['training'] self.verbose = trn['verbose'] self.lr = trn['learn_rate'] self.eps = trn['initial_epsilon'] self.value_max = trn['initial_epsilon'] self.value_min = trn['terminal_epsilon'] self.anneal = trn['anneal'] self.shuffle = trn['shuffle'] self.train_interval = trn['interval'] self.validate = trn['validate'] self.split = trn['split'] self.action_repetition = trn['action_repetition'] self.epochs = trn['epochs'] self.epoch = 1 prec = km.binary_precision() re = km.binary_recall() f1 = km.binary_f1_score() self.metrics = ['accuracy', 'mse', prec, re, f1] self.H = mod['filter_size'] self.alpha = mod['alpha'] self.gamma = mod['gamma'] self.momentum = mod['momentum'] self.decay = mod['decay'] self.target_model_update = mod['target_update'] self.type = mod['type'] self.enable_double_dqn = mod['double_dqn'] self.enable_dueling_network = mod['dueling_network'] self.dueling_type = mod['dueling_type'] self.limit = mem['limit'] self.batch_size = mem['batch_size'] self.window_length = mem['state_size'] self.memory_interval = mem['interval'] self.ftype = sv['ftype'] self.vizualize = sv['visualize'] self.save_full = sv['save_full'] self.save_weights = sv['save_weights'] self.save_json = sv['save_json'] self.save_plot = sv['save_plot'] self.save_interval = sv['save_n'] self.log_interval = sv['log_n'] self.saves = sv['save_path'] self.save_path = self.util.get_save_dir_struct(self.saves, self.env_name) self.logs = sv['log_path'] self.util.display_status('Hyperparameters Successfully Loaded') '''Reference/Excerpt: keras-rl DQN Atari Example https://github.com/keras-rl/keras-rl/blob/master/examples/dqn_atari.py # Select a policy. # We use eps-greedy action selection, which means that a random action # is selected with probability eps. We anneal eps from init to term over # the course of (anneal) steps. This is done so that the agent initially # explores the environment (high eps) and then gradually sticks to # what it knows (low eps). We also set a dedicated eps value that is # used during testing. Note that we set it to 0.05 so that the agent # still performs some random actions. # This ensures that the agent cannot get stuck. 
# ''' self.custom_model_objects = { 'S': self.window_length, 'A': self.nb_actions, 'H': self.H, 'lr': self.lr, 'name': self.name, 'batch_size': self.batch_size, 'sess': self.sess, #dueling_network=self.enable_dueling_network, #dueling_type=self.dueling_type, } with tf.device(gpu): self.policy = LinearAnnealedPolicy( inner_policy=EpsGreedyQPolicy(eps=self.value_max), attr='eps', value_max=self.value_max, value_min=self.value_min, value_test=self.alpha, nb_steps=self.anneal) self.test_policy = GreedyQPolicy() if mod['optimizer'].lower() == 'adamax': self.optimizer = Adamax(lr=self.lr) elif mod['optimizer'].lower() == 'adadelta': self.optimizer = Adadelta() elif mod['optimizer'].lower() == 'rmsprop': self.optimizer = RMSprop() elif mod['optimizer'].lower() == 'sgd': self.optimizer = SGD( lr=self.lr, momentum=self.momentum, decay=self.decay, ) else: self.optimizer = Adam(lr=self.lr) self.memory = buffer self.log_path = self.util.get_log_dir_struct(self.sess_id, self.logs, self.ftype) self.util.display_status('Keras GPU Session {} Beginning'.format( self.sess_id)) nn = NeuralNet( S=self.window_length, A=self.nb_actions, H=self.H, lr=self.lr, name=self.name, batch_size=self.batch_size, dueling_network=self.enable_dueling_network, dueling_type=self.dueling_type, sess=self.sess, ) with tf.device(gpu): self.model = nn.get_model() self.util.display_status( '{} Keras Agent with {} Optimizer Built'.format( self.name, mod['optimizer'])) '''---Compile the model with chosen optimizer loss is calculated with lamba function based on model type selections (dueling, or double dqn)''' with tf.device(gpu): self.compile( optimizer=self.optimizer, metrics=self.metrics, ) self.util.display_status( '{} Agent Fully Initialized with Compiled Model'.format(self.name)) super(BetaFlapDQN, self).__init__( model=self.model, nb_actions=self.nb_actions, memory=self.memory, policy=self.policy, test_policy=self.test_policy, enable_double_dqn=self.enable_double_dqn, enable_dueling_network=self.enable_dueling_network, dueling_type=self.dueling_type, **kwargs) def load_saved_model_weights(self): try: self.model.load_weights('saved/FlappyBird_weights.h5') self.util.display_status('Saved Keras Model Weights Loaded') except: self.util.display_status('No Saved Keras Model Weights Found') def fit(self, iteration=1, max_iteration=1): self.load_saved_model_weights() with tf.device(gpu): self.env = Environment( target_score=self.target, difficulty=self.mode, fps=self.fps, tick=self.tick, ) self.util.display_status('{} Environment Emulation Initialized'.format( self.env_name)) if self.action_repetition < 1: raise ValueError( 'action_repetition must be >= 1, is {}'.\ format(self.action_repetition) ) '''---Define Custom Callbacks and Processors BetaFlap''' FlappyCall = FlappySession() Flappy = FlappyProcessor() '''---Flag Agent with as Training with on_train_begin()''' self._on_train_begin() FlappyCall.on_train_begin() self.training = True observation = None reward = None done = False info = None status = 'play' episode = np.int16(0) self.step = np.int16(0) action = np.int16(0) self.randQ = np.int16(0) self.reward = np.float16(0) idx = np.int16(0) flap = False episode_reward = None episode_score = None episode_step = None did_abort = False '''---Begin stepping through Episodes---''' # continue while global step is < max session steps while self.step < self.nb_steps: gc.collect() if observation is None: # new episode '''---Initialize Environment with No Action''' FlappyCall.on_episode_begin(episode) self.reset_states() # reset all episode 
tracking parameters reward = None done = False info = {} action = None episode_step = np.int16(0) episode_score = np.int16(0) episode_reward = np.float32(0) wake = np.zeros([self.nb_actions]) # [0, 0] wake[0] = 1 # [1, 0] --> don't flap o, r, done, info = self.env.step(wake) # progress env 1 frame observation, r = Flappy.process_step(o, r, done, info) assert observation is not None '''---Each episode, begin with n random actions/steps''' if self.nb_max_start_steps == 0: self.nb_random_start_steps = 0 else: self.nb_random_start_steps = \ np.random.randint(self.nb_max_start_steps) '''---Perform random nb steps w/ rand action without adding them to experience replay memory''' for _ in range(self.nb_random_start_steps): action = np.zeros([self.nb_actions]) randQ = rand.randrange(self.nb_actions) action[randQ] = 1 # flag selected action o, r, done, info = self.env.step( action) # progress env 1 frame episode_step += 1 '''---Process output of randomized actions without updating cumulative episode totals''' observation = deepcopy(o) observation, r = \ Flappy.process_step(observation, r, done, info) if info['status'] == 'exit': done = True did_abort = True if done: break # warmup period complete assert episode_reward is not None assert episode_step is not None assert observation is not None gc.collect() '''---Begin Iteratively Training Model Each Step * predict Q values / action (forward step) * use reward to improve the model (backward step) ''' FlappyCall.on_step_begin(episode_step) '''---Predict Q Values Using Forward Method''' with tf.device(gpu): idx = self.forward(observation) action, flap = Flappy.process_action(idx, self.nb_actions) #episode_step += 1 reward = np.float32(0) done = False for _ in range(self.action_repetition): o, r, d, i = self.env.step(action) observation = deepcopy(o) observation, r = Flappy.process_step(o, r, d, i) reward += r done = d info = i status = info['status'] episode_step += 1 if info['status'] == 'exit': done = True did_abort = True if done: break # game over, end episode '''---Train the Model using Backward Method This function covers the bulk of the algorithm logic * store experience in memory * create experience batch, and predict Qs * train model on signle batch with selected optimizer * enable/disable double DQN or dueling network * update model target values * discount future reward and return model metrics ''' with tf.device(gpu): metrics = self.backward(reward, terminal=done) episode_reward += reward self.reward = episode_reward episode_score = info['score'] '''---Log Step Data---''' step_log = { 'step': episode_step, # track episode step nb 'episode': episode, 'metrics': metrics, 'flap': flap, 'action': action, 'reward': reward, 'done': done, 'training': self.training, 'q_values': self.q_values, 'info': info, 'x': o, 'x_t': observation, } FlappyCall.on_step_end(episode_step, step_log) gc.collect() #episode_step += 1 self.step += 1 if (self.step % self.save_interval) == 0 \ or status == 'save': self.save_model() if status == 'exit': done = True did_abort = True if self.nb_max_episode_steps and \ (episode_step >= self.nb_max_episode_steps - 1): done = True # max episode steps hit # We are in a terminal state but the agent hasn't yet seen it. 
# perform one more forward-backward call and ignore the action if done: with tf.device(gpu): self.forward(observation) self.backward(0., terminal=False) episode_log = { 'sess_id': self.sess_id, 'episode': episode, 'reward': episode_reward, 'score': episode_score, 'steps': episode_step, # track global step nb 'gif': self.keep_gif_score, 'log_path': self.logs, 'iteration': iteration, } '''Episode Complete, Proceed to Next Iteration''' FlappyCall.on_episode_end(episode, episode_log) episode += 1 observation = None episode_step = None episode_reward = None episode_score = None gc.collect() if episode > self.nb_episodes or did_abort: done = True # max episode hit break '''---Training Session Complete---''' self.save_model() session_log = { 'id': self.sess_id, 'nb_steps': self.step, 'did_abort': did_abort } FlappyCall.on_train_end(session_log, self.sess_id, self.log_path) self._on_train_end() # end training session if iteration >= max_iteration or did_abort: self.env.close() return True def forward(self, observation): # Select an action state = self.memory.get_recent_state(observation) with tf.device(gpu): self.q_values = self.compute_q_values(state) if self.training: # LinearAnneal Greedy Epsilon with tf.device(gpu): action = self.policy.select_action(q_values=self.q_values) else: # GreedyQ with tf.device(gpu): action = self.test_policy.select_action(q_values=self.q_values) # Book-keeping for experience replay self.recent_observation = observation self.recent_action = action return action def backward(self, reward, terminal): '''Store latest step in experience replay tuple''' if self.step % self.memory_interval == 0 or self.reward > .011: if self.reward > .011: self.util.display_status( 'Step {} Replay Experience Memory Saved'.format(self.step)) with tf.device(cpu): self.memory.append(np.array(self.recent_observation), np.int16(self.recent_action), np.float32(reward), terminal, training=self.training) metrics = [] if not self.training: return metrics '''Begin Training on Batches of Stored Experiences''' if self.step > self.nb_steps_warmup \ and self.step % self.train_interval == 0: with tf.device(gpu): batch = self.memory.sample(self.batch_size) assert len(batch) == self.batch_size state0_batch, reward_batch,action_batch, terminal1_batch, \ state1_batch = \ FlappyProcessor.process_state_batch(self, batch) assert reward_batch.shape == (self.batch_size, ) assert terminal1_batch.shape == reward_batch.shape assert len(action_batch) == len(reward_batch) '''Compute the Q-Values for Mini-Batch of Samples "Deep Reinforcement Learning with Double Q-learning" (van Hasselt et al., 2015): Double DQN: - online network predicts actions - target network estimates Q values. 
''' if self.enable_double_dqn: with tf.device(gpu): q_values = self.model.predict_on_batch(state1_batch) assert q_values.shape == (self.batch_size, self.nb_actions) actions = np.argmax(q_values, axis=1) assert actions.shape == (self.batch_size, ) # estimate Q values using the target network # select maxQ value with the online model (computed above) with tf.device(gpu): target_q_values = \ self.target_model.predict_on_batch(state1_batch) assert target_q_values.shape == \ (self.batch_size, self.nb_actions) q_batch = target_q_values[range(self.batch_size), actions] # Compute the q_values for state1, compute maxQ of each sample # prediction done on target_model as outlined in Mnih (2015), # it makes the algorithm is significantly more stable else: with tf.device(gpu): target_q_values = \ self.target_model.predict_on_batch(state1_batch) assert target_q_values.shape == \ (self.batch_size, self.nb_actions) q_batch = np.max(target_q_values, axis=1).flatten() assert q_batch.shape == (self.batch_size, ) targets = np.zeros((self.batch_size, self.nb_actions)) dummy_targets = np.zeros((self.batch_size, )) masks = np.zeros((self.batch_size, self.nb_actions)) # Compute r_t + gamma * max_a Q(s_t+1, a) # update the affected output targets accordingly # Set discounted reward to zero for all states that were terminal discounted_reward_batch = self.gamma * q_batch discounted_reward_batch *= terminal1_batch assert discounted_reward_batch.shape == reward_batch.shape Rs = reward_batch + discounted_reward_batch for idx, (target, mask, R, action) in enumerate( zip(targets, masks, Rs, action_batch)): target[action] = R # update with estimated accumulated reward dummy_targets[idx] = R mask[action] = 1. # enable loss for specific action targets = np.array(targets).astype('float32') masks = np.array(masks).astype('float32') '''Train Using Sample Experience Batch''' # perform a single update on the entire batch # use a dummy target, as loss is computed complex Lambda layer # still useful to know the target to compute metrics properly if type(self.model.input) is not list: ins = [state0_batch] else: state0_batch if self.validate: split = self.split else: split = 0 with tf.device(gpu): metrics = self.trainable_model.train_on_batch( ins + [targets, masks], [dummy_targets, targets]) # THIS CAUSES A MEMORY LEAK IN CURRENT CONFIGURATION #metrics = self.trainable_model.fit( # ins + [targets, masks], # [dummy_targets, targets], # batch_size=None, # epochs=self.epochs, # verbose=self.verbose, # validation_split=split, # shuffle=self.shuffle #) gc.collect() # throw away individual losses if type(metrics) is list: [m for idx, m in enumerate(metrics) if idx not in (1, 2)] else: metrics.history.update({'losses': self.policy.metrics}) if self.target_model_update >= 1 \ and self.step % self.target_model_update == 0: with tf.device(gpu): self.update_target_model_hard() return metrics def save_model(self): if self.save_full: '''---Save full model to single .h5 file---''' self.model.save(self.save_path + '_full.h5', overwrite=True) self.util.display_status('{} Model Saved to {}'.format( self.name, self.save_path + '_full.h5')) if self.save_weights: '''---Save model weights to separate .h5 file---''' self.model.save_weights(self.save_path + '_weights.h5', overwrite=True) self.util.display_status('{} Model Weights Saved to {}'.format( self.name, self.save_path + '_weights.h5')) if self.save_json: '''---Save model structure as JSON file---''' with open(self.save_path + '.json', 'a+') as f: json.dumps(self.model.to_json(), f) f.close() 
self.util.display_status('{} Model Structure Saved to {}'.format( self.name, self.save_path + '.json')) if self.save_plot: plot_model(self.model, to_file=self.save_path + '_flow.png') self.util.display_status( '{} Neural Network Diagram Saved to {}'.format( self.name, self.save_path + '_flow.png'))
def training_game(): env = Environment( map_name="HallucinIce", visualize=True, game_steps_per_episode=150, agent_interface_format=features.AgentInterfaceFormat( feature_dimensions=features.Dimensions(screen=64, minimap=32))) input_shape = (_SIZE, _SIZE, 1) nb_actions = _SIZE * _SIZE # Should this be an integer model = neural_network_model(input_shape, nb_actions) # memory : how many subsequent observations should be provided to the network? memory = SequentialMemory(limit=5000, window_length=_WINDOW_LENGTH) processor = SC2Proc() ### Policy # Agent´s behaviour function. How the agent pick actions # LinearAnnealedPolicy is a wrapper that transforms the policy into a linear incremental linear solution . Then why im not see LAP with other than not greedy ? # EpsGreedyQPolicy is a way of selecting random actions with uniform distributions from a set of actions . Select an action that can give max or min rewards # BolztmanQPolicy . Assumption that it follows a Boltzman distribution. gives the probability that a system will be in a certain state as a function of that state´s energy?? policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr="eps", value_max=1, value_min=0.7, value_test=.0, nb_steps=1e6) # policy = (BoltzmanQPolicy( tau=1., clip= (-500,500)) #clip defined in between -500 / 500 ### Agent # Double Q-learning ( combines Q-Learning with a deep Neural Network ) # Q Learning -- Bellman equation dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=500, target_model_update=1e-2, policy=policy, batch_size=150, processor=processor) dqn.compile(Adam(lr=.001), metrics=["mae"]) ## Save the parameters and upload them when needed name = "HallucinIce" w_file = "dqn_{}_weights.h5f".format(name) check_w_file = "train_w" + name + "_weights.h5f" if SAVE_MODEL: check_w_file = "train_w" + name + "_weights_{step}.h5f" log_file = "training_w_{}_log.json".format(name) callbacks = [ModelIntervalCheckpoint(check_w_file, interval=1000)] callbacks += [FileLogger(log_file, interval=100)] if LOAD_MODEL: dqn.load_weights(w_file) dqn.fit(env, callbacks=callbacks, nb_steps=1e7, action_repetition=2, log_interval=1e4, verbose=2) dqn.save_weights(w_file, overwrite=True) dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)
def test1_dialogue_system(): """ Method for testing the GODialogueSys class for the movie booking data set """ # the path to the act set and load it # TODO: change it with a relative path act_set_file_path = '/Users/vladimirilievski/Desktop/Vladimir/Master_Thesis_Swisscom/GitHub Repo/GO-Chatbots/resources/data/dia_acts.txt' act_set = util.text_to_dict(act_set_file_path) # the path to the slot set and load it # TODO: change it with a relative path slot_set_file_path = '/Users/vladimirilievski/Desktop/Vladimir/Master_Thesis_Swisscom/GitHub Repo/GO-Chatbots/resources/data/slot_set.txt' slot_set = util.text_to_dict(slot_set_file_path) # the path to the user goals and load it # TODO: change it with a relative path goal_set_file_path = '/Users/vladimirilievski/Desktop/Vladimir/Master_Thesis_Swisscom/GitHub Repo/GO-Chatbots/resources/data/user_goals_first_turn_template.part.movie.v1.p' goal_set = util.load_goal_set(goal_set_file_path) # the list of initial inform slots init_inform_slots = ['moviename'] # the ultimate slot set ultimate_request_slot = 'ticket' kb_special_slots = ['numberofpeople'] kb_filter_slots = ['ticket', 'numberofpeople', 'taskcomplete', 'closing'] # feasible actions test_feasible_actions = test1_feasible_actions() # the agent memory agt_memory = SequentialMemory(limit=1000000, window_length=4) # the agent policy agt_policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05, nb_steps=1000000) # testing policy agt_test_policy = None # all system params params = {} params[const.MAX_NB_TURNS] = 30 params[ const.KB_PATH_KEY] = '/Users/vladimirilievski/Desktop/Vladimir/Master_Thesis_Swisscom/GitHub Repo/GO-Chatbots/resources/data/movie_kb.1k.p' # Environment params params[const.SIMULATION_MODE_KEY] = const.SEMANTIC_FRAME_SIMULATION_MODE params[const.IS_TRAINING_KEY] = True params[const.USER_TYPE_KEY] = const.RULE_BASED_USER params[const.STATE_TRACKER_TYPE_KEY] = const.RULE_BASED_STATE_TRACKER params[const.SUCCESS_REWARD_KEY] = 2 * params[const.MAX_NB_TURNS] params[const.FAILURE_REWARD_KEY] = - params[const.MAX_NB_TURNS] params[const.PER_TURN_REWARD_KEY] = -1 params[ const.NLU_PATH_KEY] = "/Users/vladimirilievski/Desktop/Vladimir/Master_Thesis_Swisscom/GitHub Repo/GO-Chatbots/resources/models/nlu/lstm_[1468447442.91]_39_80_0.921.p" params[ const.DIAACT_NL_PAIRS_PATH_KEY] = '/Users/vladimirilievski/Desktop/Vladimir/Master_Thesis_Swisscom/GitHub Repo/GO-Chatbots/resources/data/dia_act_nl_pairs.v6.json' params[ const.NLG_PATH_KEY] = "/Users/vladimirilievski/Desktop/Vladimir/Master_Thesis_Swisscom/GitHub Repo/GO-Chatbots/resources/models/nlg/lstm_tanh_relu_[1468202263.38]_2_0.610.p" # Agent params params[const.AGENT_TYPE_KEY] = const.AGENT_TYPE_DQN params[const.GAMMA_KEY] = .99 params[const.BATCH_SIZE_KEY] = 32 params[const.NB_STEPS_WARMUP_KEY] = 1000 params[const.TRAIN_INTERVAL_KEY] = 1 params[const.MEMORY_INTERVAL_KEY] = 1 params[const.TARGET_MODEL_UPDATE_KEY] = 10000 params[const.ENABLE_DOUBLE_DQN_KEY] = True params[const.ENABLE_DUELING_NETWORK_KEY] = False params[const.DUELING_TYPE_KEY] = 'avg' params[const.HIDDEN_SIZE_KEY] = 80 params[const.ACTIVATION_FUNCTION_KEY] = const.RELU # create the dialogue system dialogue_sys = GODialogSys(act_set=act_set, slot_set=slot_set, goal_set=goal_set, init_inform_slots=init_inform_slots, ultimate_request_slot=ultimate_request_slot, kb_special_slots=kb_special_slots, kb_filter_slots=kb_filter_slots, agt_feasible_actions=test_feasible_actions, agt_memory=agt_memory, agt_policy=agt_policy, 
agt_test_policy=agt_test_policy, params=params)
# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=8000, window_length=1) # memory = EpisodeParameterMemory(limit=500000, window_length=1) # processor = AtariProcessor() # Select a policy. We use eps-greedy action selection, which means that a random action is selected # with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that # the agent initially explores the environment (high eps) and then gradually sticks to what it knows # (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05 # so that the agent still performs some random actions. This ensures that the agent cannot get stuck. policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=0.65, value_min=0.05, value_test=.05, nb_steps=1000000) # policy = GreedyQPolicy() # The trade-off between exploration and exploitation is difficult and an on-going research topic. # If you want, you can experiment with the parameters or use a different policy. Another popular one # is Boltzmann-batchstyle exploration: # policy = BoltzmannQPolicy(tau=1.) # Feel free to give it a try! processor = AutolabProcessor(nb_inputs=1) dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy,
def setup(difficulty_level='default', env_name = "AirSimEnv-v42"): #parser = argparse.ArgumentParser() #parser.add_argument('--mode', choices=['train', 'test'], default='train') #parser.add_argument('--env-name', type=str, default='AirSimEnv-v42') #parser.add_argument('--weights', type=str, default=None) #parser.add_argument('--difficulty-level', type=str, default="default") #args = parser.parse_args() #args, unknown = parser.parse_known_args() config = tf.ConfigProto() config.gpu_options.per_process_gpu_memory_fraction = 0.6 config.gpu_options.allow_growth = True set_session(tf.Session(config=config)) # Get the environment and extract the number of actions. #msgs.algo = "DQN" env = gym.make(env_name) env.init_again(eval("settings."+difficulty_level+"_range_dic")) env.airgym.unreal_reset() #must rest so the env accomodate the changes time.sleep(5) np.random.seed(123) env.seed(123) nb_actions = env.action_space.n WINDOW_LENGTH = 1 depth_shape = env.depth.shape vel_shape = env.velocity.shape dst_shape = env.position.shape # Keras-rl interprets an extra dimension at axis=0 # added on to our observations, so we need to take it into account img_kshape = (WINDOW_LENGTH,) + depth_shape # Sequential model for convolutional layers applied to image image_model = Sequential() if(settings.policy=='deep'): image_model.add(Conv2D(128,(3, 3), strides=(3, 3), padding='valid', activation='relu', input_shape=img_kshape, data_format="channels_first")) image_model.add(Conv2D(64, (3, 3), strides=(2, 2), padding='valid', activation='relu')) image_model.add(Conv2D(32, (3, 3), strides=(1, 1), padding='valid', activation='relu')) image_model.add(Conv2D(32, (1, 1), strides=(1, 1), padding='valid', activation='relu')) image_model.add(Flatten()) # plot_model(image_model, to_file="model_conv_depth.png", show_shapes=True) # Input and output of the Sequential model image_input = Input(img_kshape) encoded_image = image_model(image_input) # Inputs and reshaped tensors for concatenate after with the image velocity_input = Input((1,) + vel_shape) distance_input = Input((1,) + dst_shape) vel = Reshape(vel_shape)(velocity_input) dst = Reshape(dst_shape)(distance_input) # Concatenation of image, position, distance and geofence values. # 3 dense layers of 256 units denses = concatenate([encoded_image, vel, dst]) denses = Dense(1024, activation='relu')(denses) denses = Dense(1024, activation='relu')(denses) denses = Dense(512, activation='relu')(denses) denses = Dense(128, activation='relu')(denses) denses = Dense(64, activation='relu')(denses) else: image_model.add(Conv2D(32, (4, 4), strides=(4, 4), padding='valid', activation='relu', input_shape=img_kshape, data_format="channels_first")) image_model.add(Conv2D(64, (3, 3), strides=(2, 2), padding='valid', activation='relu')) image_model.add(Conv2D(128, (2, 2), strides=(1, 1), padding='valid', activation='relu')) image_model.add(Conv2D(64, (1, 1), strides=(1, 1), padding='valid', activation='relu')) image_model.add(Flatten()) # plot_model(image_model, to_file="model_conv_depth.png", show_shapes=True) # Input and output of the Sequential model image_input = Input(img_kshape) encoded_image = image_model(image_input) # Inputs and reshaped tensors for concatenate after with the image velocity_input = Input((1,) + vel_shape) distance_input = Input((1,) + dst_shape) vel = Reshape(vel_shape)(velocity_input) dst = Reshape(dst_shape)(distance_input) # Concatenation of image, position, distance and geofence values. 
# 3 dense layers of 256 units denses = concatenate([encoded_image, vel, dst]) denses = Dense(256, activation='relu')(denses) denses = Dense(256, activation='relu')(denses) denses = Dense(256, activation='relu')(denses) # Last dense layer with nb_actions for the output predictions = Dense(nb_actions, kernel_initializer='zeros', activation='linear')(denses) model = Model( inputs=[image_input, velocity_input, distance_input], outputs=predictions ) env.set_model(model) print(model.summary()) # plot_model(model,to_file="model.png", show_shapes=True) #train = True #train_checkpoint = False # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=100000, window_length=WINDOW_LENGTH) # reduce memmory processor = MultiInputProcessor(nb_inputs=3) # Select a policy. We use eps-greedy action selection, which means that a random action is selected # with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that # the agent initially explores the environment (high eps) and then gradually sticks to what it knows # (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05c # so that the agent still performs some random actions. This ensures that the agent cannot get stuck. policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=0.0, nb_steps=100000) dqn = DQNAgent(model=model, processor=processor, nb_actions=nb_actions, memory=memory, nb_steps_warmup=settings.nb_steps_warmup, enable_double_dqn=settings.double_dqn, enable_dueling_network=False, dueling_type='avg', target_model_update=1e-2, policy=policy, gamma=.99) dqn.compile(Adam(lr=0.00025), metrics=['mae']) # Load the check-point weights and start training from there return dqn,env
# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=1000, window_length=WINDOW_LENGTH) processor = AtariProcessor() # Select a policy. We use eps-greedy action selection, which means that a random action is selected # with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that # the agent initially explores the environment (high eps) and then gradually sticks to what it knows # (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05 # so that the agent still performs some random actions. This ensures that the agent cannot get stuck. nb_steps = 1000 if args.weights: nb_steps = 1 policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=.5, value_min=0, value_test=0, nb_steps=nb_steps) # The trade-off between exploration and exploitation is difficult and an on-going research topic. # If you want, you can experiment with the parameters or use a different policy. Another popular one # is Boltzmann-style exploration: # policy = BoltzmannQPolicy(tau=1.) # Feel free to give it a try! nb_steps_warmup = 100 if args.weights: nb_steps_warmup = 100 dqn = DQNAgent(model=model, nb_actions=nb_actions,
model.add(Dense(10)) model.add(Activation('tanh')) model.add(Dense(10)) model.add(Activation('tanh')) model.add(Dense(10)) model.add(Activation('tanh')) model.add(Dense(3)) model.add(Activation('linear')) print(model.summary()) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(1000000, window_length=MEMORY_WINDOW_LENGTH) policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05, nb_steps=INTERVAL) dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, nb_steps_warmup=INTERVAL, gamma=.99, target_model_update=INTERVAL, train_interval=4, delta_clip=1.) dqn.compile(Adam(lr=.00025), metrics=['mae']) # Okay, now it's time to learn something! We capture the interrupt exception so that training # can be prematurely aborted. Notice that now you can use the built-in Keras callbacks!
def __init__(self, inputs, buffer, sess_id, sess, **kwargs): self.util = Utility() self.sess = sess self.sess_id = sess_id game = inputs['game'] agnt = inputs['agent'] sess = agnt['session'] eps = sess['episode'] mod = inputs['model'] trn = mod['training'] sv = mod['save'] mem = inputs['memory'] '''---Environment Paramters---''' self.env_name = game['name'] self.fps = game['fps'] self.mode = game['difficulty'] self.target = game['target'] self.tick = game['tick'] '''---Episode Parameters---''' self.nb_episodes = sess['max_ep'] self.nb_max_episode_steps = game['fps'] * 60 * eps['max_time'] self.nb_steps = self.nb_max_episode_steps * self.nb_episodes self.nb_steps_warmup = trn['warmup'] self.nb_max_start_steps = trn['max_ep_observe'] self.max_start_steps = trn['warmup'] self.keep_gif_score = eps['keep_gif_score'] '''---Agent / Model Parameters---''' self.name = agnt['name'] self.nb_actions = agnt['action_size'] self.delta_clip = agnt['delta_clip'] self.training = trn['training'] self.verbose = trn['verbose'] self.lr = trn['learn_rate'] self.eps = trn['initial_epsilon'] self.value_max = trn['initial_epsilon'] self.value_min = trn['terminal_epsilon'] self.anneal = trn['anneal'] self.shuffle = trn['shuffle'] self.train_interval = trn['interval'] self.validate = trn['validate'] self.split = trn['split'] self.action_repetition = trn['action_repetition'] self.epochs = trn['epochs'] self.epoch = 1 prec = km.binary_precision() re = km.binary_recall() f1 = km.binary_f1_score() self.metrics = ['accuracy', 'mse', prec, re, f1] self.H = mod['filter_size'] self.alpha = mod['alpha'] self.gamma = mod['gamma'] self.momentum = mod['momentum'] self.decay = mod['decay'] self.target_model_update = mod['target_update'] self.type = mod['type'] self.enable_double_dqn = mod['double_dqn'] self.enable_dueling_network = mod['dueling_network'] self.dueling_type = mod['dueling_type'] self.limit = mem['limit'] self.batch_size = mem['batch_size'] self.window_length = mem['state_size'] self.memory_interval = mem['interval'] self.ftype = sv['ftype'] self.vizualize = sv['visualize'] self.save_full = sv['save_full'] self.save_weights = sv['save_weights'] self.save_json = sv['save_json'] self.save_plot = sv['save_plot'] self.save_interval = sv['save_n'] self.log_interval = sv['log_n'] self.saves = sv['save_path'] self.save_path = self.util.get_save_dir_struct(self.saves, self.env_name) self.logs = sv['log_path'] self.util.display_status('Hyperparameters Successfully Loaded') '''Reference/Excerpt: keras-rl DQN Atari Example https://github.com/keras-rl/keras-rl/blob/master/examples/dqn_atari.py # Select a policy. # We use eps-greedy action selection, which means that a random action # is selected with probability eps. We anneal eps from init to term over # the course of (anneal) steps. This is done so that the agent initially # explores the environment (high eps) and then gradually sticks to # what it knows (low eps). We also set a dedicated eps value that is # used during testing. Note that we set it to 0.05 so that the agent # still performs some random actions. # This ensures that the agent cannot get stuck. 
# ''' self.custom_model_objects = { 'S': self.window_length, 'A': self.nb_actions, 'H': self.H, 'lr': self.lr, 'name': self.name, 'batch_size': self.batch_size, 'sess': self.sess, #dueling_network=self.enable_dueling_network, #dueling_type=self.dueling_type, } with tf.device(gpu): self.policy = LinearAnnealedPolicy( inner_policy=EpsGreedyQPolicy(eps=self.value_max), attr='eps', value_max=self.value_max, value_min=self.value_min, value_test=self.alpha, nb_steps=self.anneal) self.test_policy = GreedyQPolicy() if mod['optimizer'].lower() == 'adamax': self.optimizer = Adamax(lr=self.lr) elif mod['optimizer'].lower() == 'adadelta': self.optimizer = Adadelta() elif mod['optimizer'].lower() == 'rmsprop': self.optimizer = RMSprop() elif mod['optimizer'].lower() == 'sgd': self.optimizer = SGD( lr=self.lr, momentum=self.momentum, decay=self.decay, ) else: self.optimizer = Adam(lr=self.lr) self.memory = buffer self.log_path = self.util.get_log_dir_struct(self.sess_id, self.logs, self.ftype) self.util.display_status('Keras GPU Session {} Beginning'.format( self.sess_id)) nn = NeuralNet( S=self.window_length, A=self.nb_actions, H=self.H, lr=self.lr, name=self.name, batch_size=self.batch_size, dueling_network=self.enable_dueling_network, dueling_type=self.dueling_type, sess=self.sess, ) with tf.device(gpu): self.model = nn.get_model() self.util.display_status( '{} Keras Agent with {} Optimizer Built'.format( self.name, mod['optimizer'])) '''---Compile the model with chosen optimizer loss is calculated with lamba function based on model type selections (dueling, or double dqn)''' with tf.device(gpu): self.compile( optimizer=self.optimizer, metrics=self.metrics, ) self.util.display_status( '{} Agent Fully Initialized with Compiled Model'.format(self.name)) super(BetaFlapDQN, self).__init__( model=self.model, nb_actions=self.nb_actions, memory=self.memory, policy=self.policy, test_policy=self.test_policy, enable_double_dqn=self.enable_double_dqn, enable_dueling_network=self.enable_dueling_network, dueling_type=self.dueling_type, **kwargs)
def training_game(): env = Environment( map_name="CollectMineralShards", visualize=True, game_steps_per_episode=150, agent_interface_format=features.AgentInterfaceFormat( feature_dimensions=features.Dimensions(screen=64, minimap=32))) input_shape = (_SIZE, _SIZE, 1) nb_actions = 12 # Number of actions model = neural_network_model(input_shape, nb_actions) memory = SequentialMemory(limit=5000, window_length=_WINDOW_LENGTH) processor = SC2Proc() # Policy policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr="eps", value_max=1, value_min=0.2, value_test=.0, nb_steps=1e2) # Agent dqn = DQNAgent( model=model, nb_actions=nb_actions, memory=memory, enable_double_dqn=True, enable_dueling_network=True, # 2019-07-12 GU Zhan (Sam) when value shape problem, reduce nb_steps_warmup: # nb_steps_warmup=300, target_model_update=1e-2, policy=policy, nb_steps_warmup=500, target_model_update=1e-2, policy=policy, batch_size=150, processor=processor, delta_clip=1) dqn.compile(Adam(lr=.001), metrics=["mae", "acc"]) # Tensorboard callback timestamp = f"{datetime.datetime.now():%Y-%m-%d %I:%M%p}" # 2019-07-12 GU Zhan (Sam) folder name for Lunux: # callbacks = keras.callbacks.TensorBoard(log_dir='./Graph/'+ timestamp, histogram_freq=0, # write_graph=True, write_images=False) # 2019-07-12 GU Zhan (Sam) folder name for Windows: callbacks = keras.callbacks.TensorBoard(log_dir='.\Graph\issgz', histogram_freq=0, write_graph=True, write_images=False) # Save the parameters and upload them when needed name = "agent" w_file = "dqn_{}_weights.h5f".format(name) check_w_file = "train_w" + name + "_weights.h5f" if SAVE_MODEL: check_w_file = "train_w" + name + "_weights_{step}.h5f" log_file = "training_w_{}_log.json".format(name) if LOAD_MODEL: dqn.load_weights(w_file) class Saver(Callback): def on_episode_end(self, episode, logs={}): if episode % 200 == 0: self.model.save_weights(w_file, overwrite=True) s = Saver() logs = FileLogger('DQN_Agent_log.csv', interval=1) dqn.fit(env, callbacks=[callbacks, s, logs], nb_steps=600, action_repetition=2, log_interval=1e4, verbose=2) dqn.save_weights(w_file, overwrite=True) dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)
async def main(): env_player = SimpleRLPlayer(server_configuration=ServerConfiguration( "localhost:8000", "https://play.pokemonshowdown.com/action.php?"), ) #opponent = RandomPlayer(player_configuration=PlayerConfiguration("USCPokebot", "uscpokebot"), #server_configuration= ServerConfiguration("localhost:8000", #"https://play.pokemonshowdown.com/action.php?"),) #second_opponent = MaxDamagePlayer(battle_format="gen8randombattle") # Output dimension n_action = len(env_player.action_space) model = Sequential() model.add(Dense(128, activation="elu", input_shape=(1, 12))) # Our embedding have shape (1, 10), which affects our hidden layer # dimension and output dimension # Flattening resolve potential issues that would arise otherwise model.add(Flatten()) model.add(Dense(128, activation="elu")) model.add(Dense(128, activation="elu")) model.add(Dense(64, activation="elu")) model.add(Dense(n_action, activation="linear")) memory = SequentialMemory(limit=10000, window_length=1) # Ssimple epsilon greedy policy = LinearAnnealedPolicy( EpsGreedyQPolicy(), attr="eps", value_max=1.0, value_min=0.05, value_test=0, nb_steps=10000, ) loaded_model = tf.keras.models.load_model('model_30000') loaded_model.load_weights('weights_DQN_30000.h5') # Defining our DQN dqn = DQNAgent( model=loaded_model, nb_actions=len(env_player.action_space), policy=policy, memory=memory, nb_steps_warmup=1000, gamma=0.5, target_model_update=1, delta_clip=0.01, enable_double_dqn=True, ) dqn.compile(Adam(lr=0.00025), metrics=["mae"]) #model.load_weights('weights_DQN.h5') # Evaluation class EmbeddedRLPlayer(Player): def choose_move(self, battle): if np.random.rand() < 0.01: # avoids infinite loops return self.choose_random_move(battle) embedding = SimpleRLPlayer.embed_battle(self, battle) action = dqn.forward(embedding) return SimpleRLPlayer._action_to_move(self, action, battle) #player_configuration=PlayerConfiguration("USCPokebot", "uscpokebot"), emb_player = EmbeddedRLPlayer( player_configuration=PlayerConfiguration("CSCI527Bot", "CSCI527Bot"), server_configuration=ServerConfiguration( "sim.smogon.com:8000", "https://play.pokemonshowdown.com/action.php?"), ) await emb_player.ladder(50)
else:
    plot_class = None

vqae = None
if args.use_vqae:
    # Initialize VQAE
    vqae = Autoencoder(plot_class=plot_class)

# Initialize processor
processor = AtariProcessor(autoencoder=vqae, plot_class=plot_class)

if args.agent_type == 'DDQN':
    # Setup exploration policy
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr='eps',
                                  value_max=opt.eps_value_max,
                                  value_min=opt.eps_value_min,
                                  value_test=opt.eps_value_test,
                                  nb_steps=opt.eps_decay_steps)

    if opt.use_quantized_observations:
        agent = TabularQAgent(num_states=opt.state_vector_length,
                              num_actions=env.action_space.n,
                              policy=policy,
                              test_policy=policy,
                              processor=processor)
    else:
        # Setup DQN agent
        if opt.recurrent:
            model = DRQN_Model(window_length=opt.dqn_window_length,
                               num_actions=env.action_space.n)
        else:
            model = DQN_Model(window_length=opt.dqn_window_length,
def main(model_name, options):
    # Initialize environments.
    env = gym.make('Pong-v0')
    # env = gym.make('CartPole-v0')
    # env = gym.make('Taxi-v2')
    envs = [env]

    # Setting hyperparameters.
    nb_actions = env.action_space.n
    maze_dim = (6400, 1)
    h_size = 64        # For DQN
    e_t_size = 64      # For MQN / RMQN
    context_size = 64

    nb_steps_warmup = int(1e5)
    nb_steps = int(4e15)
    buffer_size = 8e4
    learning_rate = 0.003
    target_model_update = 0.999
    clipnorm = 10.
    switch_rate = 50
    window_length = 12
    memory_size = None

    # Callbacks
    log = TrainEpisodeLogger()
    # tensorboard = TensorBoard(log_dir="./logs/{}".format(model_name))
    rl_tensorboard = RLTensorBoard(log_dir="./logs/{}".format(model_name), histogram_freq=100)
    callbacks = [log, rl_tensorboard]

    ### Models ###
    model = None
    target_model = None

    # MQN model.
    if "MQN" in options:
        memory_size = 12
        model = MQNmodel(e_t_size, context_size, memory_size, window_length, nb_actions, maze_dim)
        target_model = MQNmodel(e_t_size, context_size, memory_size, window_length, nb_actions, maze_dim)

    # RMQN model.
    if "RMQN" in options:
        memory_size = 12
        model = RMQNmodel(e_t_size, context_size, memory_size, window_length, nb_actions, maze_dim)
        target_model = RMQNmodel(e_t_size, context_size, memory_size, window_length, nb_actions, maze_dim)

    # Distributional MQN model.
    nb_atoms = 51
    v_min = -2.
    v_max = 2.
    # model = DistributionalMQNModel(e_t_size, context_size, window_length, nb_actions, nb_atoms, obs_dimensions)
    # target_model = DistributionalMQNModel(e_t_size, context_size, window_length, nb_actions, nb_atoms, obs_dimensions)

    # DQN model
    if "DQN" in options:
        model = DQNmodel(nb_actions, window_length, h_size, maze_dim)
        target_model = DQNmodel(nb_actions, window_length, h_size, maze_dim)

    # Initialize our target model with the same weights as our model.
    target_model.set_weights(model.get_weights())

    # Initialize memory buffer for DQN algorithm.
    experience = [
        SequentialMemory(limit=int(buffer_size / len(envs)), window_length=window_length)
        for i in range(len(envs))
    ]

    # Learning policy: the agent initially acts at random with probability 1, and that
    # probability is linearly annealed down to 0.1 over nb_steps of the schedule.
    policy = LinearAnnealedPolicy(inner_policy=EpsGreedyQPolicy(),
                                  attr="eps",
                                  value_max=1.0,
                                  value_min=0.1,
                                  value_test=0.,
                                  nb_steps=1e5)

    # Optional processor.
    processor = PongProcessor()
    # processor = MazeProcessor()

    # Initialize and compile the DQN agent.
    dqn = DQNAgent(model=model,
                   target_model=target_model,
                   nb_actions=nb_actions,
                   memory=experience,
                   nb_steps_warmup=nb_steps_warmup,
                   target_model_update=target_model_update,
                   policy=policy,
                   processor=processor,
                   batch_size=8)

    # Initialize experimental Distributional DQN Agent
    '''
    dqn = DistributionalDQNAgent(
        model=model,
        target_model=target_model,
        num_atoms=nb_atoms,
        v_min=v_min,
        v_max=v_max,
        nb_actions=nb_actions,
        memory=experience,
        nb_steps_warmup=nb_steps_warmup,
        target_model_update=target_model_update,
        policy=policy,
        # processor=processor,
        batch_size=32
    )
    '''

    # Compile the agent to check for validity, build the tensorflow graph, etc.
    dqn.compile(RMSprop(lr=learning_rate, clipnorm=clipnorm), metrics=["mae"])

    # Weights will be loaded if the weight file exists.
    if os.path.exists("data/{}/{}".format(model_name, model_name + ".h5")):
        dqn.load_weights("data/{}/{}".format(model_name, model_name + ".h5"))

    # Train DQN in environment.
if "train" in options: dqn.fit(env, nb_steps=nb_steps, verbose=0, callbacks=callbacks) # Visualization / Logging Tools logmetrics(log, model_name) logHyperparameters(model_name, e_t_size=e_t_size, context_size=context_size, h_size=h_size, memory_size=memory_size, learning_rate=learning_rate, target_model_update=target_model_update, clipnorm=clipnorm, window_length=window_length, nb_atoms=nb_atoms, v_min=v_min, v_max=v_max) # Save weights. dqn.save_weights("data/{}/{}".format(model_name, model_name + ".h5")) # Test DQN in environment. if "test" in options: dqn.test(env, nb_episodes=100, visualize=True) #Debugging if "debug" in options: observation = env.reset() outputLayer(dqn.model, np.array(experience[0].sample(32)[0].state0)) #visualizeLayer(dqn.model, dqn.layers[1], observation) return
def __init__(self, *args, **kwargs):
    super(Tester, self).__init__(*args, **kwargs)
    self.agent_name = 'iqn'
    self.verbose = False

    if self.agent_name == 'iqn':
        self.nb_quantiles = 32
        self.model = NetworkMLPDistributional(nb_inputs=10,
                                              nb_outputs=4,
                                              nb_hidden_layers=2,
                                              nb_hidden_neurons=100,
                                              nb_quantiles=self.nb_quantiles,
                                              nb_cos_embeddings=64,
                                              duel=True,
                                              prior=False,
                                              activation='relu',
                                              duel_type='avg',
                                              window_length=1).model
        self.policy = LinearAnnealedPolicy(DistributionalEpsGreedyPolicy(eps=None),
                                           attr='eps',
                                           value_max=1.,
                                           value_min=0.1,
                                           value_test=.0,
                                           nb_steps=10000)
        self.test_policy = DistributionalEpsGreedyPolicy(eps=0)
        self.memory = SequentialMemory(limit=10000, window_length=1)
        self.agent = IQNAgent(model=self.model,
                              policy=self.policy,
                              test_policy=self.test_policy,
                              enable_double_dqn=True,
                              nb_samples_policy=self.nb_quantiles,
                              nb_sampled_quantiles=self.nb_quantiles,
                              cvar_eta=1,
                              nb_actions=4,
                              memory=self.memory,
                              gamma=0.99,
                              batch_size=48,
                              nb_steps_warmup=1000,
                              train_interval=1,
                              memory_interval=1,
                              target_model_update=1000,
                              delta_clip=1)
    elif self.agent_name == 'dqn':
        self.model = NetworkMLP(nb_inputs=10,
                                nb_outputs=4,
                                nb_hidden_layers=2,
                                nb_hidden_neurons=100,
                                duel=True,
                                prior=False,
                                activation='relu',
                                duel_type='avg',
                                window_length=1).model
        self.policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                           attr='eps',
                                           value_max=1.,
                                           value_min=0.1,
                                           value_test=.0,
                                           nb_steps=10000)
        self.test_policy = EpsGreedyQPolicy(eps=0)
        self.memory = SequentialMemory(limit=10000, window_length=1)
        self.agent = DQNAgent(model=self.model,
                              policy=self.policy,
                              test_policy=self.test_policy,
                              enable_double_dqn=True,
                              nb_actions=4,
                              memory=self.memory,
                              gamma=0.99,
                              batch_size=48,
                              nb_steps_warmup=1000,
                              train_interval=1,
                              memory_interval=1,
                              target_model_update=1000,
                              delta_clip=1)
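# Note on target_model_update, which these snippets use in two different regimes: in keras-rl a
# value >= 1 (e.g. the 1000 above) means a hard copy of the online weights into the target
# network every that many steps, while a value < 1 (e.g. the 1e-2 used by other agents here)
# means a soft (Polyak) update applied each step. A minimal sketch of the soft rule, written as
# plain Python for illustration only (not keras-rl internals; the function name is a placeholder):
def soft_update(target_weights, online_weights, tau=1e-2):
    """Blend each target weight toward the corresponding online weight by a factor tau."""
    return [(1.0 - tau) * t + tau * o for t, o in zip(target_weights, online_weights)]

# e.g. new_target = soft_update(target_model.get_weights(), model.get_weights(), tau=1e-2)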
model.add(Activation('relu'))
model.add(Flatten())
for _ in range(args.num_layers):
    model.add(Dense(args.num_units))
    model.add(Activation('relu'))
model.add(Dense(nb_actions * 2, bias_initializer=custom_initializer))  # mean and SD
model.add(Activation('linear'))
model.summary()

memory = SequentialMemory(limit=args.memory_size, window_length=WINDOW_LENGTH)
processor = AtariProcessor()

policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              attr='eps',
                              value_max=args.eps_max,
                              value_min=args.eps_min,
                              value_test=.05,
                              nb_steps=1000000)
test_policy = EpsGreedyQPolicy(eps=0.05)

if bool(args.double_dqn):
    print("DOUBLE DQN")
if bool(args.dueling):
    print("DUELING NETWORK")

adfq = ADFQAgent(model=model,
                 nb_actions=nb_actions,
                 policy=policy,
                 test_policy=test_policy,
                 memory=memory,
                 processor=processor,
n_action = len(env_player.action_space)

model = Sequential()
model.add(Dense(128, activation="elu", input_shape=(1, 10)))
model.add(Flatten())
model.add(Dense(64, activation="elu"))
model.add(Dense(n_action, activation="linear"))

memory = SequentialMemory(limit=10000, window_length=1)

policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr="eps",
    value_max=1.0,
    value_min=0.05,
    value_test=0,
    nb_steps=10000,
)

dqn = DQNAgent(
    model=model,
    nb_actions=len(env_player.action_space),
    policy=policy,
    memory=memory,
    nb_steps_warmup=1000,
    gamma=0.5,
    target_model_update=1,
    delta_clip=0.01,
    enable_double_dqn=True,
)
env.seed(123)

# Next, we build a very simple model.
model = NETWORK(obs_shape, nb_actions)
model.summary()

memory = SequentialMemory(
    limit=MEMORY_SIZE,
    window_length=1
)
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=EPS_MAX,
    value_min=EPS_MIN,
    value_test=EPS_TEST,
    nb_steps=EPS_DECAY_STEPS
)
dqn = DQNAgent(
    model=model,
    gamma=GAMMA,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=1000,
    target_model_update=TARGET_MODEL_UPDATE,
    policy=policy,
    test_policy=policy,
    enable_double_dqn=DOUBLE_DQN
)
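# The construction above stops short of compiling and training; the steps that would typically
# follow in keras-rl are sketched below. The optimizer, learning rate, and step/episode counts
# are placeholders rather than values taken from this script, and Adam is assumed to be imported
# as in the other snippets.
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, verbose=2)         # train with the annealed eps-greedy policy
dqn.test(env, nb_episodes=10, visualize=False)  # evaluate with the agent's test_policy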