def __init__(self, model, policy=None, test_policy=None, varTH=1e-5, *args, **kwargs):
    super(ADFQAgent, self).__init__(*args, **kwargs)

    # Validate (important) input.
    if hasattr(model.output, '__len__') and len(model.output) > 1:
        raise ValueError(
            'Model "{}" has more than one output. ADFQ expects a model that has a single output.'.format(model))
    if model.output._keras_shape != (None, 2 * self.nb_actions):
        raise ValueError(
            'Model output "{}" has invalid shape. ADFQ expects a model that has two dimensions '
            '(mean and variance) for each action, in this case {}.'.format(model.output, 2 * self.nb_actions))

    # Related objects.
    self.model = model
    if policy is None:
        policy = EpsGreedyQPolicy()
    if test_policy is None:
        test_policy = GreedyQPolicy()
    self.policy = policy
    self.test_policy = test_policy
    self.varTH = np.float32(varTH)

    # State.
    self.reset_states()
def make_dqn_rl_agent(processor: Processor_56x5, nbr_layers=2,
                      enable_dueling_network: bool = False, enable_double_dqn: bool = True):
    """Build and compile a DQN agent for Tichu.

    :param processor: Processor_56x5 that encodes observations and builds the network
    :param nbr_layers: number of hidden layers in the model
    :param enable_dueling_network: enable the dueling-network architecture
    :param enable_double_dqn: enable Double DQN target computation
    :return: a compiled DQNAgent
    """
    model = processor.create_model(nbr_layers=nbr_layers)
    test_policy = GreedyQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn_agent = DQNAgent(model=model, nb_actions=NBR_TICHU_ACTIONS, memory=memory,
                         nb_steps_warmup=100, target_model_update=1e-2,
                         test_policy=test_policy, processor=processor,
                         enable_dueling_network=enable_dueling_network,
                         enable_double_dqn=enable_double_dqn)
    dqn_agent.compile(Adam(lr=1e-3), metrics=['mae'])
    return dqn_agent
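# --- Usage sketch (not part of the original source): how the factory above might be
# driven, assuming a Gym-style Tichu environment. make_tichu_env() is a hypothetical
# helper, not defined in this corpus.
env = make_tichu_env()                       # hypothetical environment constructor
agent = make_dqn_rl_agent(Processor_56x5(), nbr_layers=2)
agent.fit(env, nb_steps=100000, verbose=1)   # trains with the default EpsGreedyQPolicy
agent.test(env, nb_episodes=10)              # evaluates greedily via the GreedyQPolicy above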
def __init__(self, model, nb_actions, policy=None, test_policy=None, gamma=.99,
             nb_steps_warmup=10, train_interval=1, delta_clip=np.inf, *args, **kwargs):
    super(SarsaAgent, self).__init__(*args, **kwargs)

    # Do not use defaults in the constructor because that would mean that each
    # instance shares the same policy.
    if policy is None:
        policy = EpsGreedyQPolicy()
    if test_policy is None:
        test_policy = GreedyQPolicy()

    self.model = model
    self.nb_actions = nb_actions
    self.policy = policy
    self.test_policy = test_policy
    self.gamma = gamma
    self.nb_steps_warmup = nb_steps_warmup
    self.train_interval = train_interval
    self.delta_clip = delta_clip
    self.compiled = False
    self.actions = None
    self.observations = None
    self.rewards = None
def build_agent(observation_space_shape, num_actions):
    # Experience replay
    WARMUP_STEPS = 1000   # Collect this many steps before starting experience replay
    MEM_LIMIT = 1000      # Max number of steps to store
    MEM_WINDOW_LEN = 1    # Experience of length 1 (single step)
    # Target network
    TARGET_MODEL_UPD_RATE = 1e-2  # Update the target network at this rate

    # Build network, experience replay and policy
    model = build_model(observation_space_shape, num_actions)
    replay_memory = SequentialMemory(limit=MEM_LIMIT, window_length=MEM_WINDOW_LEN)
    policy = GreedyQPolicy()

    # Finally build the agent
    GAMMA = 1
    # dqn = DQNAgent(model=model, gamma=GAMMA, nb_actions=num_actions, memory=replay_memory,
    #                nb_steps_warmup=WARMUP_STEPS, target_model_update=TARGET_MODEL_UPD_RATE,
    #                policy=policy)
    # dqn.compile(Adam(lr=1e-3), metrics=['mae'])
    reinforce = REINFORCE(model, replay_memory, GAMMA, batch_size=1, nb_steps_warmup=WARMUP_STEPS)
    reinforce.compile(optimizer='sgd', metrics=['mae'])
    return reinforce
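# --- Usage sketch (assumption-heavy): REINFORCE here is a project-specific agent,
# not part of keras-rl. Assuming it follows the rl.core.Agent interface and a
# Gym-style `env` is available:
agent = build_agent(env.observation_space.shape, env.action_space.n)
agent.fit(env, nb_steps=50000, verbose=1)
agent.test(env, nb_episodes=5)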
def __init__(self, model, policy=None, test_policy=None, enable_double_dqn=False,
             enable_dueling_network=False, dueling_type='avg', *args, **kwargs):
    super(ADFQAgent, self).__init__(*args, **kwargs)

    # Validate (important) input.
    if hasattr(model.output, '__len__') and len(model.output) > 1:
        raise ValueError(
            'Model "{}" has more than one output. ADFQ expects a model that has a single output.'.format(model))
    if model.output._keras_shape != (None, self.nb_actions * 2):
        raise ValueError(
            'Model output "{}" has invalid shape. ADFQ expects a model that has two dimensions '
            '(mean and variance) for each action, in this case {}.'.format(model.output, self.nb_actions * 2))
    print("ADFQ")

    # Parameters.
    self.enable_double_dqn = enable_double_dqn
    self.enable_dueling_network = enable_dueling_network
    self.dueling_type = dueling_type
    if self.enable_dueling_network:
        # This is not a dueling network for ADFQ yet, just a separate network
        # (it would take the second-to-last layer and abandon the last layer).
        raise NotImplementedError('Dueling networks are not supported by ADFQAgent.')

    # Related objects.
    self.model = model
    if policy is None:
        policy = EpsGreedyQPolicy()
    if test_policy is None:
        test_policy = GreedyQPolicy()
    self.policy = policy
    self.test_policy = test_policy

    # State.
    self.reset_states()
def parse_policy(args) -> Policy:
    pol: Policy = EpsGreedyQPolicy()
    if args.policy == 'LinearAnnealedPolicy':
        pol = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                                   value_test=0.05, nb_steps=args.zeta_nb_steps)
    elif args.policy == 'SoftmaxPolicy':
        pol = SoftmaxPolicy()
    elif args.policy == 'EpsGreedyQPolicy':
        pol = EpsGreedyQPolicy()
    elif args.policy == 'GreedyQPolicy':
        pol = GreedyQPolicy()
    elif args.policy == 'BoltzmannQPolicy':
        pol = BoltzmannQPolicy()
    elif args.policy == 'MaxBoltzmannQPolicy':
        pol = MaxBoltzmannQPolicy()
    elif args.policy == 'BoltzmannGumbelQPolicy':
        pol = BoltzmannGumbelQPolicy()
    elif args.policy == 'ZetaPolicy':
        pol = ZetaPolicy(zeta_nb_steps=args.zeta_nb_steps, eps=args.eps)
    return pol
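# --- A quick check of parse_policy with an argparse-style namespace (runnable once
# the policy classes above are imported; ZetaPolicy is project-specific):
from argparse import Namespace

args = Namespace(policy='LinearAnnealedPolicy', zeta_nb_steps=10000, eps=0.1)
pol = parse_policy(args)   # LinearAnnealedPolicy wrapping EpsGreedyQPolicy
args = Namespace(policy='GreedyQPolicy', zeta_nb_steps=0, eps=0.0)
pol = parse_policy(args)   # GreedyQPolicy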
def __init__(self, model, policy=None, test_policy=None, enable_double_dqn=False,
             enable_dueling_network=False, dueling_type='avg', *args, **kwargs):
    super(DQNAgent, self).__init__(*args, **kwargs)

    # Validate (important) input.
    if hasattr(model.output, '__len__') and len(model.output) > 1:
        raise ValueError(
            'Model "{}" has more than one output. DQN expects a model that has a single output.'.format(model))
    if model.output._keras_shape != (None, self.nb_actions):
        raise ValueError(
            'Model output "{}" has invalid shape. DQN expects a model that has one dimension '
            'for each action, in this case {}.'.format(model.output, self.nb_actions))

    # Parameters.
    self.enable_double_dqn = enable_double_dqn
    self.enable_dueling_network = enable_dueling_network
    self.dueling_type = dueling_type
    if self.enable_dueling_network:
        # Get the second-to-last layer of the model and abandon the last layer.
        layer = model.layers[-2]
        nb_action = model.output._keras_shape[-1]
        # Layer y has shape (nb_action + 1,):
        #   y[:, 0]  represents V(s; theta)
        #   y[:, 1:] represents A(s, a; theta)
        y = Dense(nb_action + 1, activation='linear')(layer.output)
        # Calculate Q(s, a; theta):
        #   dueling_type == 'avg':   Q(s,a;theta) = V(s;theta) + (A(s,a;theta) - Avg_a(A(s,a;theta)))
        #   dueling_type == 'max':   Q(s,a;theta) = V(s;theta) + (A(s,a;theta) - max_a(A(s,a;theta)))
        #   dueling_type == 'naive': Q(s,a;theta) = V(s;theta) + A(s,a;theta)
        if self.dueling_type == 'avg':
            outputlayer = Lambda(
                lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.mean(a[:, 1:], axis=1, keepdims=True),
                output_shape=(nb_action,))(y)
        elif self.dueling_type == 'max':
            outputlayer = Lambda(
                lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.max(a[:, 1:], axis=1, keepdims=True),
                output_shape=(nb_action,))(y)
        elif self.dueling_type == 'naive':
            outputlayer = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:],
                                 output_shape=(nb_action,))(y)
        else:
            assert False, "dueling_type must be one of {'avg','max','naive'}"
        model = Model(inputs=model.input, outputs=outputlayer)

    # Related objects.
    self.model = model
    if policy is None:
        policy = EpsGreedyQPolicy()
    if test_policy is None:
        test_policy = GreedyQPolicy()
    self.policy = policy
    self.test_policy = test_policy

    # State.
    self.reset_states()
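# --- The 'avg' dueling aggregation above is easy to sanity-check numerically; a
# small NumPy illustration of Q(s,a) = V(s) + (A(s,a) - mean_a A(s,a)):
import numpy as np

y = np.array([[2.0, 1.0, 3.0, 5.0]])   # one state: [V, A_0, A_1, A_2]
v, adv = y[:, :1], y[:, 1:]
q = v + adv - adv.mean(axis=1, keepdims=True)
print(q)   # [[0. 2. 4.]] -- advantages are re-centred around V(s); the argmax is unchanged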
def make_sarsa_rl_agent(processor: Processor_56x5, nbr_layers=2):
    model = processor.create_model(nbr_layers=nbr_layers)
    test_policy = GreedyQPolicy()
    sarsa_agent = SarsaAgent(model=model, nb_actions=NBR_TICHU_ACTIONS, nb_steps_warmup=10,
                             gamma=0.99, test_policy=test_policy, processor=processor)
    sarsa_agent.compile(Adam(lr=1e-3), metrics=['mae'])
    return sarsa_agent
def __init__(self, model, policy=None, test_policy=None, enable_double_dqn=False,
             enable_dueling_network=False, dueling_type='avg', *args, **kwargs):
    super(DQNAgent, self).__init__(*args, **kwargs)

    if model.output._keras_shape != (None, self.nb_actions):
        raise ValueError(
            f'Model output "{model.output}" has invalid shape. DQN expects '
            f'a model that has one dimension for each action, in this case {self.nb_actions}.')

    self.enable_double_dqn = enable_double_dqn
    self.enable_dueling_network = enable_dueling_network
    self.dueling_type = dueling_type
    if self.enable_dueling_network:
        layer = model.layers[-2]
        nb_action = model.output._keras_shape[-1]
        y = Dense(nb_action + 1, activation='linear')(layer.output)
        # Aggregate over the action axis (axis=1); without it, K.mean/K.max
        # would reduce over the batch axis as well.
        if self.dueling_type == 'avg':
            outputlayer = Lambda(
                lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.mean(a[:, 1:], axis=1, keepdims=True),
                output_shape=(nb_action,))(y)
        elif self.dueling_type == 'max':
            outputlayer = Lambda(
                lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.max(a[:, 1:], axis=1, keepdims=True),
                output_shape=(nb_action,))(y)
        elif self.dueling_type == 'naive':
            outputlayer = Lambda(
                lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:],
                output_shape=(nb_action,))(y)
        else:
            assert False, "dueling_type must be one of {'avg','max','naive'}"
        model = Model(inputs=model.input, outputs=outputlayer)

    self.model = model
    if policy is None:
        policy = EpsGreedyQPolicy()
    if test_policy is None:
        test_policy = GreedyQPolicy()
    self.policy = policy
    self.test_policy = test_policy
    self.reset_states()
def compile_agent(self):
    # Finally, we configure and compile our agent. You can use every built-in
    # Keras optimizer and even the metrics!
    processor = DistopiaProcessor(self.num_blocks, self.num_actions)
    # memory = SequentialMemory(limit=50000, window_length=1)
    # policy = PatchedBoltzmannQPolicy(num_actions=self.num_actions, num_blocks=self.num_blocks)
    # test_policy = PatchedGreedyQPolicy(num_actions=self.num_actions, num_blocks=self.num_blocks)
    policy = BoltzmannQPolicy()
    test_policy = GreedyQPolicy()
    self.sarsa = SARSAAgent(model=self.model, processor=processor, nb_actions=self.nb_actions,
                            nb_steps_warmup=1000, policy=policy, test_policy=test_policy,
                            gamma=0.9)
    self.sarsa.compile(Adam(lr=1e-3), metrics=['mae'])
# model = networks.lstm_network(window_length, input_shape[0], nb_actions)
####################################################################
memory = SequentialMemory(limit=memory_limit, window_length=window_length)
####################################################################
policy = EpsGreedyQPolicy(eps=eps)
policy = LinearAnnealedPolicy(policy, attr='eps', value_max=eps, value_min=0,
                              value_test=0, nb_steps=nb_steps)
test_policy = GreedyQPolicy()
####################################################################
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
               nb_steps_warmup=window_length + batch_size, target_model_update=0.02,
               policy=policy, test_policy=test_policy, batch_size=batch_size,
               train_interval=train_interval, gamma=gamma)
dqn.compile(Adam(lr=0.00025), metrics=['mae'])
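# --- LinearAnnealedPolicy decays eps linearly from value_max to value_min over
# nb_steps and then holds it there. The schedule is simple enough to verify
# standalone (illustrative values; the snippet above anneals down to 0):
eps_max, eps_min, total_steps = 1.0, 0.0, 10000
for step in (0, 2500, 5000, 10000, 20000):
    current_eps = max(eps_min, eps_max - (eps_max - eps_min) * step / total_steps)
    print(step, current_eps)   # 1.0, 0.75, 0.5, 0.0, 0.0 -- clamped at value_min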
def startDummy(env, Comm, tryHard=False):
    nb_actions = env.action_space.n
    layer0Size = 4096
    layer1Size = 4096
    layer2Size = 4096
    layer3Size = 0
    layer4Size = 0
    layer5Size = 1

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(layer0Size))
    model.add(LeakyReLU(alpha=0.003))
    model.add(Dense(layer1Size))
    model.add(LeakyReLU(alpha=0.003))
    model.add(Dense(layer2Size))
    model.add(LeakyReLU(alpha=0.003))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    # A little diagnosis of the model summary.
    print(model.summary())

    # Finally, we configure and compile our agent. You can use every built-in
    # Keras optimizer and even the metrics!
    memory = SequentialMemory(limit=800000, window_length=1)
    policy = GreedyQPolicy()
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, policy=policy,
                   enable_dueling_network=True)
    dqn.compile(nadam(lr=0.001), metrics=['mae'])

    # Load weights from a previous training run (the filename, including its
    # original spelling, must match what training saved).
    previousfileLength = 0
    load_file_number = 39
    loadFile = "Larger_Memeory_BOARDSIZE_" + str(max_board_size) + "_DQN_LAYERS_" + \
        str(layer0Size) + "_" + str(layer1Size) + "_" + str(layer2Size) + "_" + \
        str(layer3Size) + "_" + str(layer4Size) + "_" + str(layer5Size) + \
        "_SAVENUMBER_" + str(load_file_number) + ".h5f"
    dqn.load_weights(loadFile)

    while True:
        data = None
        while data is None:
            data = Comm.getNewData()
        observation, notUsed, currSafeMoves, headButtSafeMoves, noStuckMoves, foodMoves = \
            env.findObservation(data=data)
        action = dqn.forward(observation)
        if action == 0:
            moveChosen = 'left'
        if action == 1:
            moveChosen = 'right'
        if action == 2:
            moveChosen = 'up'
        if action == 3:
            moveChosen = 'down'
        if moveChosen not in currSafeMoves and len(currSafeMoves) > 0:
            moveChosen = choice(currSafeMoves)
        if moveChosen not in noStuckMoves and len(noStuckMoves) > 0:
            moveChosen = choice(noStuckMoves)
        if moveChosen not in headButtSafeMoves and len(headButtSafeMoves) > 0:
            moveChosen = choice(headButtSafeMoves)
        if moveChosen not in foodMoves and len(foodMoves) > 0:
            moveChosen = choice(foodMoves)
        Comm.giveNewMove(moveChosen)
def _build_dqn_agent(self, params):
    NB_ACTIONS = 7
    # ----------------------------------------------------------------------
    inputShape = (params['width'], params['height'], 3)
    model = Sequential()
    model.add(Conv2D(16, (3, 3), input_shape=inputShape, padding='same', activation='relu'))
    model.add(Conv2D(32, (3, 3), padding='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
    model.add(NoisyNetDense(16, activation='linear'))
    model.add(Flatten())
    model.add(NoisyNetDense(NB_ACTIONS, activation='linear'))
    model.summary()
    # ----------------------------------------------------------------------
    # Memory replay
    if not params['prio_memory']:
        print("Using Sequential memory")
        memory = SequentialMemory(limit=params['mem_size'], window_length=1)
    else:
        print("Using Prioritized memory")
        params['lr'] = params['lr'] / 4
        memory = PrioritizedMemory(limit=params['mem_size'], alpha=0.6, start_beta=0.5,
                                   end_beta=1.0, steps_annealed=params['annealing'],
                                   window_length=1)
    # Epsilon-greedy policy, linearly decreasing...
    if not params['noisy_layer']:
        print("Using Annealed Eps Greedy policy")
        self.policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
                                           value_max=params['eps'],
                                           value_min=params['eps_final'],
                                           value_test=0.0,
                                           nb_steps=params['annealing'])
    # ...or a greedy policy in case of noisy layers.
    else:
        print("Using Q Greedy policy (with noisy layer)")
        self.policy = GreedyQPolicy()
    # Keras DQN agent
    self._dqn = DQNAgent(model=model,
                         nb_actions=NB_ACTIONS,
                         policy=self.policy,
                         memory=memory,
                         batch_size=params['batch_size'],
                         processor=WindowProcessor(),
                         enable_double_dqn=True,
                         enable_dueling_network=True,
                         nb_steps_warmup=params['train_start'],
                         gamma=params['discount'],
                         target_model_update=1000,
                         train_interval=1,
                         delta_clip=1.,
                         custom_model_objects={"NoisyNetDense": NoisyNetDense})
    self._dqn.compile(Adam(lr=params['lr']), metrics=['mae'])
    if params['load_file']:
        print("file loaded")
        self._dqn.load_weights(params['load_file'])
def __init__(self, model, turn_left_agent, go_straight_agent, turn_right_agent,
             policy=None, test_policy=None, enable_double_dqn=False,
             enable_dueling_network=False, dueling_type='avg', *args, **kwargs):
    super(DQNAgent4Hrl, self).__init__(*args, **kwargs)

    # Parameters.
    self.enable_double_dqn = enable_double_dqn
    self.enable_dueling_network = enable_dueling_network
    self.dueling_type = dueling_type
    if self.enable_dueling_network:
        # Get the second-to-last layer of the model and abandon the last layer.
        layer = model.layers[-2]
        nb_action = model.output._keras_shape[-1]
        # Layer y has shape (nb_action + 1,):
        #   y[:, 0]  represents V(s; theta)
        #   y[:, 1:] represents A(s, a; theta)
        y = layers.Dense(nb_action + 1, activation='linear')(layer.output)
        # Calculate Q(s, a; theta):
        #   dueling_type == 'avg':   Q(s,a;theta) = V(s;theta) + (A(s,a;theta) - Avg_a(A(s,a;theta)))
        #   dueling_type == 'max':   Q(s,a;theta) = V(s;theta) + (A(s,a;theta) - max_a(A(s,a;theta)))
        #   dueling_type == 'naive': Q(s,a;theta) = V(s;theta) + A(s,a;theta)
        if self.dueling_type == 'avg':
            outputlayer = Lambda(
                lambda a: tf.expand_dims(a[:, 0], -1) + a[:, 1:] - tf.reduce_mean(a[:, 1:], axis=1, keepdims=True),
                output_shape=(nb_action,))(y)
        elif self.dueling_type == 'max':
            outputlayer = Lambda(
                lambda a: tf.expand_dims(a[:, 0], -1) + a[:, 1:] - tf.reduce_max(a[:, 1:], axis=1, keepdims=True),
                output_shape=(nb_action,))(y)
        elif self.dueling_type == 'naive':
            outputlayer = Lambda(
                lambda a: tf.expand_dims(a[:, 0], -1) + a[:, 1:],
                output_shape=(nb_action,))(y)
        else:
            assert False, "dueling_type must be one of {'avg','max','naive'}"
        model = Model(inputs=model.input, outputs=outputlayer)

    # Related objects.
    self.model = model
    if policy is None:
        policy = EpsGreedyQPolicy()
    if test_policy is None:
        test_policy = GreedyQPolicy()
    self.policy = policy
    self.test_policy = test_policy
    self.turn_left_agent = turn_left_agent
    self.go_straight_agent = go_straight_agent
    self.turn_right_agent = turn_right_agent

    # State.
    self.reset_states()
# -------------------------------------------------------------------------------------------
memory_file = os.path.join(variable_configs_folder, "memory.p")
memory = pickle.load(open(memory_file, "rb"))
model = build_convolutional_nn(all_configs["c_layers"], all_configs["ff_layers"],
                               env.observation_space.shape, env.num_actions)
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(masked_greedy=all_configs["masked_greedy"]),
    attr='eps',
    value_max=all_configs["max_eps"],
    value_min=all_configs["final_eps"],
    value_test=0.0,
    nb_steps=all_configs["exploration_fraction"])
test_policy = GreedyQPolicy(masked_greedy=True)
# -------------------------------------------------------------------------------------------
dqn = DQNAgent(model=model,
               nb_actions=env.num_actions,
               memory=memory,
               nb_steps_warmup=all_configs["learning_starts"],
               target_model_update=all_configs["target_network_update_freq"],
               policy=policy,
               test_policy=test_policy,
               gamma=all_configs["gamma"],
               enable_dueling_network=all_configs["dueling"])
dqn.compile(Adam(lr=all_configs["learning_rate"]))
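# --- Hedged continuation sketch: train the compiled agent and persist the run.
# `env` comes from the surrounding script; "total_timesteps" is an assumed config
# key, not one of the keys shown above.
dqn.fit(env, nb_steps=all_configs["total_timesteps"], verbose=2)
dqn.save_weights(os.path.join(variable_configs_folder, "weights.h5f"), overwrite=True)
pickle.dump(memory, open(memory_file, "wb"))   # keep the replay memory for the next run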
x = Dense(30)(x)
x = Activation('tanh')(x)
x = Dense(20)(x)
x = Activation('tanh')(x)
x = Dense(nb_actions)(x)
x = Activation('linear')(x)
criticModel = Model(inputs=[action_input, observation_input], outputs=x)
print(criticModel.summary())

# Set up policy and memory.
memory = SequentialMemory(limit=50000, window_length=1)

# Set up the agent, using the Keras model defined above along with the policy and actions.
# Discrete actions:
policy = EpsGreedyQPolicy()
testPolicy = GreedyQPolicy()
# agent = DQNAgent(model=actorModel, nb_actions=nb_actions, memory=memory,
#                  nb_steps_warmup=10, policy=policy, test_policy=testPolicy)
# Continuous actions:
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(actor=actorModel, critic=criticModel, nb_actions=nb_actions, memory=memory,
                  nb_steps_warmup_actor=100, nb_steps_warmup_critic=100,
                  critic_action_input=action_input, random_process=random_process)
class BetaFlapDQN(DQNAgent):
    def __init__(self, inputs, buffer, sess_id, sess, **kwargs):
        self.util = Utility()
        self.sess = sess
        self.sess_id = sess_id
        game = inputs['game']
        agnt = inputs['agent']
        sess = agnt['session']
        eps = sess['episode']
        mod = inputs['model']
        trn = mod['training']
        sv = mod['save']
        mem = inputs['memory']

        '''---Environment Parameters---'''
        self.env_name = game['name']
        self.fps = game['fps']
        self.mode = game['difficulty']
        self.target = game['target']
        self.tick = game['tick']

        '''---Episode Parameters---'''
        self.nb_episodes = sess['max_ep']
        self.nb_max_episode_steps = game['fps'] * 60 * eps['max_time']
        self.nb_steps = self.nb_max_episode_steps * self.nb_episodes
        self.nb_steps_warmup = trn['warmup']
        self.nb_max_start_steps = trn['max_ep_observe']
        self.max_start_steps = trn['warmup']
        self.keep_gif_score = eps['keep_gif_score']

        '''---Agent / Model Parameters---'''
        self.name = agnt['name']
        self.nb_actions = agnt['action_size']
        self.delta_clip = agnt['delta_clip']
        self.training = trn['training']
        self.verbose = trn['verbose']
        self.lr = trn['learn_rate']
        self.eps = trn['initial_epsilon']
        self.value_max = trn['initial_epsilon']
        self.value_min = trn['terminal_epsilon']
        self.anneal = trn['anneal']
        self.shuffle = trn['shuffle']
        self.train_interval = trn['interval']
        self.validate = trn['validate']
        self.split = trn['split']
        self.action_repetition = trn['action_repetition']
        self.epochs = trn['epochs']
        self.epoch = 1
        prec = km.binary_precision()
        re = km.binary_recall()
        f1 = km.binary_f1_score()
        self.metrics = ['accuracy', 'mse', prec, re, f1]
        self.H = mod['filter_size']
        self.alpha = mod['alpha']
        self.gamma = mod['gamma']
        self.momentum = mod['momentum']
        self.decay = mod['decay']
        self.target_model_update = mod['target_update']
        self.type = mod['type']
        self.enable_double_dqn = mod['double_dqn']
        self.enable_dueling_network = mod['dueling_network']
        self.dueling_type = mod['dueling_type']
        self.limit = mem['limit']
        self.batch_size = mem['batch_size']
        self.window_length = mem['state_size']
        self.memory_interval = mem['interval']
        self.ftype = sv['ftype']
        self.vizualize = sv['visualize']
        self.save_full = sv['save_full']
        self.save_weights = sv['save_weights']
        self.save_json = sv['save_json']
        self.save_plot = sv['save_plot']
        self.save_interval = sv['save_n']
        self.log_interval = sv['log_n']
        self.saves = sv['save_path']
        self.save_path = self.util.get_save_dir_struct(self.saves, self.env_name)
        self.logs = sv['log_path']
        self.util.display_status('Hyperparameters Successfully Loaded')

        '''Reference/Excerpt: keras-rl DQN Atari Example
        https://github.com/keras-rl/keras-rl/blob/master/examples/dqn_atari.py
        Select a policy. We use eps-greedy action selection, which means that a
        random action is selected with probability eps. We anneal eps from init
        to term over the course of (anneal) steps. This is done so that the agent
        initially explores the environment (high eps) and then gradually sticks
        to what it knows (low eps). We also set a dedicated eps value that is
        used during testing. Note that we set it to 0.05 so that the agent still
        performs some random actions. This ensures that the agent cannot get stuck.
        '''
        self.custom_model_objects = {
            'S': self.window_length,
            'A': self.nb_actions,
            'H': self.H,
            'lr': self.lr,
            'name': self.name,
            'batch_size': self.batch_size,
            'sess': self.sess,
            # dueling_network=self.enable_dueling_network,
            # dueling_type=self.dueling_type,
        }
        with tf.device(gpu):
            self.policy = LinearAnnealedPolicy(
                inner_policy=EpsGreedyQPolicy(eps=self.value_max),
                attr='eps',
                value_max=self.value_max,
                value_min=self.value_min,
                value_test=self.alpha,
                nb_steps=self.anneal)
            self.test_policy = GreedyQPolicy()
        if mod['optimizer'].lower() == 'adamax':
            self.optimizer = Adamax(lr=self.lr)
        elif mod['optimizer'].lower() == 'adadelta':
            self.optimizer = Adadelta()
        elif mod['optimizer'].lower() == 'rmsprop':
            self.optimizer = RMSprop()
        elif mod['optimizer'].lower() == 'sgd':
            self.optimizer = SGD(lr=self.lr, momentum=self.momentum, decay=self.decay)
        else:
            self.optimizer = Adam(lr=self.lr)
        self.memory = buffer
        self.log_path = self.util.get_log_dir_struct(self.sess_id, self.logs, self.ftype)
        self.util.display_status('Keras GPU Session {} Beginning'.format(self.sess_id))
        nn = NeuralNet(
            S=self.window_length,
            A=self.nb_actions,
            H=self.H,
            lr=self.lr,
            name=self.name,
            batch_size=self.batch_size,
            dueling_network=self.enable_dueling_network,
            dueling_type=self.dueling_type,
            sess=self.sess,
        )
        with tf.device(gpu):
            self.model = nn.get_model()
        self.util.display_status('{} Keras Agent with {} Optimizer Built'.format(
            self.name, mod['optimizer']))

        '''---Compile the model with the chosen optimizer; the loss is calculated
        with a lambda function based on model type selections (dueling or double DQN)---'''
        with tf.device(gpu):
            self.compile(optimizer=self.optimizer, metrics=self.metrics)
        self.util.display_status('{} Agent Fully Initialized with Compiled Model'.format(self.name))
        super(BetaFlapDQN, self).__init__(
            model=self.model,
            nb_actions=self.nb_actions,
            memory=self.memory,
            policy=self.policy,
            test_policy=self.test_policy,
            enable_double_dqn=self.enable_double_dqn,
            enable_dueling_network=self.enable_dueling_network,
            dueling_type=self.dueling_type,
            **kwargs)

    def load_saved_model_weights(self):
        try:
            self.model.load_weights('saved/FlappyBird_weights.h5')
            self.util.display_status('Saved Keras Model Weights Loaded')
        except Exception:
            self.util.display_status('No Saved Keras Model Weights Found')

    def fit(self, iteration=1, max_iteration=1):
        self.load_saved_model_weights()
        with tf.device(gpu):
            self.env = Environment(
                target_score=self.target,
                difficulty=self.mode,
                fps=self.fps,
                tick=self.tick,
            )
        self.util.display_status('{} Environment Emulation Initialized'.format(self.env_name))
        if self.action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(self.action_repetition))

        '''---Define Custom Callbacks and Processors for BetaFlap---'''
        FlappyCall = FlappySession()
        Flappy = FlappyProcessor()

        '''---Flag Agent as Training with on_train_begin()---'''
        self._on_train_begin()
        FlappyCall.on_train_begin()
        self.training = True
        observation = None
        reward = None
        done = False
        info = None
        status = 'play'
        episode = np.int16(0)
        self.step = np.int16(0)
        action = np.int16(0)
        self.randQ = np.int16(0)
        self.reward = np.float16(0)
        idx = np.int16(0)
        flap = False
        episode_reward = None
        episode_score = None
        episode_step = None
        did_abort = False

        '''---Begin stepping through Episodes---'''
        # Continue while the global step is < max session steps.
        while self.step < self.nb_steps:
            gc.collect()
            if observation is None:  # new episode
                '''---Initialize Environment with No Action---'''
                FlappyCall.on_episode_begin(episode)
                self.reset_states()  # reset all episode-tracking parameters
                reward = None
                done = False
                info = {}
                action = None
                episode_step = np.int16(0)
                episode_score = np.int16(0)
                episode_reward = np.float32(0)
                wake = np.zeros([self.nb_actions])  # [0, 0]
                wake[0] = 1                         # [1, 0] --> don't flap
                o, r, done, info = self.env.step(wake)  # progress env 1 frame
                observation, r = Flappy.process_step(o, r, done, info)
                assert observation is not None

                '''---Each episode, begin with n random actions/steps---'''
                if self.nb_max_start_steps == 0:
                    self.nb_random_start_steps = 0
                else:
                    self.nb_random_start_steps = np.random.randint(self.nb_max_start_steps)

                '''---Perform nb random steps with random actions, without adding
                them to experience replay memory---'''
                for _ in range(self.nb_random_start_steps):
                    action = np.zeros([self.nb_actions])
                    randQ = rand.randrange(self.nb_actions)
                    action[randQ] = 1  # flag selected action
                    o, r, done, info = self.env.step(action)  # progress env 1 frame
                    episode_step += 1
                    '''---Process output of randomized actions without updating
                    cumulative episode totals---'''
                    observation = deepcopy(o)
                    observation, r = Flappy.process_step(observation, r, done, info)
                    if info['status'] == 'exit':
                        done = True
                        did_abort = True
                    if done:
                        break  # warmup period complete

            assert episode_reward is not None
            assert episode_step is not None
            assert observation is not None
            gc.collect()

            '''---Begin Iteratively Training Model Each Step
            * predict Q values / action (forward step)
            * use reward to improve the model (backward step)
            '''
            FlappyCall.on_step_begin(episode_step)

            '''---Predict Q Values Using Forward Method---'''
            with tf.device(gpu):
                idx = self.forward(observation)
            action, flap = Flappy.process_action(idx, self.nb_actions)
            reward = np.float32(0)
            done = False
            for _ in range(self.action_repetition):
                o, r, d, i = self.env.step(action)
                observation = deepcopy(o)
                observation, r = Flappy.process_step(o, r, d, i)
                reward += r
                done = d
                info = i
                status = info['status']
                episode_step += 1
                if info['status'] == 'exit':
                    done = True
                    did_abort = True
                if done:
                    break  # game over, end episode

            '''---Train the Model using Backward Method
            This function covers the bulk of the algorithm logic:
            * store experience in memory
            * create experience batch, and predict Qs
            * train model on a single batch with the selected optimizer
            * enable/disable double DQN or dueling network
            * update model target values
            * discount future reward and return model metrics
            '''
            with tf.device(gpu):
                metrics = self.backward(reward, terminal=done)
            episode_reward += reward
            self.reward = episode_reward
            episode_score = info['score']

            '''---Log Step Data---'''
            step_log = {
                'step': episode_step,  # track episode step nb
                'episode': episode,
                'metrics': metrics,
                'flap': flap,
                'action': action,
                'reward': reward,
                'done': done,
                'training': self.training,
                'q_values': self.q_values,
                'info': info,
                'x': o,
                'x_t': observation,
            }
            FlappyCall.on_step_end(episode_step, step_log)
            gc.collect()
            self.step += 1
            if (self.step % self.save_interval) == 0 or status == 'save':
                self.save_model()
            if status == 'exit':
                done = True
                did_abort = True
            if self.nb_max_episode_steps and episode_step >= self.nb_max_episode_steps - 1:
                done = True  # max episode steps hit

            # We are in a terminal state but the agent hasn't yet seen it.
            # Perform one more forward-backward call and ignore the action.
            if done:
                with tf.device(gpu):
                    self.forward(observation)
                    self.backward(0., terminal=False)
                episode_log = {
                    'sess_id': self.sess_id,
                    'episode': episode,
                    'reward': episode_reward,
                    'score': episode_score,
                    'steps': episode_step,  # track global step nb
                    'gif': self.keep_gif_score,
                    'log_path': self.logs,
                    'iteration': iteration,
                }
                '''---Episode Complete, Proceed to Next Iteration---'''
                FlappyCall.on_episode_end(episode, episode_log)
                episode += 1
                observation = None
                episode_step = None
                episode_reward = None
                episode_score = None
                gc.collect()
                if episode > self.nb_episodes or did_abort:
                    done = True  # max episode hit
                    break

        '''---Training Session Complete---'''
        self.save_model()
        session_log = {
            'id': self.sess_id,
            'nb_steps': self.step,
            'did_abort': did_abort,
        }
        FlappyCall.on_train_end(session_log, self.sess_id, self.log_path)
        self._on_train_end()  # end training session
        if iteration >= max_iteration or did_abort:
            self.env.close()
            return True

    def forward(self, observation):
        # Select an action.
        state = self.memory.get_recent_state(observation)
        with tf.device(gpu):
            self.q_values = self.compute_q_values(state)
        if self.training:  # linearly annealed eps-greedy
            with tf.device(gpu):
                action = self.policy.select_action(q_values=self.q_values)
        else:  # GreedyQ
            with tf.device(gpu):
                action = self.test_policy.select_action(q_values=self.q_values)
        # Book-keeping for experience replay.
        self.recent_observation = observation
        self.recent_action = action
        return action

    def backward(self, reward, terminal):
        '''---Store latest step in experience replay tuple---'''
        if self.step % self.memory_interval == 0 or self.reward > .011:
            if self.reward > .011:
                self.util.display_status('Step {} Replay Experience Memory Saved'.format(self.step))
            with tf.device(cpu):
                self.memory.append(np.array(self.recent_observation),
                                   np.int16(self.recent_action),
                                   np.float32(reward),
                                   terminal,
                                   training=self.training)
        metrics = []
        if not self.training:
            return metrics

        '''---Begin Training on Batches of Stored Experiences---'''
        if self.step > self.nb_steps_warmup and self.step % self.train_interval == 0:
            with tf.device(gpu):
                batch = self.memory.sample(self.batch_size)
            assert len(batch) == self.batch_size
            state0_batch, reward_batch, action_batch, terminal1_batch, state1_batch = \
                FlappyProcessor.process_state_batch(self, batch)
            assert reward_batch.shape == (self.batch_size,)
            assert terminal1_batch.shape == reward_batch.shape
            assert len(action_batch) == len(reward_batch)

            '''---Compute the Q-Values for a Mini-Batch of Samples
            "Deep Reinforcement Learning with Double Q-learning"
            (van Hasselt et al., 2015). Double DQN:
            - online network predicts actions
            - target network estimates Q values
            '''
            if self.enable_double_dqn:
                with tf.device(gpu):
                    q_values = self.model.predict_on_batch(state1_batch)
                assert q_values.shape == (self.batch_size, self.nb_actions)
                actions = np.argmax(q_values, axis=1)
                assert actions.shape == (self.batch_size,)
                # Estimate Q values using the target network, but select the
                # maxQ action with the online model (computed above).
                with tf.device(gpu):
                    target_q_values = self.target_model.predict_on_batch(state1_batch)
                assert target_q_values.shape == (self.batch_size, self.nb_actions)
                q_batch = target_q_values[range(self.batch_size), actions]
            else:
                # Compute the q_values for state1 and the maxQ of each sample.
                # Prediction is done on target_model as outlined in Mnih (2015),
                # which makes the algorithm significantly more stable.
                with tf.device(gpu):
                    target_q_values = self.target_model.predict_on_batch(state1_batch)
                assert target_q_values.shape == (self.batch_size, self.nb_actions)
                q_batch = np.max(target_q_values, axis=1).flatten()
            assert q_batch.shape == (self.batch_size,)

            targets = np.zeros((self.batch_size, self.nb_actions))
            dummy_targets = np.zeros((self.batch_size,))
            masks = np.zeros((self.batch_size, self.nb_actions))
            # Compute r_t + gamma * max_a Q(s_t+1, a) and update the affected
            # output targets accordingly. The discounted reward is zero for
            # all states that were terminal.
            discounted_reward_batch = self.gamma * q_batch
            discounted_reward_batch *= terminal1_batch
            assert discounted_reward_batch.shape == reward_batch.shape
            Rs = reward_batch + discounted_reward_batch
            for idx, (target, mask, R, action) in enumerate(zip(targets, masks, Rs, action_batch)):
                target[action] = R  # update with estimated accumulated reward
                dummy_targets[idx] = R
                mask[action] = 1.  # enable loss for this specific action
            targets = np.array(targets).astype('float32')
            masks = np.array(masks).astype('float32')

            '''---Train Using a Sample Experience Batch
            Perform a single update on the entire batch. We use a dummy target,
            as the loss is computed in a complex Lambda layer; it is still useful
            to know the target in order to compute metrics properly.
            '''
            if type(self.model.input) is not list:
                ins = [state0_batch]
            else:
                ins = state0_batch
            if self.validate:
                split = self.split
            else:
                split = 0
            with tf.device(gpu):
                metrics = self.trainable_model.train_on_batch(
                    ins + [targets, masks], [dummy_targets, targets])
                # THIS CAUSES A MEMORY LEAK IN CURRENT CONFIGURATION
                # metrics = self.trainable_model.fit(
                #     ins + [targets, masks],
                #     [dummy_targets, targets],
                #     batch_size=None,
                #     epochs=self.epochs,
                #     verbose=self.verbose,
                #     validation_split=split,
                #     shuffle=self.shuffle
                # )
            gc.collect()
            # Throw away individual losses.
            if type(metrics) is list:
                metrics = [m for idx, m in enumerate(metrics) if idx not in (1, 2)]
            else:
                metrics.history.update({'losses': self.policy.metrics})
        if self.target_model_update >= 1 and self.step % self.target_model_update == 0:
            with tf.device(gpu):
                self.update_target_model_hard()
        return metrics

    def save_model(self):
        if self.save_full:
            '''---Save full model to a single .h5 file---'''
            self.model.save(self.save_path + '_full.h5', overwrite=True)
            self.util.display_status('{} Model Saved to {}'.format(
                self.name, self.save_path + '_full.h5'))
        if self.save_weights:
            '''---Save model weights to a separate .h5 file---'''
            self.model.save_weights(self.save_path + '_weights.h5', overwrite=True)
            self.util.display_status('{} Model Weights Saved to {}'.format(
                self.name, self.save_path + '_weights.h5'))
        if self.save_json:
            '''---Save model structure as a JSON file---'''
            with open(self.save_path + '.json', 'a+') as f:
                f.write(self.model.to_json())  # to_json() already returns a JSON string
            self.util.display_status('{} Model Structure Saved to {}'.format(
                self.name, self.save_path + '.json'))
        if self.save_plot:
            plot_model(self.model, to_file=self.save_path + '_flow.png')
            self.util.display_status('{} Neural Network Diagram Saved to {}'.format(
                self.name, self.save_path + '_flow.png'))
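# --- Minimal instantiation sketch for BetaFlapDQN (not from the original source):
# `inputs`, `buffer` and `tf_session` are placeholders that must match the
# structure consumed by __init__ above.
agent = BetaFlapDQN(inputs, buffer, sess_id='run-001', sess=tf_session)
agent.fit(iteration=1, max_iteration=1)   # trains until nb_steps is reached or the run aborts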
def __init__(self, model=None, policy=None):
    """Initialisation of the groomer."""
    # Avoid a shared mutable default: a GreedyQPolicy() default argument would
    # be the same instance across every groomer.
    self.model = model
    self.policy = policy if policy is not None else GreedyQPolicy()
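# --- GreedyQPolicy itself is tiny: select_action just takes the argmax of the
# Q-values it is handed, which is why it serves as the default test policy
# throughout these agents. A runnable check:
import numpy as np
from rl.policy import GreedyQPolicy

policy = GreedyQPolicy()
print(policy.select_action(q_values=np.array([0.1, 0.7, 0.2])))   # -> 1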
def _build(self, model_fn, nb_states, memory=None, policy=None, test_policy=None,
           enable_double_dqn=True, enable_dueling_network=False, dueling_type='avg',
           gamma=0.99, inputs_batch_size=None, nb_steps_warmup=1000, train_interval=1,
           memory_interval=1, target_update=10000, delta_range=None, delta_clip=np.inf,
           scope="dqn", model_scope="model", target_model_scope="target_model",
           optimizer=tf.train.AdamOptimizer, inputs=None, inputs_class=DQNInputs,
           memory_max_length=100000, learning_rate=0.001, eps=0.1):
    self.memory = memory if memory else ExperienceReplay(4, max_length=memory_max_length)
    self.gamma = gamma
    self.nb_steps_warmup = nb_steps_warmup
    self.train_interval = train_interval
    self.memory_interval = memory_interval
    self.target_update = target_update
    self.delta_range = delta_range
    self.delta_clip = delta_clip
    self.inputs = inputs_class(nb_states, inputs_batch_size) if inputs is None else inputs
    self.global_step_update = self.global_step.assign_add(1)
    self.policy = policy(self) if policy else EpsGreedyQPolicy(self, eps=eps)
    self.test_policy = test_policy(self) if test_policy else GreedyQPolicy()

    with tf.variable_scope(model_scope):
        self.model = model_fn(self.inputs)
    self.model_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=model_scope)

    with tf.variable_scope(target_model_scope):
        self.target_model = model_fn(self.inputs)
    self.target_model_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                    scope=target_model_scope)

    with tf.variable_scope(scope):
        # Bellman target: r for terminal transitions, else r + gamma * max_a Q_target(s', a).
        self.target_model_target = tf.where(
            self.inputs.done, self.inputs.r,
            self.inputs.r + self.gamma * tf.reduce_max(self.target_model.Qs, axis=1))
        self.model_Qsa = select_columns(self.model.Qs, self.inputs.a) \
            if not hasattr(self.model, "Qsa") else self.model.Qsa
        self.model_error = self.target_model_target - self.model_Qsa \
            if not hasattr(self.model, "error") else self.model.error
        self.model_loss = tf.reduce_mean(huber_loss(self.model_error)) \
            if not hasattr(self.model, "loss") else self.model.loss
        self.model_learning_rate = self.model.learning_rate \
            if hasattr(self.model, 'learning_rate') else learning_rate
        self.update = optimizer(self.model_learning_rate).minimize(
            self.model_loss, var_list=self.model_variables) \
            if not hasattr(self.model, "update") else self.model.update

        if self.target_update < 1:
            # Soft updates: blend the online weights (mv) into the target network (tv).
            self.update = tf.group(
                self.update,
                *[tv.assign_add(self.target_update * (mv - tv))
                  for mv, tv in zip(self.model_variables, self.target_model_variables)])
            self.update_target_hard = None
        else:
            # Hard updates: copy the online weights into the target network.
            self.update_target_hard = tf.group(*[
                tv.assign(mv)
                for mv, tv in zip(self.model_variables, self.target_model_variables)])
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam
from rl.policy import EpsGreedyQPolicy, GreedyQPolicy, BoltzmannQPolicy
from rl.agents.dqn import DQNAgent
from rl.memory import EpisodeParameterMemory, SequentialMemory

sys.path.append(".")
from patternmatching.gray.incremental.query_call import load_graph, parse_args
from patternmatching.gray.incremental.rl_model import GraphEnv

logging.basicConfig(level=logging.INFO)

policies = {
    "bqp": BoltzmannQPolicy(),         # Unstable
    "gqp": GreedyQPolicy(),
    "egqp": EpsGreedyQPolicy(eps=0.1)  # eps should be around 0.1
}

window_length = 5  # Should be less than 20 (too large a value keeps Q-values from converging)
memories = {
    "epm": EpisodeParameterMemory(limit=20, window_length=window_length),  # Non-episodic
    "sm": SequentialMemory(limit=20, window_length=window_length)          # should use this
}

argv = sys.argv
if len(argv) < 4:
    print("Usage: python %s [ConfFile] [Policy] [Memory]" % argv[0])
    exit(1)
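# --- Hedged sketch of the lookup implied by the usage message above,
# e.g. `python script.py conf.json egqp sm`:
policy = policies[argv[2]]    # e.g. EpsGreedyQPolicy(eps=0.1)
memory = memories[argv[3]]    # e.g. SequentialMemory(limit=20, window_length=5)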
import gym
import keras as K
from keras import layers
from keras.optimizers import Adam
import numpy as np
from PIL import Image
from rl.core import Processor
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import GreedyQPolicy

create_q_model = __import__('train').create_q_model
AtariProcessor = __import__('train').AtariProcessor

if __name__ == '__main__':
    """Load the trained policy and watch it play Breakout."""
    env = gym.make('BreakoutNoFrameskip-v4')
    state = env.reset()
    actions = env.action_space.n
    model = K.models.load_model('policy.h5')
    memory = SequentialMemory(limit=1000000, window_length=4)
    policy = GreedyQPolicy()
    process = AtariProcessor()
    dqn = DQNAgent(model=model, nb_actions=actions, memory=memory, policy=policy,
                   processor=process)
    dqn.compile(optimizer=Adam(lr=.00025, clipnorm=1.0), metrics=['mae'])
    dqn.test(env, nb_episodes=10, visualize=True)
def dqndef():
    # Get the environment and extract the number of actions.
    env = gym.make(args.env_name)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n

    # Next, we build our model. We use the same model that was
    # described by Mnih et al. (2015).
    input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE
    model = Sequential()
    if K.image_dim_ordering() == 'tf':
        # (width, height, channels)
        model.add(Permute((2, 3, 1), input_shape=input_shape))
    elif K.image_dim_ordering() == 'th':
        # (channels, width, height)
        model.add(Permute((1, 2, 3), input_shape=input_shape))
    else:
        raise RuntimeError('Unknown image_dim_ordering.')
    model.add(Conv2D(32, (8, 8), strides=(4, 4)))
    model.add(Activation('relu'))
    model.add(Conv2D(64, (4, 4), strides=(2, 2)))
    model.add(Activation('relu'))
    model.add(Conv2D(64, (3, 3), strides=(1, 1)))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    print(model.summary())
    # print(model.output_shape)

    # Finally, we configure and compile our agent. You can use
    # every built-in Keras optimizer and even the metrics!
    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
    processor = AtariProcessor()

    # Select a policy. We use eps-greedy action selection, which means that
    # a random action is selected with probability eps. We anneal eps from
    # 1.0 to 0.1 over the course of 1M steps. This is done so that the agent
    # initially explores the environment (high eps) and then gradually sticks
    # to what it knows (low eps). We also set a dedicated eps value that is
    # used during testing. Note that we set it to 0.05 so that the agent
    # still performs some random actions. This ensures that the agent
    # cannot get stuck.
    policy = GreedyQPolicy()
    # The trade-off between exploration and exploitation is difficult and an
    # on-going research topic. If you want, you can experiment with the
    # parameters or use a different policy. Another popular one is
    # Boltzmann-style exploration:
    # policy = BoltzmannQPolicy(tau=1.)
    # policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
    #                               value_max=1., value_min=.1,
    #                               value_test=.05, nb_steps=1000000)
    # Feel free to give it a try!
    # print(model.output_shape)
    dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
                   processor=processor, nb_steps_warmup=50000, gamma=.99,
                   target_model_update=10000, train_interval=4, delta_clip=1.)
    dqn.compile(Adam(lr=.00025), metrics=['mae'])
    return dqn, env, args
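# --- Hedged usage sketch: because the policy above is purely greedy, the returned
# agent only makes sense for evaluation after trained weights have been loaded
# ('dqn_weights.h5f' is a placeholder path, not from the original source):
dqn, env, args = dqndef()
dqn.load_weights('dqn_weights.h5f')            # placeholder path
dqn.test(env, nb_episodes=10, visualize=False)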