import numpy as np
import cv2
import theano.tensor as T
from theano import config, function, shared

# Project-local modules: the import paths below are assumed from the class
# names used in this file; adjust them to the actual layout of this repo.
from deep_q_network import DeepQNetwork
from replay_memory import ReplayMemory
from optimizers import RMSprop

floatX = config.floatX


class NeuralQLearner:

    def __init__(self, settings):
        assert isinstance(settings, dict)
        # Seeded random number generator
        self.rng = settings['RNG']
        # Epsilon (exploration schedule)
        self.epsilon_start = settings['EPSILON_START']
        self.epsilon_end = settings['EPSILON_END']
        self.epsilon_end_time = settings['EPSILON_END_TIME']
        self.testing_epsilon = settings['TESTING_EPSILON']
        self.epsilon_decay = (self.epsilon_start - self.epsilon_end) / float(self.epsilon_end_time)
        # Training
        self.learning_rate = settings['LEARNING_RATE']
        self.rmsprop_rho = settings['RMSPROP_RHO']
        self.rmsprop_epsilon = settings['RMSPROP_EPSILON']
        self.target_net_update = settings['TARGET_NET_UPDATE']
        self.min_reward = settings['MIN_REWARD']
        self.max_reward = settings['MAX_REWARD']
        # Q-Learning parameters
        self.n_actions = settings['N_ACTIONS']
        self.discount_factor = settings['DISCOUNT_FACTOR']
        self.update_frequency = settings['UPDATE_FREQUENCY']
        self.learn_start = settings['LEARN_START']
        self.agent_history_length = settings['AGENT_HISTORY_LENGTH']
        self.batch_size = settings['BATCH_SIZE']
        # Preprocessing
        self.resize_width = settings['RESIZE_WIDTH']
        self.resize_height = settings['RESIZE_HEIGHT']
        self.resize_dims = (self.resize_width, self.resize_height)
        # Online network and target network share the same architecture;
        # the target network starts as a copy of the online network.
        self.net = DeepQNetwork(
            self.n_actions,
            self.agent_history_length,
            self.resize_height,
            self.resize_width
        )
        self.target_net = DeepQNetwork(
            self.n_actions,
            self.agent_history_length,
            self.resize_height,
            self.resize_width
        )
        self.target_net.setWeights(self.net.getWeights())
        self.memory = ReplayMemory(settings)
        self.numSteps = 0
        self.lastState = None
        self.lastAction = None
        self.lastTerminal = None
        self.compile()

    def compile(self):
        # Build the Theano training and prediction functions. Minibatch data is
        # copied into shared variables and bound through `givens`, so the
        # compiled functions take no arguments.
        input_shape = (
            self.batch_size,
            self.agent_history_length,
            self.resize_height,
            self.resize_width
        )
        pred_input_shape = (
            1,
            self.agent_history_length,
            self.resize_height,
            self.resize_width
        )
        self.pred_input = shared(np.zeros(pred_input_shape, dtype=floatX))
        self.net_input = shared(np.zeros(input_shape, dtype=floatX))
        self.target_net_input = shared(np.zeros(input_shape, dtype=floatX))
        self.shared_actions = shared(np.zeros((self.batch_size,), dtype='int32'))
        self.shared_rewards = shared(np.zeros((self.batch_size,), dtype='int32'))
        self.shared_terminals = shared(np.zeros((self.batch_size,), dtype=floatX))

        actions = T.ivector()
        rewards = T.ivector()
        terminals = T.vector()

        # Bellman targets: r + (1 - terminal) * gamma * max_a' Q_target(s', a')
        targets = (rewards
                   + (T.ones_like(terminals) - terminals)
                   * self.discount_factor
                   * T.max(self.target_net.qvalues, axis=1))
        diff = targets - self.net.qvalues[T.arange(self.batch_size), actions]
        # Huber-style clipped error: quadratic for |diff| <= 1, linear beyond
        qp = T.minimum(abs(diff), 1.0)
        lp = abs(diff) - qp
        delta = 0.5 * qp ** 2 + lp
        cost = T.sum(delta)

        optimizer = RMSprop(
            cost,
            self.net.params,
            lr=self.learning_rate,
            rho=self.rmsprop_rho,
            epsilon=self.rmsprop_epsilon
        )
        givens = {
            self.net.input: self.net_input,
            self.target_net.input: self.target_net_input,
            actions: self.shared_actions,
            rewards: self.shared_rewards,
            terminals: self.shared_terminals
        }
        self.train = function(
            inputs=[],
            outputs=cost,
            updates=optimizer.getUpdates(),
            givens=givens
        )
        self.prediction = function(
            inputs=[],
            outputs=self.net.qvalues.flatten(1),
            givens={
                self.net.input: self.pred_input
            }
        )

    def preprocess(self, rawstate):
        # Downscale the raw frame to the network's input resolution
        return cv2.resize(rawstate, self.resize_dims, interpolation=cv2.INTER_LINEAR)

    def getEpsilon(self):
        # Linearly annealed epsilon, clipped at epsilon_end
        current_epsilon = self.epsilon_start - (self.numSteps * self.epsilon_decay)
        return max(self.epsilon_end, current_epsilon)

    def qLearnMinibatch(self):
        s1, a, r, t, s2 = self.memory.sampleMinibatch()
        # borrow=True avoids copying the arrays, which is faster
        self.net_input.set_value(s1, borrow=True)
        self.shared_actions.set_value(a, borrow=True)
        self.shared_rewards.set_value(r, borrow=True)
        self.shared_terminals.set_value(t, borrow=True)
        self.target_net_input.set_value(s2, borrow=True)
        return self.train()

    def perceive(self, rawstate, reward, terminal, testing):
        state = self.preprocess(rawstate)
        # Clip the reward to [min_reward, max_reward]
        reward = max(reward, self.min_reward)
        reward = min(reward, self.max_reward)
        self.memory.storeRecentState(state, terminal)
        if((not testing) and (self.lastState is not None)):
            self.memory.storeTransition(self.lastState, self.lastAction, reward, self.lastTerminal)
        actionIndex = 0
        if(not terminal):
            actionIndex = self.eGreedy(testing)
        flag1 = (self.numSteps > self.learn_start)
        flag2 = (self.numSteps % self.update_frequency == 0)
        # Short-circuit evaluation: only train after learn_start steps, every
        # update_frequency steps, and never while testing
        if((not testing) and flag1 and flag2):
            cost = self.qLearnMinibatch()
        if(self.numSteps % self.target_net_update == 0):
            self.target_net.setWeights(self.net.getWeights())
        self.lastState = state
        self.lastAction = actionIndex
        self.lastTerminal = terminal
        if(not testing):
            self.numSteps += 1
        return actionIndex

    def eGreedy(self, testing):
        # With probability epsilon take a random action, otherwise act greedily
        epsilon = self.testing_epsilon if(testing) else self.getEpsilon()
        if(self.rng.uniform(0, 1) < epsilon):
            return self.rng.randint(0, self.n_actions)
        else:
            return self.greedy()

    def greedy(self):
        curState = self.memory.getRecentState()
        curState = curState.reshape(1, curState.shape[0], curState.shape[1], curState.shape[2])
        self.pred_input.set_value(curState, borrow=True)
        q = self.prediction()
        # Pick the action with the highest Q-value, breaking ties at random
        maxq = q[0]
        besta = [0]
        for a in range(1, self.n_actions):
            if(q[a] > maxq):
                maxq = q[a]
                besta = [a]
            elif(q[a] == maxq):
                besta.append(a)
        r = self.rng.randint(0, len(besta))
        return besta[r]
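

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original agent). It shows how the agent is
# wired up: a `settings` dict with the keys read in __init__, then a
# perceive/act loop against an emulator. The numeric values and the `env`
# object (with hypothetical getScreen()/act() methods) are illustrative
# assumptions, not this repo's actual configuration or environment API, and
# ReplayMemory(settings) may require additional keys not listed here.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    settings = {
        'RNG': np.random.RandomState(123),
        'EPSILON_START': 1.0, 'EPSILON_END': 0.1, 'EPSILON_END_TIME': 1000000,
        'TESTING_EPSILON': 0.05,
        'LEARNING_RATE': 0.00025, 'RMSPROP_RHO': 0.95, 'RMSPROP_EPSILON': 0.01,
        'TARGET_NET_UPDATE': 10000, 'MIN_REWARD': -1.0, 'MAX_REWARD': 1.0,
        'N_ACTIONS': 4, 'DISCOUNT_FACTOR': 0.99, 'UPDATE_FREQUENCY': 4,
        'LEARN_START': 50000, 'AGENT_HISTORY_LENGTH': 4, 'BATCH_SIZE': 32,
        'RESIZE_WIDTH': 84, 'RESIZE_HEIGHT': 84,
    }
    agent = NeuralQLearner(settings)

    # Hypothetical training loop: feed the current frame, the reward and the
    # terminal flag into perceive(), then apply the returned action index.
    # reward, terminal = 0, False
    # while True:
    #     action = agent.perceive(env.getScreen(), reward, terminal, testing=False)
    #     reward, terminal = env.act(action)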