import random

import numpy as np

# Module-level constants used by Agent. ALPHA and GAMMA match the values
# used in the training loop below; ACTION and GREEDY_RATIO are not given
# in this section, so the values here are assumptions.
ALPHA = 0.9            # learning rate of the Q-value update
GAMMA = 0.9            # discount factor
ACTION = [0, 1, 2, 3]  # action indices (assumed)
GREEDY_RATIO = 0.1     # exploration rate epsilon (value assumed)


class Agent(object):
    """Generic agent that does not depend on the game rules.

    action: one entry per action pattern
    learning algorithm: Q-learning
        a = getNextAction(s)
        learn(S, a, r, S_next)
    """

    def __init__(self, numAction=4):
        self.action_pattern = range(numAction)
        # MultiLayerPerceptron is defined elsewhere; a sketch of the
        # interface the Agent relies on follows below.
        self.learningObj = MultiLayerPerceptron(numInput=2, numHidden=5,
                                                numOutput=4,
                                                activate1="tanh",
                                                activate2="sigmoid")
        self.X = []  # buffered observations
        self.Y = []  # buffered target Q-value vectors
        self.learnFlg = True

    def displayQ(self):
        self.learningObj.displayQ()

    def setLearnFlg(self, b):
        self.learnFlg = b

    def learn(self, o, a, r, o_next):
        """Update the Q-value of (o, a) and train the neural network on it."""
        dQs = self.learningObj.predict(o)
        qk = dQs[a]
        # The Q-learning target uses the best Q-value of the *next*
        # observation (the original took np.max(dQs) of the current one,
        # a bug that also left o_next unused).
        maxQ = np.max(self.learningObj.predict(o_next))
        dQs[a] = qk + ALPHA * (r + GAMMA * maxQ - qk)
        self.X.append(np.asarray(o))
        self.Y.append(np.asarray(dQs))
        # Keep only the 500 most recent samples.
        if len(self.X) > 500:
            self.X.pop(0)
            self.Y.pop(0)
        err = self.learningObj.fit(np.copy(self.X), np.copy(self.Y),
                                   learning_rate=0.2, epochs=500)
        return err

    def getNextAction(self, o):
        # Pick the action with the largest Q-value: get the Q-value array
        # for the observation (observe) from the NN.
        Q_t = self.learningObj.predict(o)
        best_actions = []
        max_Q = -np.inf
        for i, q in enumerate(Q_t):
            if q > max_Q:
                max_Q = q
                best_actions = [ACTION[i]]
            elif q == max_Q:
                best_actions.append(ACTION[i])
        # Break ties between equally good actions at random.
        a = np.random.choice(best_actions)
        # Not learning: always act greedily.
        if not self.learnFlg:
            return a
        # Learning: epsilon-greedy action selection.
        if GREEDY_RATIO < random.random():
            return a
        return np.random.choice([0, 1, 2, 3])

    def getMaxQvalue(self, o):
        return np.max(self.learningObj.predict(o))

    def get_Q_values(self, o):
        return self.learningObj.predict(o)
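The MultiLayerPerceptron itself is not shown in this section. For reference, here is a minimal sketch of the interface the Agent relies on: predict(o) returning a Q-value vector with one entry per action, and fit(X, Y, learning_rate, epochs) returning a training error. The weight initialization, the plain stochastic backpropagation, and the displayQ stub are assumptions, not the original implementation.

import numpy as np

class MultiLayerPerceptron(object):
    """Minimal 2-layer perceptron matching the interface Agent expects."""

    def __init__(self, numInput, numHidden, numOutput,
                 activate1="tanh", activate2="sigmoid"):
        # Small random weights; the extra column holds the bias term.
        self.W1 = np.random.uniform(-0.5, 0.5, (numHidden, numInput + 1))
        self.W2 = np.random.uniform(-0.5, 0.5, (numOutput, numHidden + 1))

    def _forward(self, o):
        x = np.append(np.asarray(o, dtype=float), 1.0)  # input + bias
        h = np.append(np.tanh(self.W1.dot(x)), 1.0)     # tanh hidden + bias
        y = 1.0 / (1.0 + np.exp(-self.W2.dot(h)))       # sigmoid output
        return x, h, y

    def predict(self, o):
        """Return the Q-value vector (one entry per action) for observation o."""
        return self._forward(o)[2]

    def fit(self, X, Y, learning_rate=0.2, epochs=500):
        """Plain stochastic backpropagation over the buffered samples."""
        err = 0.0
        for _ in range(epochs):
            i = np.random.randint(len(X))
            x, h, y = self._forward(X[i])
            t = np.asarray(Y[i], dtype=float)
            delta2 = (y - t) * y * (1.0 - y)            # sigmoid derivative
            delta1 = self.W2[:, :-1].T.dot(delta2) * (1.0 - h[:-1] ** 2)
            self.W2 -= learning_rate * np.outer(delta2, h)
            self.W1 -= learning_rate * np.outer(delta1, x)
            err = 0.5 * np.sum((y - t) ** 2)
        return err

    def displayQ(self):
        # Stub: the original presumably visualizes Q-values; its exact
        # behavior is not shown in this section.
        pass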
ALPHA = 0.9
GAMMA = 0.9

qk = Q[a]
# Max Q-value over the next state (replacing np.max(Q), which took the
# max over the current state's Q-values: the same bug as in Agent.learn).
maxQ = np.max(mpl.predict(S_next))
Q[a] = qk + ALPHA * (r + GAMMA * maxQ - qk)
print('Qs:', Q)

# 3. Train the neural network
X.append(np.array(o))
Y.append(Q)
if len(X) > 1000:
    X.pop(0)
    Y.pop(0)
mpl.fit(np.asarray(X), np.asarray(Y))

# 4. Decide whether to reset the state
if option == GOAL:
    goaled_number += 1
    S = state.getInitState()
else:
    S = S_next

print('>> GOAL NUMBER :', goaled_number)
# print(agent.displayQ())
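To tie the refactored class to the loop above, here is a minimal end-to-end driver sketch. The 3x3 grid, the step() helper, GOAL_POS, and the step budget are all hypothetical stand-ins for the actual game; only the Agent calls mirror the code in this section.

# Hypothetical grid world: step() clips moves to the board and pays
# reward 1.0 on reaching the goal. All names here are illustrative.
GOAL_POS = (2, 2)
MOVES = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # up, down, left, right

def step(S, a):
    row = min(max(S[0] + MOVES[a][0], 0), 2)
    col = min(max(S[1] + MOVES[a][1], 0), 2)
    S_next = (row, col)
    return S_next, (1.0 if S_next == GOAL_POS else 0.0), S_next == GOAL_POS

agent = Agent(numAction=4)
S = (0, 0)
goaled_number = 0
for t in range(200):
    a = agent.getNextAction(S)
    S_next, r, goaled = step(S, a)
    agent.learn(S, a, r, S_next)
    if goaled:
        goaled_number += 1
        S = (0, 0)  # reset, mirroring state.getInitState()
    else:
        S = S_next
print('>> GOAL NUMBER :', goaled_number)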