def trainNetwork(s, readout, h_fc1, sess):
    # Define the loss function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # Start the game emulator; this opens a window that shows the game in real time
    game_state = game.GameState()

    # Create a double-ended queue to hold the replay memory
    D = deque()

    # Get the initial game state: take the no-op action and reshape the first frame to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # Used to load or save the network parameters
    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # Start training
    epsilon = INITIAL_EPSILON
    t = 0
    # This loop never terminates on its own
    while "flappy bird" != "angry bird":
        # Choose an action with an epsilon-greedy policy
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if t % FRAME_PER_ACTION == 0:
            if random.random() <= epsilon:
                # Take a random action
                print("----------Random Action----------")
                action_index = random.randrange(ACTIONS)
                a_t[action_index] = 1
            else:
                # Take the action with the largest Q(s, a) predicted by the network
                action_index = np.argmax(readout_t)
                a_t[action_index] = 1
        else:
            a_t[0] = 1  # do not jump

        # As the game progresses, keep lowering epsilon to reduce random actions
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # Execute the chosen action and observe the next state and reward
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)

        # Store the transition in D so it can be sampled when updating the parameters
        D.append((s_t, a_t, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # Only update the network parameters after the observation period
        if t > OBSERVE:
            # Sample a random minibatch from D for the parameter update
            minibatch = random.sample(D, BATCH)

            # Split the batch into current states, actions taken, rewards, and next states
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            # Compute the new target values for Q(s, a)
            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                terminal = minibatch[i][4]
                # If the game ended, the target is just the reward
                if terminal:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # Update the network parameters with one gradient step
            train_step.run(feed_dict={y: y_batch, a: a_batch, s: s_j_batch})

        # The state has changed; carry it into the next iteration
        s_t = s_t1
        t += 1

        # Save the network parameters every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step=t)

        # Print game info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        print("TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon,
              "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t))
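The listing above relies on module-level imports and hyperparameter constants (ACTIONS, GAMMA, OBSERVE, and so on) that are defined elsewhere in the script. A minimal sketch of that preamble is given below; the module name and all constant values are illustrative assumptions, not values taken from the listing.

# Minimal preamble assumed by trainNetwork(); the values below are
# illustrative assumptions, not taken from the listing above.
from collections import deque
import random

import cv2
import numpy as np
import tensorflow as tf

import wrapped_flappy_bird as game  # assumed name of the game emulator module

GAME = 'bird'             # name used in checkpoint and log paths
ACTIONS = 2               # number of valid actions (do nothing, flap)
GAMMA = 0.99              # discount factor for future rewards
OBSERVE = 10000           # timesteps spent filling the replay memory before training
EXPLORE = 2000000         # timesteps over which epsilon is annealed
INITIAL_EPSILON = 0.1     # starting exploration rate
FINAL_EPSILON = 0.0001    # final exploration rate
REPLAY_MEMORY = 50000     # maximum number of stored transitions
BATCH = 32                # minibatch size
FRAME_PER_ACTION = 1      # choose a new action every FRAME_PER_ACTION frames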
def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.mul(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    #train_step = tf.train.RMSPropOptimizer(0.00025, 0.95, 0.95, 0.01).minimize(cost)
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = []

    # printing
    '''
    a_file = open("logs_" + GAME + "/readout.txt", 'w')
    h_file = open("logs_" + GAME + "/hidden.txt", 'w')
    '''

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print "Successfully loaded:", checkpoint.model_checkpoint_path
    else:
        print "Could not find old network weights"

    epsilon = INITIAL_EPSILON
    t = 0
    while "pigs" != "fly":
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            action_index = random.randrange(ACTIONS)
            a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        for i in range(0, K):
            # run the selected action and observe next state and reward
            x_t1_col, r_t, terminal = game_state.frame_step(a_t)
            x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)), cv2.COLOR_BGR2GRAY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(x_t1, s_t[:, :, 1:], axis=2)

            # store the transition in D
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.pop(0)

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                # if terminal only equals reward
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict={y: y_batch, a: a_batch, s: s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step=t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        print "TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, \
            "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t)

        # write info to files
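The minibatch loop in both listings above implements the standard Q-learning target: y_i equals r_i when the transition is terminal, and r_i + GAMMA * max over a' of Q(s'_i, a') otherwise. The stand-alone sketch below isolates that computation; the function name, vectorized form, and default gamma are mine, not taken from the listings.

import numpy as np

def q_targets(r_batch, q_next_batch, terminal_batch, gamma=0.99):
    """Compute DQN targets: r for terminal transitions,
    r + gamma * max_a' Q(s', a') otherwise.

    r_batch:        shape (B,)   rewards
    q_next_batch:   shape (B, A) Q values of the next states
    terminal_batch: shape (B,)   True where the episode ended
    """
    r = np.asarray(r_batch, dtype=np.float32)
    q_next = np.asarray(q_next_batch, dtype=np.float32)
    done = np.asarray(terminal_batch, dtype=bool)
    # max over actions, zeroed out for terminal transitions
    return r + gamma * q_next.max(axis=1) * (~done)

# e.g. q_targets([1.0, -1.0], [[0.2, 0.5], [0.1, 0.3]], [False, True])
# -> approximately [1.495, -1.0]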
def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])  # placeholder for the chosen actions (one-hot); used to pick out their Q values
    y = tf.placeholder("float", [None])  # placeholder for the target Q value of the chosen action
    # readout is the output of the network
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)  # Q values of all actions * chosen actions -> Q value of the chosen action
    # train so that readout approaches the target Q value
    cost = tf.reduce_mean(tf.square(y - readout_action))  # the cost is the squared difference between the target Q value and the Q value of the chosen action
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # printing
    a_file = open("logs_" + GAME + "/readout.txt", 'w')
    h_file = open("logs_" + GAME + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1  # the first action is "do nothing", which is the first entry
    x_t, r_0, terminal = game_state.frame_step(do_nothing)  # feeding an action to the game returns the next frame, the reward, and whether the state is terminal
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)  # x_t is the frame resized and converted to grayscale
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)  # s_t is the stack of 4 frames used as input; at the start only one frame exists, so it is repeated 4 times

    # saving and loading networks
    saver = tf.train.Saver()
    tf.global_variables_initializer().run()
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print "Successfully loaded:", checkpoint.model_checkpoint_path
    else:
        print "Could not find old network weights"

    start = time.time()
    epsilon = INITIAL_EPSILON
    t = 0  # timestep = frames
    while "pigs" != "fly":
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]  # evaluate the network on the current input to get the predicted Q values
        a_t = np.zeros([ACTIONS])  # action to take
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            # exploration
            action_index = random.randrange(ACTIONS)
            a_t[action_index] = 1
        else:
            # exploitation
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1
        # a_t is now the one-hot action to take, e.g. [0, 0, 1, 0, 0, 0], chosen by exploration or exploitation depending on epsilon

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE  # once past the observation phase, keep decreasing epsilon

        for i in range(0, K):  # this could be a single step; it applies the selected action K times
            # run the selected action and observe next state and reward
            x_t1_col, r_t, terminal = game_state.frame_step(a_t)
            x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)), cv2.COLOR_BGR2GRAY)
            ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(x_t1, s_t[:, :, 0:3], axis=2)  # s_t1 is the new state for the next step

            # store the transition in D, to be sampled from at random later
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

        # only train if done observing, i.e. while exploring or training
        if t > OBSERVE:
            # sample a minibatch to train on (minibatch from the replay memory)
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]   # s_t
            a_batch = [d[1] for d in minibatch]     # a_t (action)
            r_batch = [d[2] for d in minibatch]     # r_t (reward)
            s_j1_batch = [d[3] for d in minibatch]  # s_t1 (state after executing the action)

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                # compute the target value y
                # if terminal only equals reward
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict={
                y: y_batch,    # if the state is terminal, y = reward of that state; otherwise y = reward + GAMMA * max Q of the next state
                a: a_batch,    # action taken, e.g. (0, 0, 0, 1, 0, 0), from the minibatch sample
                s: s_j_batch   # input images, taken directly from the minibatch sample
            })

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step=t)
        if t % 1000 == 0:
            print(time.time() - start)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        print "TIMESTEP", t, "/ STATE", state, "/ LINES", game_state.total_lines, "/ EPSILON", epsilon, \
            "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t)

        # write info to files
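All three listings preprocess each frame the same way before it enters the network: resize to 80x80, convert to grayscale, binarize with a threshold (in the first and third listings), and append the result to the most recent frames of the previous 4-frame stack. The helper below is a small sketch of that step; the function name is mine, but the cv2/numpy calls mirror the listings.

import cv2
import numpy as np

def preprocess_frame(frame_bgr, prev_state, binarize=True):
    """Turn a raw BGR game frame into the next 80x80x4 network input.

    frame_bgr:  raw frame returned by game_state.frame_step()
    prev_state: previous 80x80x4 stack (s_t)
    binarize:   apply the binary threshold used in two of the listings
    """
    gray = cv2.cvtColor(cv2.resize(frame_bgr, (80, 80)), cv2.COLOR_BGR2GRAY)
    if binarize:
        _, gray = cv2.threshold(gray, 1, 255, cv2.THRESH_BINARY)
    frame = np.reshape(gray, (80, 80, 1))
    # keep three frames of the old stack and prepend the new one
    # (the first and third listings keep channels 0:3; the second keeps 1:4)
    return np.append(frame, prev_state[:, :, :3], axis=2)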