def __init__(self, name, globalAC, config_a, config_c):
    """Build a worker: local actor-critic net, restored weights, environment.

    Args:
        name: Identifier stored on the worker and passed to the local net.
        globalAC: Shared network handed to ``ACnet``; its checkpoint is
            loaded here before the local net pulls from it.
        config_a: Configuration forwarded to ``ACnet`` (third argument).
        config_c: Configuration forwarded to ``ACnet`` (fourth argument).
    """
    self.name = name

    # The global network is not kept as an attribute; it is only used to
    # construct the local net and to restore the checkpoint.
    local_net = ACnet(name, globalAC, config_a, config_c)
    self.AC = local_net

    # Restore the shared parameters first, then copy them into the local net.
    globalAC.load_ckpt()
    local_net.pull_global()

    self.env = wrap()
def test():
    """Run the checkpointed network in ``game[0]`` in an endless loop.

    Loads the stored parameters into a standalone ``ACnet`` and keeps
    stepping the environment, resetting it whenever an episode ends. The
    function never returns on its own.
    """
    from config_a3c import config_a, config_c

    # we only need its params
    agent = ACnet("Global_Net", None, config_a, config_c)
    agent.load_ckpt()

    environment = wrap(game[0])
    state, _, done, info = environment.reset()

    while True:
        a0, a1, a2 = agent.choose_action([state], [info])

        # Action 1 is used when a0 is 0; otherwise the two coordinates are
        # flattened into a single index offset by 2.
        if a0 == 0:
            action = 1
        else:
            action = int(2 + a1 * scr_pixels + a2)

        state, _reward, done, info = environment.step(action)
        if done:
            state, _, done, info = environment.reset()
def main():
    """Run the replay-buffer training loop until ``FLAGS.number_steps``.

    Every iteration performs one environment step and stores the transition
    in the replay memory. A training update is performed only when not in
    test mode, once ``FLAGS.training_start`` steps have passed, and on steps
    that are a multiple of ``FLAGS.learn_freq``. The model is saved whenever
    the step counter is a multiple of ``FLAGS.save_steps`` after an update.
    """
    global env
    env = wrap()
    state_shape = env.state_shape()
    action_shape = env.action_shape()
    from config import config

    model = Model(
        config,
        state_shape,
        action_shape,
        FLAGS.learning_rate,
        FLAGS.gamma,
        FLAGS.save_dir,
    )
    replay = Memory(FLAGS.mem_size)
    policy = Explorer(
        FLAGS.explore_step,
        0.01,
        1.0,
        action_shape[1][0],
        action_shape[1][1],
        action_shape[0],
        0.2,
    )

    # Starting with done=True forces an environment reset on the first pass.
    step = model.get_step()
    done = True

    while step < FLAGS.number_steps:
        if done:
            state, _, _, _ = env.reset()

        q_map, q_other, step = model.start_infer(state)
        action = policy.make_action(step, q_map, q_other, FLAGS.test)
        next_state, reward, done, _ = env.step(action)
        replay.enqueue(state, action, reward, next_state, float(done))
        state = next_state

        should_train = (
            not FLAGS.test
            and step >= FLAGS.training_start
            and step % FLAGS.learn_freq == 0
        )
        if not should_train:
            continue

        # train
        batch = replay.sample(FLAGS.batch_size)
        loss, step = model.start_train(batch)

        print(q_other.shape)
        q_max = max(q_map.max(), q_other.max())
        q_min = min(q_map.min(), q_other.min())
        q_noop = q_other[0][0]
        q_select = q_other[0][1]
        print(
            'q_max: %f, q_min: %f, q_noop: %f, q_select: %f, action: %d, reward: %f'
            % (q_max, q_min, q_noop, q_select, action, reward))
        print('step: %d, loss: %f' % (step, loss))

        if step % FLAGS.save_steps == 0:
            print('')
            print('model saved. step: %d' % step)
            model.save(step)
def main():
    """Run the on-policy (SARSA-style) training loop until ``FLAGS.number_steps``.

    Each iteration executes one action, chooses the follow-up action for the
    resulting observation, and immediately trains on the single transition
    ``(state, action, reward, next_state, next_action, done)``. The follow-up
    action is remembered and executed on the next iteration, unless the
    episode ended, in which case the environment is reset and a fresh action
    is chosen. The model is saved whenever the step counter is a multiple of
    ``FLAGS.save_steps``.
    """
    global env
    env = wrap()
    state_shape = env.state_shape()
    action_shape = env.action_shape()
    from config import config

    dqn = Model(config, state_shape, action_shape, FLAGS.learning_rate,
                FLAGS.gamma, FLAGS.save_dir)
    explorer = Explorer(FLAGS.explore_step, 0.01, 1.0, action_shape[1][0],
                        action_shape[1][1], action_shape[0], 0.2)

    # Starting with done=True forces an environment reset on the first pass.
    done = True
    step = dqn.get_step()
    while step < FLAGS.number_steps:
        if done:
            state, _, _, info = env.reset()
            # No carried-over action at the start of an episode.
            act_rem = None

        q_map, q_other, step = dqn.start_infer(state)
        if act_rem is not None:
            # Execute the action that was already committed to in the
            # previous transition, so the update stays on-policy.
            action = act_rem
        else:
            action = explorer.make_action(step, q_map, q_other, FLAGS.test)

        next_state, reward, done, info = env.step(action)

        # The follow-up action must be chosen from the observation we just
        # arrived in, not from the one we left.
        next_q_map, next_q_other, step = dqn.predict_infer(next_state)
        act_rem = explorer.make_action(step, next_q_map, next_q_other,
                                       FLAGS.test)

        inputs = [[state], [action], [reward], [next_state], [act_rem],
                  [done]]
        loss, step = dqn.start_train(inputs)

        # Advance the current observation; without this every iteration would
        # keep acting and training on the observation returned by reset().
        state = next_state

        print(q_other.shape)
        print('q_max: %f, q_min: %f, q_noop: %f, q_select: %f, action: %d, reward: %f' % (
            max(q_map.max(), q_other.max()),
            min(q_map.min(), q_other.min()),
            q_other[0][0], q_other[0][1], action, reward))
        print('step: %d, loss: %f' % (step, loss))

        if step % FLAGS.save_steps == 0:
            print('')
            print('model saved. step: %d' % step)
            dqn.save(step)
def pre_train(self):
    """Teacher-guided pre-training loop for this worker.

    Runs episodes until ``COORD.should_stop()`` is true or ``GLOBAL_EP``
    reaches ``MAX_GLOBAL_EP``. At every step both the local network and
    ``teacher`` propose an action; the executed action takes the network's
    ``a0`` decision but the teacher's ``a1``/``a2`` values. Transitions are
    buffered and, every ``UPDATE_GLOBAL_ITER`` steps or at episode end,
    turned into discounted value targets and pushed through
    ``self.AC.update_global_high`` before the local net pulls the global
    parameters again.

    Side effects visible here: appends to ``GLOBAL_RUNNING_R``, increments
    ``GLOBAL_EP``, prints progress, and may close and replace ``self.env``
    while changing ``self.hard``.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source. Confirm in particular that the reward
    shaping ``if`` is a sibling of the ``lei_ji`` check and that ``break``
    belongs to ``if done:``.
    """
    global GLOBAL_RUNNING_R, GLOBAL_EP
    # self.AC.pull_global()
    total_step = 1
    # Per-step buffers; the *_exp buffers hold the teacher's actions.
    buffer_s, buffer_a0, buffer_a1, buffer_a2, buffer_r, buffer_avail,buffer_a0_exp,buffer_a1_exp,buffer_a2_exp = [], [], [], [], [], [],[],[],[]
    while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
        state, _, _, info = self.env.reset(
        )  # timestep[0] contains rewards, observations, etc. SEE pysc2 FOR MORE INFO
        ep_r = 0  # sum of the shaped rewards of this episode
        lei_ji = 0  # sum of the raw (unshaped) rewards of this episode
        while True:
            a0, a1, a2 = self.AC.choose_action([state], [info])
            a0_exp, a1_exp, a2_exp = teacher.action(state, info)
            # print(state)
            # The network decides a0, but the executed coordinates come from
            # the teacher (a1_exp / a2_exp), not from the network's a1 / a2.
            action = 1 if a0 == 0 else int(2 + a1_exp * scr_pixels + a2_exp)
            buffer_s.append([state])
            buffer_avail.append([info])
            buffer_a0.append(a0)
            buffer_a1.append(a1)
            buffer_a2.append(a2)
            buffer_a0_exp.append(a0_exp)
            buffer_a1_exp.append(a1_exp)
            buffer_a2_exp.append(a2_exp)
            state, reward, done, info = self.env.step(action)
            lei_ji += reward
            # Force the episode to end once the raw reward sum reaches 20.
            if lei_ji >= 20:
                done = True
            # Positive rewards are scaled up by the shaped return so far.
            if reward > 0:
                reward = reward * (1 + ep_r * weight)
            buffer_r.append(reward)
            ep_r += reward
            if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                # Bootstrap value: 0 at episode end, otherwise the local
                # net's value output for the latest state.
                if done:
                    v_s_ = 0
                else:
                    v_s_ = sess.run(self.AC.value,
                                    {self.AC.s: [state]})[0, 0]
                buffer_v_target = []
                for r in buffer_r[::-1]:  # reverse buffer r
                    v_s_ = r + GAMMA * v_s_  # compute v target
                    buffer_v_target.append(v_s_)
                # Targets were built last-to-first; restore step order.
                buffer_v_target.reverse()
                buffer_s, buffer_a0, buffer_a1, buffer_a2, buffer_v_target, buffer_avail, buffer_a0_exp, buffer_a1_exp, buffer_a2_exp = np.vstack(
                    buffer_s
                ), np.vstack(buffer_a0), np.vstack(buffer_a1), np.vstack(
                    buffer_a2), np.vstack(buffer_v_target), np.vstack(
                        buffer_avail), np.vstack(buffer_a0_exp), np.vstack(
                            buffer_a1_exp), np.vstack(
                                buffer_a2_exp
                            )  # put together into a single array
                feed_dict = {
                    self.AC.s: buffer_s,
                    self.AC.a0: buffer_a0,
                    self.AC.a1: buffer_a1,
                    self.AC.a2: buffer_a2,
                    self.AC.a0_exp: buffer_a0_exp,
                    self.AC.a1_exp: buffer_a1_exp,
                    self.AC.a2_exp: buffer_a2_exp,
                    self.AC.v_target: buffer_v_target,
                    self.AC.available: buffer_avail,
                }
                # The return value is only referenced by the commented-out
                # debug output further down.
                test = self.AC.update_global_high(
                    feed_dict)  # update parameters
                #closs ,aloss,exp_loss= sess.run([self.AC.c_loss,self.AC.a_loss,self.AC.exp_loss], feed_dict=feed_dict)
                #print("c_loss:",closs,"a_loss:",aloss,"exp_loss",exp_loss)
                #sigma_1,sigma_2 = sess.run([self.AC.sigma_1,self.AC.sigma_2],feed_dict = feed_dict)
                # Diagnostics for the end-of-episode print. Because `done`
                # always enters this branch, they are bound before that print.
                entropy, aloss, td, exp_loss, prob_a = sess.run(
                    [
                        self.AC.entropy, self.AC.a_loss, self.AC.td,
                        self.AC.exp_loss, self.AC.log_prob_a
                    ],
                    feed_dict=feed_dict)
                # Start fresh buffers for the next update window.
                buffer_s, buffer_a0, buffer_a1, buffer_a2, buffer_r, buffer_avail = [], [], [], [], [], []
                buffer_a0_exp, buffer_a1_exp, buffer_a2_exp = [], [], []
                self.AC.pull_global()
            total_step += 1
            if done:
                if len(GLOBAL_RUNNING_R
                       ) == 0:  # record running episode reward
                    GLOBAL_RUNNING_R.append(ep_r)
                else:
                    # Exponential moving average of the shaped episode return.
                    GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] +
                                            0.05 * ep_r)
                print(
                    self.name,
                    "episode:",
                    GLOBAL_EP,
                    '| reward: %.1f' % lei_ji,
                    "| running_reward: %.1f" % GLOBAL_RUNNING_R[-1],
                    # '| sigma:', test, # debug
                )
                GLOBAL_EP += 1
                print("entropy", entropy[0][0], "td", td[0], "prob_a:",
                      prob_a, "prob_exp:", exp_loss, "aloss", aloss)
                # self.globalAC.save_ckpt()
                # with open("/summary.txt",'w') as f:
                #     f.write('%.lf' % ep_r)
                # Switch difficulty when the shaped return leaves the
                # [score_low, score_high] band of the current level.
                if ep_r > score_high[self.hard] or ep_r < score_low[
                        self.hard]:
                    self.env.close()
                    # Parses as (hard + 1) if above the band else (hard - 1).
                    # NOTE(review): no bounds check on the new index is
                    # visible here; confirm game/score tables cover it.
                    self.hard = self.hard + 1 if ep_r > score_high[
                        self.hard] else self.hard - 1
                    self.env = wrap(game[self.hard])
                break
def __init__(self, name, globalAC, config_a, config_c):
    """Create a worker with its own environment and local actor-critic net.

    Args:
        name: Identifier stored on the worker and passed to the local net.
        globalAC: Shared network; kept on the worker and handed to ``ACnet``.
        config_a: Configuration forwarded to ``ACnet`` (third argument).
        config_c: Configuration forwarded to ``ACnet`` (fourth argument).
    """
    # The environment is created before the network, as in the original order.
    self.env = wrap(game)

    self.name = name
    self.globalAC = globalAC

    self.AC = ACnet(name, globalAC, config_a, config_c)
def __init__(self):
    """Create the wrapped environment and store it on ``self.env``."""
    environment = wrap()
    self.env = environment