def load_model(filename):
    model = DQN()
    checkpoint = torch.load(filename)
    model.load_state_dict(checkpoint['state_dict'])
    return model
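# Save-side counterpart (a minimal sketch, not from the original source):
# load_model above expects a checkpoint dict with a 'state_dict' key, so a
# hypothetical save_model would mirror that layout.
def save_model(model, filename):
    torch.save({'state_dict': model.state_dict()}, filename)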
def test(episodes=20, agent=None, load_path=None, ifrender=False, log=False):
    if log:
        logger.configure(dir="./log/", format_strs="stdout")
    if agent is None:
        agent = DQN(num_state=16, num_action=4)
        if load_path:
            agent.load(load_path)
        else:
            agent.load()
    env = Game2048Env()
    score_list = []
    highest_list = []
    for i in range(episodes):
        state, _, done, info = env.reset()
        state = log2_shaping(state)
        start = time.time()
        while True:
            action = agent.select_action(state, deterministic=True)
            next_state, _, done, info = env.step(action)
            next_state = log2_shaping(next_state)
            state = next_state
            if ifrender:
                env.render()
            if done:
                print(env.Matrix)
                if log:
                    logger.logkv('episode number', i + 1)
                    logger.logkv('episode reward', info['score'])
                    logger.logkv('episode steps', info['steps'])
                    logger.logkv('highest', info['highest'])
                    logger.dumpkvs()
                break
        end = time.time()
        if log:
            print('episode time:{} s\n'.format(end - start))
        score_list.append(info['score'])
        highest_list.append(info['highest'])
    print('mean score:{}, mean highest:{}'.format(np.mean(score_list), np.mean(highest_list)))
    print('max score:{}, max highest:{}'.format(np.max(score_list), np.max(highest_list)))
    result_info = {
        'mean': np.mean(score_list),
        'max': np.max(score_list),
        'list': score_list
    }
    print(highest_list)
    return result_info
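# Example invocation (sketch; assumes a trained checkpoint at the loader's
# default path): run a short rendered evaluation and print the mean score.
if __name__ == '__main__':
    results = test(episodes=5, ifrender=True, log=True)
    print('mean over 5 episodes:', results['mean'])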
def __init__(self, model=0):
    # Load the movie database, slot dictionary and user goals for the dialogue task.
    self.movie_db = pickle.load(open('data/movie_db.pkl', 'rb'), encoding='latin1')
    self.movie_dict = pickle.load(open('data/movie_dict.pkl', 'rb'), encoding='latin1')
    self.user_goals = pickle.load(open('data/movie_user_goals.pkl', 'rb'), encoding='latin1')
    remove_empty_slots(self.movie_db)
    self.dst = DST(self.movie_db)
    self.emc = EMC(self.movie_dict)
    # Agent variant: 0 = vanilla DQN, 1 = double DQN, anything else = prioritized-replay DQN.
    if model == 0:
        self.dqn = DQN(self.dst.state_size)
    elif model == 1:
        self.dqn = DDQN(self.dst.state_size)
    else:
        self.dqn = PerDQN(self.dst.state_size)
def train_dqn(episode, net):
    if not os.path.exists('./checkpoints'):
        os.mkdir('./checkpoints')
    loss = []  # per-episode scores (kept under the original name)
    # Pick the agent variant by name.
    if net == "ddqn":
        agent = DDQN(env.action_space.n, env.observation_space.shape[0])
    elif net == "dqn_fixed":
        agent = DQN_F(env.action_space.n, env.observation_space.shape[0])
    else:
        agent = DQN(env.action_space.n, env.observation_space.shape[0])
    current_time = datetime.datetime.now().strftime("%d%m%Y-%H%M%S")
    log_dir = 'logs/' + net + '/' + current_time
    summary_writer = tf.summary.create_file_writer(log_dir)
    count_is_solved = 0  # moved out of the loop so the weights are saved only once
    for e in range(episode):
        state = env.reset()
        state = np.reshape(state, (1, agent.state_space))
        score = 0
        done = False
        i = 0
        while not done:
            action = agent.act_greedy_policy(state)
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, (1, agent.state_space))
            score += reward
            agent.remember(state, action, reward, next_state, done)
            #env.render()
            state = next_state
            agent.experience_replay()
            i += 1
            if i % 100 == 0 and not isinstance(agent, DQN):
                agent.copy_weights()
                # For a soft update with the DDQN, substitute agent.soft_update_weights()
                #agent.soft_update_weights()
            if done:
                print("Episode: {}/{}, score: {}, eps: {}".format(
                    e + 1, episode, np.round(score, decimals=2),
                    np.round(agent.epsilon, decimals=2)))
                break
        loss.append(score)
        # Average score over the last 100 episodes
        is_solved = np.mean(loss[-100:])
        with summary_writer.as_default():
            tf.summary.scalar('Episode reward', score, step=e)
            tf.summary.scalar('Avg reward (last 100)', is_solved, step=e)
        print("Average over last 100 episodes: {0:.2f} \n".format(is_solved))
        if is_solved >= 200:
            print('\n Task Completed! \n')
            if count_is_solved == 0:
                count_is_solved += 1
                agent.dqn_network.model.save_weights('./checkpoints/' + net + '_' + current_time + '.h5')
                if isinstance(agent, DDQN) or isinstance(agent, DQN_F):
                    agent.target_network.model.save_weights('./checkpoints/' + net + 'target' + '_' + current_time + '.h5')
    return loss
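# A common soft-update rule referenced in the comment above (sketch, assuming
# Keras-style get_weights/set_weights on both networks):
# target <- tau * online + (1 - tau) * target.
def soft_update_weights_sketch(agent, tau=0.01):
    online = agent.dqn_network.model.get_weights()
    target = agent.target_network.model.get_weights()
    agent.target_network.model.set_weights(
        [tau * w + (1.0 - tau) * t for w, t in zip(online, target)])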
class LApriltags_ros(Apriltags_ros, object):
    contact_img_converter = True
    detected_flag = False
    frame = np.array([[0, 0], [Camera.image_size[0], Camera.image_size[1]]])
    pure_frame_sizes = []

    def __init__(self, bcc, set_learn=True):
        super(LApriltags_ros, self).__init__(callback=False)
        self.bcc = bcc
        #self.tag_detector = ApriltagsDetector()
        self._Learning_init()
        self.__detected = 0
        self.__nodetected = 0
        self.set_learn = set_learn
        self.__go_learn = False
        self.sub_tag = rospy.Subscriber('/tag_topic', AprilTagDetectionArray,
                                        self.LtagDetectedCallback)
        self.setup_time = self.begin = self.now = self.pre_time = time.time()
        self.TCR = TCR()
        self.count = 0
        self.err_count = 0
        self.detect_count = 0
        self.__continuous_detect_count = 0
        self.pure_frame_size = np.array([0, 0])
        self.__pure_frame = np.array([[0, 0], [0, 0]])

    def LtagDetectedCallback(self, msg):
        ##########################pre_measure############################
        response = self.TCR.response()
        lefttop, rightbottom = LApriltags_ros.frame
        pure_lefttop, pure_rightbottom = self.__pure_frame
        ##########################measure2############################
        """
        recoder2.count += 1
        recoder2.time = self.TCR.now()
        recoder2.response = response
        w = rightbottom[0] - lefttop[0]
        h = rightbottom[1] - lefttop[1]
        recoder2.pixel_w = w
        recoder2.pixel_h = h
        w, h = self.pure_frame_size
        recoder2.pure_pixel_w = w
        recoder2.pure_pixel_h = h
        try:
            recoder2.pos_x = self.tag_detector.getApriltag(0).pose[0][0]
            recoder2.pos_y = self.tag_detector.getApriltag(0).pose[1][0]
            recoder2.pos_z = self.tag_detector.getApriltag(0).pose[2][0]
            recoder2.speed_x = self.tag_detector.getApriltag(0).speed[0][0]
            recoder2.speed_y = self.tag_detector.getApriltag(0).speed[1][0]
            recoder2.speed_z = self.tag_detector.getApriltag(0).speed[2][0]
            recoder2.euler_x = self.tag_detector.getApriltag(0).euler[0]
            recoder2.euler_y = self.tag_detector.getApriltag(0).euler[1]
            recoder2.euler_z = self.tag_detector.getApriltag(0).euler[2]
            recoder2.speed_eulerx = self.tag_detector.getApriltag(0).d_euler[0]
            recoder2.speed_eulery = self.tag_detector.getApriltag(0).d_euler[1]
            recoder2.speed_eulerz = self.tag_detector.getApriltag(0).d_euler[2]
        except:
            recoder2.pos_x = 0
            recoder2.pos_y = 0
            recoder2.pos_z = 0
            recoder2.speed_x = 0
            recoder2.speed_y = 0
            recoder2.speed_z = 0
            recoder2.euler_x = 0
            recoder2.euler_y = 0
            recoder2.euler_z = 0
            recoder2.speed_eulerx = 0
            recoder2.speed_eulery = 0
            recoder2.speed_eulerz = 0
        recoder2.save()
        """
        ##########################measure3############################
        recoder_rl2.time = self.TCR.between()
        recoder_rl2.response = response
        tag_velK, anzenK, uv_velK = self.tag_detector.getGain()
        recoder_rl2.tag_velK = tag_velK
        recoder_rl2.anzenK = anzenK
        recoder_rl2.uv_velK = uv_velK
        recoder_rl2.bbox_lefttop_x = lefttop[0]
        recoder_rl2.bbox_lefttop_y = lefttop[1]
        recoder_rl2.bbox_rightbottom_x = rightbottom[0]
        recoder_rl2.bbox_rightbottom_y = rightbottom[1]
        recoder_rl2.pure_bbox_lefttop_x = pure_lefttop[0]
        recoder_rl2.pure_bbox_lefttop_y = pure_lefttop[1]
        recoder_rl2.pure_bbox_rightbottom_x = pure_rightbottom[0]
        recoder_rl2.pure_bbox_rightbottom_y = pure_rightbottom[1]
        recoder_rl2.episode = self._episode
        recoder_rl2.reward = self._episode_reward
        recoder_rl2.save()
        ###########################!--measure--#########################
        if (self.bcc.need_switch_fase()):
            recoder_rl.episode = self._episode
            recoder_rl.reward = self._episode_reward
            if not self._episode_reward_count == 0:
                recoder_rl.reward_ave = self._episode_reward / self._episode_reward_count
            if self._episode_learn_count == 0:
                recoder_rl.loss = None
            else:
                recoder_rl.loss = self._episode_loss / self._episode_learn_count
            recoder_rl.learn_count = self._episode_learn_count
            recoder_rl.save()
            """
            recoder.to_csv()
            recoder2.to_csv()
            recoder.reset()
            recoder2.reset()
            """
            #print(termcolor.colored("switch",'red'))
            self.TCR.reset()
            self._Learning_reset()
            self.__go_learn = False
            self.begin = time.time()
            self.count = 0
            self.err_count = 0
            self.detect_count = 0
            self.__continuous_detect_count = 0
            #LApriltags_ros.detected_flag = False
        """
        > detect_t-1 > learning_t-1 > k_t-1
        > frame_t > detect_t > learning_t > k_t
        > frame_t+1 > detect_t+1 > learning_t+1 > k_t+1
        > frame_t+2
        """
        ##########################learning###########################
        if (self.set_learn and self.__go_learn):
            tag_velK, anzenK, uv_velK = self.tag_detector.getGain()
            #v1
            k2, k3 = self._GainWithLearning((anzenK, uv_velK), LApriltags_ros.detected_flag,
                                            self.__pure_frame, LApriltags_ros.frame)
            """#v2
            z = self.tag_detector.getApriltag(0).pose[2][0]
            k2, k3 = self._GainWithLearning((anzenK, uv_velK, z), LApriltags_ros.detected_flag,
                                            self.__pure_frame, LApriltags_ros.frame)
            """
            self.tag_detector.setGain(anzenK=k2, uv_velK=k3)
        #######################!--learning--#########################
        ids = []
        #LApriltags_ros.pure_frame_sizes.clear()
        LApriltags_ros.pure_frame_sizes = []
        if len(msg.detections) > 0:
            self.__continuous_detect_count += 1
            LApriltags_ros.detected_flag = True
            for i in range(len(msg.detections)):
                ids.append(msg.detections[i].id[0])
                self.tag_detector.setApriltag(msg.detections[i])
            if (self.__continuous_detect_count >= 2 and self.TCR.between() >= 1.0):
                self.__go_learn = True
            self.tag_detector.reset_tag_vels(ids)
            for i in range(len(msg.detections)):
                iid = msg.detections[i].id[0]
                self.pure_frame_size = self.tag_detector._getUvPureApriltagSize(iid)
                LApriltags_ros.frame, self.__pure_frame = self.tag_detector.getUvApriltag(
                    msg.detections[i])
                #LApriltags_ros.pure_frame_sizes.append(self.pure_frame_size)
                #LApriltags_ros.frames.append(Apriltags_ros.frame)
            #print(termcolor.colored("detect"+str(self.count),'blue'))
        else:
            self.__continuous_detect_count = 0
            self.tag_detector.all_clear_tags()
            LApriltags_ros.detected_flag = False
            self.tag_detector.reset_tag_vels(ids)
            self.pure_frame_size = np.array([0, 0])
            self.__pure_frame = np.array([[0, 0], [0, 0]])
            LApriltags_ros.frame = np.array(
                [[0, 0], [Camera.image_size[0], Camera.image_size[1]]])
            #LApriltags_ros.pure_frame_sizes.append(self.pure_frame_size)
            #LApriltags_ros.frames.append(Apriltags_ros.frame)
            self.err_count += 1
            #print(termcolor.colored("nondetect"+str(self.err_count),'yellow'))
        self.count += 1
        self.recode_count(LApriltags_ros.detected_flag)
        if not (LApriltags_ros.detected_flag):
            self.__go_learn = False

    def recode_count(self, detect_flag):
        recoder_rl.count = self.count
        recoder_rl2.count = self.count
        if (detect_flag):
            self.detect_count += 1
            recoder_rl.detect_count = self.detect_count
            recoder_rl2.detect_count = self.detect_count
            self.__detected += 1
            self.__nodetected = 0
        else:
            self.__detected = 0
            self.__nodetected += 1

    def _GainWithLearning(self, state, detected_flag, pure_frame, frame):
        #v1
        k2, k3 = self._Learning(state, detected_flag, pure_frame, frame)
        """#v2
        k2, k3, _ = self._Learning(state, detected_flag, pure_frame, frame)
        """
        return k2, k3

    def _Learning_init(self):
        #tag_velK,anzenK,uv_velK=self.tag_detector.getGain()
        '''
        env = Environment(anzenK,uv_velK)
        self.QLagent = QLearningAgent(env)
        '''
        #v1
        self.env = Environment2()
        self.dqn_agent = DQN(self.env.observation_space, self.env.action_space)
        """#v2
        self.env = Environment2_z()
        self.dqn_agent = DQNv2(self.env.observation_space, self.env.action_space)
        """
        self._episode = -2
        self._episode_reward = None
        self._episode_reward_count = 0
        self.__episode_reward_store = []
        #self.__episode_reward_store.append(self._episode_reward)
        self._Learning_reset()

    def _Learning_reset(self):
        '''
        self.QLagent.reset_episode()
        '''
        self._episode += 1
        if not self._episode_reward_count == 0:
            print(termcolor.colored("Episode: " + str(self._episode)
                                    + ", reward_sum: " + str(self._episode_reward)
                                    + ", reward_ave:" + str(self._episode_reward / self._episode_reward_count),
                                    'yellow'))
        self._episode_reward = 0.
        self._episode_reward_count = 0
        self._episode_learn_count = 0
        self._episode_loss = 0.
        s = self.env.reset()
        return s

    def _Learning(self, state, detect_flag, pure_frame, frame):
        s = state
        a = self.dqn_agent.choose_action(s)
        n_s, r, done, info = self.env.step(s, a)
        pure_lt, pure_rb = pure_frame
        lt, rb = frame  # lefttop, rightbottom
        if (lt[0] < pure_lt[0]
                and pure_rb[0] < rb[0]
                and lt[1] < pure_lt[1]
                and pure_rb[1] < rb[1]):
            pure_frame_in_frame = 1.0
        else:
            pure_frame_in_frame = 0.0
        pixel = (rb[0] - lt[0]) * (rb[1] - lt[1])
        pure_pixel = (pure_rb[0] - pure_lt[0]) * (pure_rb[1] - pure_lt[1])
        #v1
        anzenK, uv_velK = n_s
        anzenK = anzenK**2
        """#v2
        anzenK, uv_velK, _ = n_s
        """
        if detect_flag:
            """
            r1 = pure_frame_in_frame * \
                100. * (1. - ((pixel - anzenK * pure_pixel) / (pixel)))
            """
            z = self.tag_detector.getApriltag(0).pose[2][0]
            if (anzenK * pure_pixel >= pixel):
                r1 = 1
            else:
                r1 = 1 * 2 / (1 + np.exp(1 * (pixel - anzenK * pure_pixel) /
                                         (anzenK * pure_pixel)))
            print(r1)
        else:
            r1 = 0.
        r = r1  # + (r2 + r3)
        if done:
            self.env.reset()
            r = -10.0
            r2 = -2.
            r3 = -2
        else:
            minth, maxth = self.env._k1_threshold
            r2 = (maxth - anzenK) / (maxth - minth)
            minth, maxth = self.env._k2_threshold
            r3 = (maxth - uv_velK) / (maxth - minth)
        #if not detect_flag: r = -0.1
        self.dqn_agent.store_transition(s, a, r, n_s)
        self._episode_reward += r
        self._episode_reward_count += 1
        if (self.dqn_agent.memory_counter > self.dqn_agent._MEMORY_CAPACITY):
            self._episode_loss += self.dqn_agent.learn()
            self._episode_learn_count += 1
        if done:
            #print("Ep : ", self._episode, " r: ", self._episode_reward)
            ppp = 0
        return n_s
        """
        if (self.__detected >= count or self.__nodetected == 1):
            return self.QLagent.learn(detect_flag, pure_pixel, pixel)  # anzenk, uv_velk
        else:
            return [0, 0]
        """

    def plot_reward(self):
        plt.figure(1)
        plt.clf()
        sum_reward = torch.tensor(self.__episode_reward_store, dtype=torch.float)
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('reward')
        plt.plot(sum_reward.numpy())
        if len(sum_reward) > 100:
            # unfold expects integer arguments (the original passed floats)
            means = sum_reward.unfold(0, 100, 1).mean(1).view(-1)
            means = torch.cat((torch.zeros(99), means))
            plt.plot(means.numpy())
        plt.pause(0.01)
        if self.is_ipython:  # assumes an is_ipython attribute set elsewhere
            display.clear_output(wait=True)
            display.display(plt.gcf())
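# Note on the detection reward in _Learning above (restating the code, not new
# logic): with margin m = (pixel - anzenK * pure_pixel) / (anzenK * pure_pixel),
# r1 = 2 / (1 + exp(m)), which is 1 when the tracking frame matches the scaled
# tag box (m <= 0 is clamped to 1) and decays toward 0 as the frame grows
# relative to it.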
from flask import Flask, request, render_template, redirect, url_for, abort, jsonify, make_response
import os
import numpy as np
from gym_2048 import Game2048Env
from dqn_agent import DQN
from utils import log2_shaping

env = Game2048Env()
app = Flask(__name__, static_folder="static", static_url_path="/static")
agent = DQN(num_state=16, num_action=4)
agent.load(path='./save/', name='dqn_9017.pkl')

# @app.route('/1')
# def hello_world():
#     with open('./templates/index.html', encoding='utf-8') as f:
#         text = f.read()
#     return text

@app.route('/')
def hello_world2():
    with open('./templates/py2048.html', encoding='utf-8') as f:
        text = f.read()
    return text

@app.route('/move', methods=['POST'])
def tile_move():
    json = request.get_json()
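    # Plausible continuation (sketch; the snippet is cut off here, and the
    # 'board' key is an assumption): shape the posted board the way the agent
    # was trained on, pick a deterministic move, and return it as JSON.
    #   state = log2_shaping(np.array(json['board']))
    #   action = agent.select_action(state, deterministic=True)
    #   return jsonify({'action': int(action)})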
    to before, the action it took, the reward it got and the new state
    it reached at that time """

    def __init__(self, size):
        self.size = size
        self.memory = deque(maxlen=self.size)

    def update(self, SARS):
        self.memory.append(SARS)

    def sample(self, batch_size):
        return zip(*random.sample(self.memory, batch_size))


r_memory = ReplayMemory(memory_size)
agent = DQN(12, 12, 16)
target = DQN(12, 12, 16)
target.load_state_dict(agent.state_dict())
optimizer = Adam(agent.parameters())

def update_target():
    if len(r_memory.memory) < batch_size:
        return
    observation, action, reward, observation_next, done = r_memory.sample(batch_size)
    observations = torch.cat(observation)
    observation_next = torch.cat(observation_next)
    actions = index_action(torch.LongTensor(action))
    rewards = torch.LongTensor(reward)
    done = torch.FloatTensor(done)
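# Minimal sketch of the TD step a function like update_target typically
# finishes with (assumptions: a discount `gamma`, and that `agent`/`target`
# map batched observations to per-action Q-values):
def td_step_sketch(observations, actions, rewards, observation_next, done, gamma=0.99):
    # Q(s, a) for the actions actually taken
    q_values = agent(observations).gather(1, actions.view(-1, 1)).squeeze(1)
    # Bootstrap target from the frozen target network
    with torch.no_grad():
        next_q = target(observation_next).max(1)[0]
    td_target = rewards.float() + gamma * next_q * (1.0 - done)
    loss = torch.nn.functional.smooth_l1_loss(q_values, td_target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()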
def train():
    episodes = train_episodes
    logger.configure(dir="./log/", format_strs="stdout,tensorboard,log")
    agent = DQN(num_state=16, num_action=4)
    env = Game2048Env()
    pf_saver = Perfomance_Saver()
    model_saver = Model_Saver(num=10)
    eval_max_score = 0
    for i in range(episodes):
        state, reward, done, info = env.reset()
        state = log2_shaping(state)
        start = time.time()
        loss = None
        while True:
            if agent.buffer.memory_counter <= agent.memory_capacity:
                action = agent.select_action(state, random=True)
            else:
                action = agent.select_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = log2_shaping(next_state)
            reward = log2_shaping(reward, divide=1)
            agent.store_transition(state, action, reward, next_state)
            state = next_state
            if ifrender:
                env.render()
            # only start updating once the buffer has filled
            if agent.buffer.memory_counter % agent.train_interval == 0 and agent.buffer.memory_counter > agent.memory_capacity:
                loss = agent.update()
            if done:
                if i % log_interval == 0:
                    if loss:
                        logger.logkv('loss', loss)
                    logger.logkv('training progress', (i + 1) / episodes)
                    logger.logkv('episode reward', info['score'])
                    logger.logkv('episode steps', info['steps'])
                    logger.logkv('highest', info['highest'])
                    logger.logkv('epsilon', agent.epsilon)
                    logger.dumpkvs()
                    loss = None
                if i % epsilon_decay_interval == 0:  # epsilon decay
                    agent.epsilon_decay(i, episodes)
                break
        end = time.time()
        print('episode time:{} s\n'.format(end - start))

        # eval
        if i % eval_interval == 0 and i:
            eval_info = test(episodes=test_episodes, agent=agent)
            average_score, max_score, score_lis = eval_info['mean'], eval_info['max'], eval_info['list']
            pf_saver.save(score_lis, info=f'episode:{i}')
            if int(average_score) > eval_max_score:
                eval_max_score = int(average_score)
                name = 'dqn_{}.pkl'.format(eval_max_score)
                agent.save(name=name)
                model_saver.save("./save/" + name)
            logger.logkv('eval average score', average_score)
            logger.logkv('eval max score', max_score)
            logger.dumpkvs()
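# Typical entry point for this script (sketch): call train() under an
# `if __name__ == '__main__':` guard.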
    loss = criterion(preds, Y_batch)
    loss.backward()
    optimizer.step()
    return loss.item()  # loss.data[0] is deprecated in modern PyTorch

def save_checkpoint(state, filename):
    torch.save(state, filename)

#def load_model():
    #torch.load(

if __name__ == '__main__':
    model = DQN()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    criterion = nn.CrossEntropyLoss()

    # Load data (object array, so allow_pickle is required on recent numpy)
    data = np.load('training_data.npy', allow_pickle=True)
    X_train = np.stack(data[:, 0], axis=0).reshape(
        (len(data), 1, len(data[0][0]), len(data[0][0][0])))
    Y_train = data[:, 3]

    # Training loop
    for epoch in range(100):
        # Randomize and batch training data
        batchsize = 8
        # Randomly shuffle each epoch with one shared permutation, so the
        # X/Y pairs stay aligned (shuffling X_train alone would desync labels)
        perm = np.random.permutation(len(X_train))
        X_train, Y_train = X_train[perm], Y_train[perm]
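        # Plausible continuation of the batching step (sketch; the snippet is
        # cut off here):
        #   for b in range(0, len(X_train), batchsize):
        #       X_batch = torch.from_numpy(X_train[b:b + batchsize]).float()
        #       Y_batch = torch.from_numpy(Y_train[b:b + batchsize].astype(np.int64))
        #       preds = model(X_batch)
        #       ...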