# Imports assumed by the agent variants below; the local module paths for
# CnnFeatureExtractor and QNet follow the surrounding project layout.
import copy
import os
import pickle

import numpy as np
from chainer import cuda

from cnn_feature_extractor import CnnFeatureExtractor
from q_net import QNet


class CnnDqnAgent(object):
    policy_frozen = False
    epsilon_delta = 1.0 / 10 ** 4.4
    min_eps = 0.1
    actions = [0, 1, 2]

    cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
    model = 'bvlc_alexnet.caffemodel'
    model_type = 'alexnet'
    image_feature_dim = 256 * 6 * 6
    image_feature_count = 1

    def _observation_to_featurevec(self, observation):
        # TODO clean
        if self.image_feature_count == 1:
            return np.r_[self.feature_extractor.feature(observation["image"][0]),
                         observation["depth"][0]]
        elif self.image_feature_count == 4:
            return np.r_[self.feature_extractor.feature(observation["image"][0]),
                         self.feature_extractor.feature(observation["image"][1]),
                         self.feature_extractor.feature(observation["image"][2]),
                         self.feature_extractor.feature(observation["image"][3]),
                         observation["depth"][0],
                         observation["depth"][1],
                         observation["depth"][2],
                         observation["depth"][3]]
        else:
            print("not supported: number of camera")

    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        self.depth_image_dim = options['depth_image_dim']
        self.q_net_input_dim = self.image_feature_dim * self.image_feature_count + self.depth_image_dim

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor),
            self.feature_extractor = pickle.load(open(self.cnn_feature_extractor))
            print("done")
        else:
            self.feature_extractor = CnnFeatureExtractor(self.use_gpu, self.model,
                                                         self.model_type, self.image_feature_dim)
            pickle.dump(self.feature_extractor, open(self.cnn_feature_extractor, 'w'))
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)

    def agent_start(self, observation):
        obs_array = self._observation_to_featurevec(observation)

        # Initialize State
        self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.uint8)
        self.state[0] = obs_array
        state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
                               dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Generate an Action e-greedy
        action, q_now = self.q_net.e_greedy(state_, self.epsilon)
        return_action = action

        # Update for next step
        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state.copy()
        self.last_observation = obs_array

        return return_action

    def agent_step(self, reward, observation):
        obs_array = self._observation_to_featurevec(observation)

        # obs_processed = np.maximum(obs_array, self.last_observation)  # Take maximum from two frames

        # Compose State : 4-step sequential observation
        if self.q_net.hist_size == 4:
            self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_array],
                                       dtype=np.uint8)
        elif self.q_net.hist_size == 2:
            self.state = np.asanyarray([self.state[1], obs_array], dtype=np.uint8)
        elif self.q_net.hist_size == 1:
            self.state = np.asanyarray([obs_array], dtype=np.uint8)
        else:
            print("self.DQN.hist_size err")

        state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
                               dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Exploration decays along the time sequence
        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration < self.time:
                self.epsilon -= self.epsilon_delta
                if self.epsilon < self.min_eps:
                    self.epsilon = self.min_eps
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print("Initial Exploration : %d/%d steps" % (self.time, self.q_net.initial_exploration)),
                eps = 1.0
        else:  # Evaluation
            print("Policy is Frozen")
            eps = 0.05

        # Generate an Action by e-greedy action selection
        action, q_now = self.q_net.e_greedy(state_, eps)

        return action, eps, q_now, obs_array

    def agent_step_update(self, reward, action, eps, q_now, obs_array):
        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                        self.state, False)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Simple text based visualization
        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        print('Step:%d Action:%d Reward:%.1f Epsilon:%.6f Q_max:%3f' % (
            self.time, self.q_net.action_to_index(action), reward, eps, q_max))

        # Updates for next step
        self.last_observation = obs_array

        if self.policy_frozen is False:
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            self.time += 1

    def agent_end(self, reward):  # Episode Terminated
        print('episode finished. Reward:%.1f / Epsilon:%.6f' % (reward, self.epsilon))

        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                        self.last_state, True)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Time count
        if self.policy_frozen is False:
            self.time += 1
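# A minimal sketch of how the callbacks above are expected to be sequenced by
# the process that owns the environment (in the original project this is a
# socket server). The names env_reset and env_step, the tuple they return, and
# the option values in the usage comment are placeholders, not part of the
# class above.
def run_episode(agent, env_reset, env_step, max_steps=1000):
    observation = env_reset()                     # e.g. {"image": [...], "depth": [...]}
    action = agent.agent_start(observation)
    for _ in range(max_steps):
        reward, observation, episode_end = env_step(action)
        if episode_end:
            agent.agent_end(reward)
            break
        action, eps, q_now, obs_array = agent.agent_step(reward, observation)
        agent.agent_step_update(reward, action, eps, q_now, obs_array)

# Example wiring (illustrative values):
# agent = CnnDqnAgent()
# agent.agent_init(use_gpu=-1, depth_image_dim=32 * 32)
# run_episode(agent, env_reset, env_step)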
# This variant additionally needs:
from collections import deque


class CnnDqnAgent(object):
    policy_frozen = False
    epsilon_delta = 1.0 / 10 ** 4.4
    # print '%.10f' % (1.0 / 10 ** 4.4)
    # 0.0000398107170553496878006617676337697275812388397753238677978515625
    min_eps = 0.1
    actions = [0, 1, 2]

    cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
    model = 'bvlc_alexnet.caffemodel'
    model_type = 'alexnet'
    image_feature_dim = 256 * 6 * 6
    image_feature_count = 1

    actions_evaluate = deque(maxlen=4)

    def _observation_to_featurevec(self, observation):
        # TODO clean
        if self.image_feature_count == 1:
            # print observation["image"][0].shape, type(observation["image"][0])
            #   -> errors because it is not a numpy array, so it has no shape
            # print self.feature_extractor.feature(observation["image"][0]).shape
            #   -> a 1-D vector of length 256 * 6 * 6
            # observation["depth"][0].shape
            return np.r_[self.feature_extractor.feature(observation["image"][0])]
            # , observation["depth"][0]]
        elif self.image_feature_count == 4:
            return np.r_[self.feature_extractor.feature(observation["image"][0]),
                         self.feature_extractor.feature(observation["image"][1]),
                         self.feature_extractor.feature(observation["image"][2]),
                         self.feature_extractor.feature(observation["image"][3])]
            # observation["depth"][0],
            # observation["depth"][1],
            # observation["depth"][2],
            # observation["depth"][3]]
        else:
            print("not supported: number of camera")

    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        # self.depth_image_dim = options['depth_image_dim']
        self.q_net_input_dim = self.image_feature_dim * self.image_feature_count  # + self.depth_image_dim

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor),
            self.feature_extractor = pickle.load(open(self.cnn_feature_extractor))
            print("done")
        else:
            self.feature_extractor = CnnFeatureExtractor(self.use_gpu, self.model,
                                                         self.model_type, self.image_feature_dim)
            pickle.dump(self.feature_extractor, open(self.cnn_feature_extractor, 'w'))
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)

    def agent_start(self, observation):
        obs_array = self._observation_to_featurevec(observation)  # features extracted and concatenated with np.r_

        # Initialize State
        self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.uint8)
        self.state[0] = obs_array
        state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
                               dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Generate an Action e-greedy
        action, q_now = self.q_net.e_greedy(state_, self.epsilon)
        return_action = action
        print return_action, type(return_action)  # e.g. "Random 2 <type 'int'>"

        # Update for next step
        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state.copy()  # becomes the starting point of the next state
        self.last_observation = obs_array

        return return_action

    def agent_step(self, reward, observation):
        obs_array = self._observation_to_featurevec(observation)  # features extracted and concatenated with np.r_

        # obs_processed = np.maximum(obs_array, self.last_observation)  # Take maximum from two frames

        # Compose State : 4-step sequential observation
        if self.q_net.hist_size == 4:
            self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_array],
                                       dtype=np.uint8)
        elif self.q_net.hist_size == 2:
            self.state = np.asanyarray([self.state[1], obs_array], dtype=np.uint8)
        elif self.q_net.hist_size == 1:
            self.state = np.asanyarray([obs_array], dtype=np.uint8)
        else:
            print("self.DQN.hist_size err")

        # state_ is self.state cast from uint8 to float32, reshaped to (1, hist_size, 256 * 6 * 6)
        state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
                               dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Exploration decays along the time sequence
        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration < self.time:  # e.g. 10000 < elapsed steps
                self.epsilon -= self.epsilon_delta  # start decaying epsilon gradually
                if self.epsilon < self.min_eps:  # if epsilon has decayed below the preset minimum,
                    self.epsilon = self.min_eps  # clamp it to min_eps = 0.1
                eps = self.epsilon
                # self.epsilon starts at 1.0; q_net.py line 160 uses it as: if np.random.rand() < epsilon
            else:  # Initial Exploration Phase
                # prints the current step out of the required exploration steps,
                # e.g. "Initial Exploration : 173/1000 steps"
                print("Initial Exploration : %d/%d steps" % (self.time, self.q_net.initial_exploration)),
                eps = 1.0
        else:  # Evaluation
            print("Policy is Frozen")
            eps = 0.05

        # Generate an Action by e-greedy action selection
        action, q_now = self.q_net.e_greedy(state_, eps)  # state_ is a 3-D array

        # called from server.py line 120 as:
        # self.agent.agent_step_update(reward, action, eps, q_now, obs_array)
        return action, eps, q_now, obs_array

    def agent_step_update(self, reward, action, eps, q_now, obs_array):
        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                        self.state, False)
            print "------------------- Real Index%d" % (self.q_net.data_index)

            self.actions_evaluate.append(self.last_action)
            if self.actions_evaluate[-1] == self.actions.index(2) and reward >= 1.0 and len(self.actions_evaluate) == 4:
                if [self.actions_evaluate[i] for i in xrange(3)] == [1, 0, 1]:
                    index = np.asanyarray(self.q_net.data_index, dtype=np.int8)
                    for i in xrange(1, len(self.actions_evaluate) + 1):
                        self.q_net.d[2][index - i] -= 0.5
            # self.action_evaluate = deque()

            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Simple text based visualization
        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        print('Step:%d Action:%d Reward:%.1f Epsilon:%.6f Q_max:%3f' % (
            self.time, self.q_net.action_to_index(action), reward, eps, q_max))
        # e.g. "Step:92 Action:0 Reward:0.0 Epsilon:1.000000 Q_max:0.000000"

        # Updates for next step
        self.last_observation = obs_array

        if self.policy_frozen is False:
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            self.time += 1

    def agent_end(self, reward):  # Episode Terminated
        print('episode finished. Reward:%.1f / Epsilon:%.6f' % (reward, self.epsilon))

        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                        self.last_state, True)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Time count
        if self.policy_frozen is False:
            self.time += 1
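# Standalone illustration (plain Python, no project dependencies) of the
# action-history check used in agent_step_update above: the last four actions
# are kept in a deque(maxlen=4), and when the window reads [1, 0, 1, 2] and the
# final reward is at least 1.0, the stored entries (apparently the reward
# column q_net.d[2] of the replay memory) are penalized. The action sequence
# below is made up for the demo.
from collections import deque

actions_evaluate = deque(maxlen=4)
for a in [2, 1, 0, 1, 2]:          # older actions fall out of the 4-wide window
    actions_evaluate.append(a)

reward = 1.0
if (len(actions_evaluate) == 4
        and actions_evaluate[-1] == 2
        and reward >= 1.0
        and list(actions_evaluate)[:3] == [1, 0, 1]):
    print("penalize the last four stored experiences")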
class CnnDqnAgent(object):
    policy_frozen = False
    epsilon_delta = 1.0 / 10 ** 4.4
    min_eps = 0.1
    actions = [0, 1, 2]

    cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
    model = 'bvlc_alexnet.caffemodel'
    model_type = 'alexnet'
    image_feature_dim = 256 * 6 * 6
    image_feature_count = 1

    def _observation_to_featurevec(self, observation):
        # TODO clean
        if self.image_feature_count == 1:
            return np.r_[self.feature_extractor.feature(observation["image"][0]),
                         observation["depth"][0]]
        elif self.image_feature_count == 4:
            return np.r_[self.feature_extractor.feature(observation["image"][0]),
                         self.feature_extractor.feature(observation["image"][1]),
                         self.feature_extractor.feature(observation["image"][2]),
                         self.feature_extractor.feature(observation["image"][3]),
                         observation["depth"][0],
                         observation["depth"][1],
                         observation["depth"][2],
                         observation["depth"][3]]
        else:
            print("not supported: number of camera")

    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        self.depth_image_dim = options['depth_image_dim']
        self.q_net_input_dim = self.image_feature_dim * self.image_feature_count + self.depth_image_dim

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor),
            self.feature_extractor = pickle.load(open(self.cnn_feature_extractor))
            print("done")
        else:
            self.feature_extractor = CnnFeatureExtractor(self.use_gpu, self.model,
                                                         self.model_type, self.image_feature_dim)
            pickle.dump(self.feature_extractor, open(self.cnn_feature_extractor, 'w'))
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)
        # self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.float32)
        # self.last_state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.float32)

    def agent_start(self, observation):
        obs_array = self._observation_to_featurevec(observation)

        # Initialize State
        # self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.uint8)
        self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.float32)
        self.state[0] = obs_array
        state_ = np.asanyarray(self.state[0].reshape(1, self.q_net_input_dim), dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # reset lstm state
        self.q_net.action_model.reset()
        self.q_net.scene_model.reset()

        # Generate an Action e-greedy
        action, q_now = self.q_net.e_greedy(state_, self.epsilon)
        return_action = action

        # Update for next step
        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state.copy()
        self.last_observation = obs_array

        return return_action

    def agent_step(self, reward, observation):
        obs_array = self._observation_to_featurevec(observation)

        # obs_processed = np.maximum(obs_array, self.last_observation)  # Take maximum from two frames

        # Compose State : append the newest observation to the history buffer
        # if self.q_net.hist_size == 4:
        #     self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_array], dtype=np.uint8)
        # elif self.q_net.hist_size == 2:
        #     self.state = np.asanyarray([self.state[1], obs_array], dtype=np.uint8)
        # elif self.q_net.hist_size == 1:
        #     self.state = np.asanyarray([obs_array], dtype=np.uint8)
        # else:
        #     print("self.DQN.hist_size err")
        self.state = np.vstack((self.state, obs_array))
        # self.state = np.asanyarray(self.state[len(self.state) - self.q_net.hist_size:len(self.state)], dtype=np.uint8)
        self.state = np.asanyarray(self.state[len(self.state) - self.q_net.hist_size:len(self.state)],
                                   dtype=np.float32)
        state_ = np.asanyarray(self.state[self.q_net.hist_size - 1].reshape(1, self.q_net_input_dim),
                               dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Exploration decays along the time sequence
        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration < self.time:
                self.epsilon -= self.epsilon_delta
                if self.epsilon < self.min_eps:
                    self.epsilon = self.min_eps
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print("Initial Exploration : %d/%d steps" % (self.time, self.q_net.initial_exploration)),
                eps = 1.0
        else:  # Evaluation
            print("Policy is Frozen")
            eps = 0.05

        last_state_ = np.asanyarray(self.state[self.q_net.hist_size - 2].reshape(1, self.q_net_input_dim),
                                    dtype=np.float32)
        # last_state_ = np.asanyarray(self.last_state[self.q_net.hist_size - 1].reshape(1, self.q_net_input_dim), dtype=np.float32)
        if self.use_gpu >= 0:
            last_state_ = cuda.to_gpu(last_state_)

        # Generate an Action by e-greedy action selection
        action, q_now, interest = self.q_net.e_greedy_with_interest(state_, eps, last_state_)
        print("interest is %f" % interest)

        return action, eps, q_now, obs_array, interest

    def agent_step_update(self, reward, action, eps, q_now, obs_array):
        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                        self.state, False)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Simple text based visualization
        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        print('Step:%d Action:%d Reward:%.1f Epsilon:%.6f Q_max:%3f' % (
            self.time, self.q_net.action_to_index(action), reward, eps, q_max))

        # Updates for next step
        self.last_observation = obs_array

        if self.policy_frozen is False:
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            self.time += 1

    def agent_end(self, reward):  # Episode Terminated
        print('episode finished. Reward:%.1f / Epsilon:%.6f' % (reward, self.epsilon))

        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                        self.last_state, True)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Time count
        if self.policy_frozen is False:
            self.time += 1
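# Quick arithmetic on the epsilon schedule shared by all of the variants in
# this file: with epsilon_delta = 1.0 / 10 ** 4.4 (about 3.98e-5 per step),
# decaying from 1.0 down to min_eps = 0.1 takes roughly 22,600 steps after the
# initial exploration phase ends.
epsilon_delta = 1.0 / 10 ** 4.4
min_eps = 0.1
steps_to_min = (1.0 - min_eps) / epsilon_delta
print("epsilon_delta = %.6g" % epsilon_delta)        # ~3.98107e-05
print("steps to reach min_eps: %d" % steps_to_min)   # ~22606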
class CnnDqnAgent(object):
    epsilon_delta = 1.0 / 10 ** 4.4  # amount epsilon decays per step
    min_eps = 0.1                    # minimum value of epsilon
    actions = range(3)

    cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
    model = 'bvlc_alexnet.caffemodel'
    model_type = 'alexnet'
    image_feature_dim = 256 * 6 * 6
    image_count = 1

    def _observation_to_featurevec(self, observation):
        # TODO clean
        if self.image_count == 1:
            return np.r_[self.feature_extractor.feature(observation["image"][0]),
                         observation["depth"][0]]
        elif self.image_count == 4:
            return np.r_[self.feature_extractor.feature(observation["image"][0]),
                         self.feature_extractor.feature(observation["image"][1]),
                         self.feature_extractor.feature(observation["image"][2]),
                         self.feature_extractor.feature(observation["image"][3]),
                         observation["depth"][0],
                         observation["depth"][1],
                         observation["depth"][2],
                         observation["depth"][3]]
        else:
            print("not supported: number of camera")

    def agent_init(self, **options):
        try:
            self.image_count = options['image_count']
            self.depth_image_dim = options['depth_image_dim']
            self.use_gpu = options['use_gpu']
            self.test = options['test']
            self.folder = options["folder"]  # kept on self because save_model uses it
            model_num = options['model_num']
            self.q_net_input_dim = self.image_feature_dim * self.image_count + self.depth_image_dim * self.image_count

            if os.path.exists(self.cnn_feature_extractor):
                print("loading... " + self.cnn_feature_extractor),
                self.feature_extractor = pickle.load(open(self.cnn_feature_extractor))
                print("done")
            else:
                self.feature_extractor = CnnFeatureExtractor(self.use_gpu, self.model,
                                                             self.model_type, self.image_feature_dim)
                pickle.dump(self.feature_extractor, open(self.cnn_feature_extractor, 'w'))
                print("pickle.dump finished")

            self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)

            self.time = model_num + 1  # offset so a save and a load never happen at the same step

            if (self.test):
                self.epsilon = 0.0
            else:
                non_exploration = max(self.time - self.q_net.initial_exploration, 0)
                self.epsilon = max(1.0 - non_exploration * self.epsilon_delta, self.min_eps)

            print "epsilon = ", self.epsilon

            if (self.test or model_num > 0):
                self.q_net.load_model(self.folder, model_num)
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()

    # Methods that select actions and update the state
    def agent_start(self, observation):
        try:
            obs_array = self._observation_to_featurevec(observation)

            # Initialize State
            self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.uint8)
            self.state[0] = obs_array
            state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
                                   dtype=np.float32)
            if self.use_gpu >= 0:
                state_ = cuda.to_gpu(state_)

            # Generate an Action e-greedy
            action, q_now = self.q_net.e_greedy(state_, self.epsilon)
            return_action = action

            # Update for next step
            self.last_action = copy.deepcopy(return_action)
            self.last_state = self.state.copy()

            return return_action
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()

    # Methods that select actions and update the state
    def agent_step(self, observation):
        try:
            obs_array = self._observation_to_featurevec(observation)

            # Compose State : 4-step sequential observation
            if self.q_net.hist_size == 4:
                self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_array],
                                           dtype=np.uint8)
            elif self.q_net.hist_size == 2:
                self.state = np.asanyarray([self.state[1], obs_array], dtype=np.uint8)
            elif self.q_net.hist_size == 1:
                self.state = np.asanyarray([obs_array], dtype=np.uint8)
            else:
                print("self.DQN.hist_size err")

            # when fed into q_func the shape is (n_samples, hist_size, q_net_input_dim)
            state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
                                   dtype=np.float32)
            if self.use_gpu >= 0:
                state_ = cuda.to_gpu(state_)

            # Exploration decays along the time sequence
            if self.test is False:  # Learning ON/OFF
                if self.q_net.initial_exploration < self.time:  # once time exceeds initial_exploration (e.g. 1000)
                    self.epsilon -= self.epsilon_delta
                    if self.epsilon < self.min_eps:
                        self.epsilon = self.min_eps
                    eps = self.epsilon
                else:  # act randomly for the first initial_exploration steps
                    print("Initial Exploration : %d/%d steps" % (self.time, self.q_net.initial_exploration)),
                    eps = 1.0
            else:  # Evaluation
                print("Policy is Frozen")
                eps = 0.0

            # Generate an Action by e-greedy action selection
            action, q_now = self.q_net.e_greedy(state_, eps)

            return action, eps, q_now, obs_array
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()

    # Learning methods
    def agent_step_update(self, reward, action, eps, q_now):
        try:
            # Learning Phase
            if self.test is False:  # Learning ON/OFF
                self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                            self.state, False)
                self.q_net.experience_replay(self.time)

            # Target model update
            if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
                print("Model Updated")
                self.q_net.target_model_update()

            # Simple text based visualization
            if self.use_gpu >= 0:
                q_max = np.max(q_now.get())
            else:
                q_max = np.max(q_now)

            print('Step:%d Action:%d Reward:%.1f Epsilon:%.6f Q_max:%3f' % (
                self.time, self.q_net.action_to_index(action), reward, eps, q_max))

            if self.test is False:
                self.last_action = copy.deepcopy(action)
                self.last_state = self.state.copy()

                # save model
                if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.save_model_freq) == 0:
                    print "------------------Save Model------------------"
                    self.q_net.save_model(self.folder, self.time)

                # Time count
                self.time += 1
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()

    # Learning methods
    def agent_end(self, reward):  # Episode Terminated
        try:
            print('episode finished. Reward:%.1f / Epsilon:%.6f' % (reward, self.epsilon))

            # Learning Phase
            if self.test is False:  # Learning ON/OFF
                self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                            self.last_state, True)
                self.q_net.experience_replay(self.time)

            # Target model update
            if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
                print("Model Updated")
                self.q_net.target_model_update()

            if self.test is False:
                # Model Save
                if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.save_model_freq) == 0:
                    print "------------------Save Model------------------"
                    self.q_net.save_model(self.folder, self.time)

                # Time count
                self.time += 1
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()
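# Sketch of the epsilon that the model-resuming variant above reconstructs in
# agent_init when continuing from a saved model. The model_num and
# initial_exploration values below are illustrative, not taken from the
# project.
epsilon_delta = 1.0 / 10 ** 4.4
min_eps = 0.1
initial_exploration = 1000     # assumed value of q_net.initial_exploration
model_num = 5000               # step count encoded in the loaded model file

time = model_num + 1
non_exploration = max(time - initial_exploration, 0)
epsilon = max(1.0 - non_exploration * epsilon_delta, min_eps)
print("resumed epsilon = %.6f" % epsilon)   # ~0.840717 for the values above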
class CnnDqnAgent(object):
    policy_frozen = False  # set to True to stop learning and only run the policy
    epsilon_delta = 1.0 / 10 ** 4.4
    min_eps = 0.1

    # press, up, down, left, right, none
    num_of_action_type = 6
    num_of_pad = 5

    cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
    model = 'bvlc_alexnet.caffemodel'
    model_type = 'alexnet'
    image_feature_dim = 256 * 6 * 6

    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        self.pad_state_dim = options['pad_states_dim']
        self.q_net_input_dim = self.image_feature_dim + self.pad_state_dim

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor),
            self.feature_extractor = pickle.load(open(self.cnn_feature_extractor))
        else:
            print("pickle.dump start")
            self.feature_extractor = CnnFeatureExtractor(self.use_gpu, self.model,
                                                         self.model_type, self.image_feature_dim)
            pickle.dump(self.feature_extractor, open(self.cnn_feature_extractor, 'wb'))
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.num_of_action_type, self.num_of_pad, self.q_net_input_dim)

    def agent_start(self, observation):
        obs_array = np.r_[self.feature_extractor.feature(observation["image"]),
                          observation["pad_states"]]

        # Initialize State
        self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.uint8)
        self.state[0] = obs_array
        state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
                               dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Generate an Action e-greedy
        action, q_now = self.q_net.e_greedy(state_, self.epsilon)
        return_action = action

        # Update for next step
        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state.copy()
        self.last_observation = obs_array

        return return_action

    def agent_step(self, reward, observation):
        obs_array = np.r_[self.feature_extractor.feature(observation["image"]),
                          observation["pad_states"]]

        # obs_processed = np.maximum(obs_array, self.last_observation)  # Take maximum from two frames

        # Compose State : 4-step sequential observation
        if self.q_net.hist_size == 4:
            self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_array],
                                       dtype=np.uint8)
        elif self.q_net.hist_size == 2:
            self.state = np.asanyarray([self.state[1], obs_array], dtype=np.uint8)
        elif self.q_net.hist_size == 1:
            self.state = np.asanyarray([obs_array], dtype=np.uint8)
        else:
            print("self.DQN.hist_size err")

        state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
                               dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Exploration decays along the time sequence
        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration < self.time:
                self.epsilon -= self.epsilon_delta
                if self.epsilon < self.min_eps:
                    self.epsilon = self.min_eps
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print("Initial Exploration : %d/%d steps" % (self.time, self.q_net.initial_exploration)),
                eps = 1.0
        else:  # Evaluation
            print("Policy is Frozen")
            eps = 0.05

        # Generate an Action by e-greedy action selection
        action, q_now = self.q_net.e_greedy(state_, eps)

        return action, eps, q_now, obs_array

    def agent_step_update(self, reward, action, eps, q_now, obs_array):
        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                        self.state, False)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Simple text based visualization
        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        print('Step:%d Reward:%f Epsilon:%.6f Q_max:%3f' % (self.time, reward, eps, q_max))
        print('Action: {0}'.format(action))

        # Updates for next step
        self.last_observation = obs_array

        if self.policy_frozen is False:
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            self.time += 1

    def agent_end(self, reward):  # Episode Terminated
        print('episode finished. Reward:%.1f / Epsilon:%.6f' % (reward, self.epsilon))

        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                        self.last_state, True)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Time count
        if self.policy_frozen is False:
            self.time += 1
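# Shape check for the observation vector built in agent_start/agent_step of the
# gamepad variant above: an AlexNet feature vector of length 256 * 6 * 6 is
# concatenated with the pad-state vector via np.r_. The pad_state_dim value
# below is only illustrative; the real value comes from options['pad_states_dim'].
import numpy as np

image_feature_dim = 256 * 6 * 6
pad_state_dim = 5 * 6                 # assumed: num_of_pad * num_of_action_type
image_feature = np.zeros(image_feature_dim, dtype=np.float32)
pad_states = np.zeros(pad_state_dim, dtype=np.float32)

obs_array = np.r_[image_feature, pad_states]
print(obs_array.shape)                # (9246,) = (256*6*6 + 30,)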
class CnnDqnAgent(object):
    def __init__(self):
        super(CnnDqnAgent, self).__init__()
        self.policy_frozen = False
        self.epsilon_delta = 1.0 / 10 ** 4.4
        self.min_eps = 0.1
        self.actions = [0, 1, 2]

        self.cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
        self.model = 'bvlc_alexnet.caffemodel'
        self.model_type = 'alexnet'
        self.image_feature_dim = 256 * 6 * 6
        self.image_feature_count = 1
        self.prediction_update_tick = 0

    def _observation_to_featurevec(self, observation):
        feature_image = [self.feature_extractor(observation["image"][i])
                         for i in range(self.image_feature_count)]
        return np.concatenate(feature_image + observation["depth"])

    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        self.depth_image_dim = options['depth_image_dim']
        self.q_net_input_dim = self.image_feature_dim * self.image_feature_count + self.depth_image_dim

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor)
            with open(self.cnn_feature_extractor, 'rb') as f:
                self.feature_extractor = pickle.load(f)
            print("done")
        else:
            print('there is no chainer alexnet model file ', self.cnn_feature_extractor)
            print('making chainer model from ', self.model)
            print('this process takes tens of minutes.')
            self.feature_extractor = CnnFeatureExtractor(self.use_gpu, self.model,
                                                         self.model_type, self.image_feature_dim)
            pickle.dump(self.feature_extractor, open(self.cnn_feature_extractor, 'wb'))
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)

    def agent_start(self, observation):
        # Initialize State
        self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.float32)
        new_feature_vec = self._observation_to_featurevec(observation)
        self.state[0, :] = new_feature_vec

        # Generate an Action e-greedy
        state_ = np.expand_dims(self.state, 0)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_, device=self.use_gpu)
        action, _, deg_intereset = self.q_net.e_greedy(state_, self.epsilon)
        return_action = action

        # Update for next step
        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state.copy()
        self.last_observation = new_feature_vec

        return return_action, deg_intereset

    def agent_step(self, reward, observation):
        new_feature_vec = self._observation_to_featurevec(observation)
        past_states = self.state[0:-1, :].copy()  # snapshot the older rows before overwriting row 0
        self.state[0, :] = new_feature_vec        # newest observation goes to row 0
        self.state[1:, :] = past_states           # older rows shift down by one

        # Exploration decays along the time sequence
        state_ = np.expand_dims(self.state, 0)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_, device=self.use_gpu)

        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration < self.time:
                self.epsilon -= self.epsilon_delta
                if self.epsilon < self.min_eps:
                    self.epsilon = self.min_eps
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print("Initial Exploration : %d/%d steps" % (self.time, self.q_net.initial_exploration)),
                eps = 1.0
        else:  # Evaluation
            print("Policy is Frozen")
            eps = 0.05

        # Generate an Action by e-greedy action selection
        action, q_now, deg_intereset = self.q_net.e_greedy(state_, eps)

        return action, eps, q_now, new_feature_vec, deg_intereset

    def agent_step_update(self, reward, action, eps, q_now, new_feature_vec, deg_intereset):
        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                        self.state, False)
            self.q_net.experience_replay(self.time)
            self.prediction_update_tick += 1
            if self.prediction_update_tick >= 10:
                self.prediction_update_tick = 0
                print('prediction update')
                self.q_net.prediction_update()

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Simple text based visualization
        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        print('Step:%d Action:%d Reward:%.1f Epsilon:%.6f Q_max:%3f def_interest:%3f' % (
            self.time, self.q_net.action_to_index(action), reward, eps, q_max, deg_intereset))

        # Updates for next step
        self.last_observation = new_feature_vec

        if self.policy_frozen is False:
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            self.time += 1

    def agent_end(self, reward):  # Episode Terminated
        print('episode finished. Reward:%.1f / Epsilon:%.6f' % (reward, self.epsilon))

        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                        self.last_state, True)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Time count
        if self.policy_frozen is False:
            self.time += 1
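# Standalone demo of the newest-first rolling state buffer maintained in
# agent_step of the variant above; hist_size and the feature dimension are
# reduced here so the arrays are easy to inspect, and the data is synthetic.
import numpy as np

hist_size, feature_dim = 4, 3
state = np.zeros((hist_size, feature_dim), dtype=np.float32)

for t in range(1, 6):
    new_feature_vec = np.full(feature_dim, t, dtype=np.float32)
    past_states = state[0:-1, :].copy()   # snapshot before overwriting row 0
    state[0, :] = new_feature_vec         # newest observation goes to row 0
    state[1:, :] = past_states            # older rows shift down by one
    print(state[:, 0])                    # e.g. after t=3: [3. 2. 1. 0.]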