def agent_init(self, **options):
    """Configure the agent, obtain the CNN feature extractor and build the Q-network.

    The feature extractor is loaded from the pickle file named by
    ``self.cnn_feature_extractor`` when that file exists; otherwise it is
    built with ``CnnFeatureExtractor`` and pickled to that path for reuse.

    Args:
        **options: must contain ``'use_gpu'`` and ``'depth_image_dim'``.

    Raises:
        KeyError: if a required option is missing.
    """
    self.use_gpu = options['use_gpu']
    self.depth_image_dim = options['depth_image_dim']

    # Q-network input = all image feature vectors followed by the depth values.
    self.q_net_input_dim = (self.image_feature_dim * self.image_feature_count
                            + self.depth_image_dim)

    if os.path.exists(self.cnn_feature_extractor):
        print("loading... " + self.cnn_feature_extractor)
        # NOTE(review): pickle executes arbitrary code on load; only use a
        # cache file that this program wrote itself.
        with open(self.cnn_feature_extractor, 'rb') as f:
            self.feature_extractor = pickle.load(f)
        print("done")
    else:
        print('there is no chainer alexnet model file ', self.cnn_feature_extractor)
        print('making chainer model from ', self.model)
        print('this process take a tens of minutes.')
        self.feature_extractor = CnnFeatureExtractor(
            self.use_gpu, self.model, self.model_type, self.image_feature_dim)
        # Context manager guarantees the cache file is flushed and closed.
        with open(self.cnn_feature_extractor, 'wb') as f:
            pickle.dump(self.feature_extractor, f)
        print("pickle.dump finished")

    self.time = 0
    self.epsilon = 1.0  # Initial exploration rate
    self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)
def agent_init(self, **options):
    """Read the run options, derive the network input sizes and build the Q-network.

    Args:
        **options: must contain ``'use_gpu'``, ``'agent_count'``,
            ``'rgb_image_count'``, ``'depth_image_dim'``, ``'ir_dim'``,
            ``'ground_dim'``, ``'compass_dim'``, ``'target_dim'`` and ``'model'``.

    Raises:
        KeyError: if a required option is missing.
    """
    # (attribute name, option key) pairs, read in this order.
    option_table = (
        ('use_gpu', 'use_gpu'),
        ('agent_count', 'agent_count'),
        ('image_count', 'rgb_image_count'),
        ('depth_image_dim', 'depth_image_dim'),
        ('ir_idm', 'ir_dim'),
        ('ground_dim', 'ground_dim'),
        ('compass_dim', 'compass_dim'),
        ('target_dim', 'target_dim'),
        ('model', 'model'),
    )
    for attr_name, option_key in option_table:
        setattr(self, attr_name, options[option_key])

    self.cnn_input_dim = self.image_dim * self.image_count
    self.feature_dim = self.image_feature_dim * self.image_feature_count

    # Size of the non-image part of the observation.
    other_dim = self.depth_image_dim + self.ir_idm
    other_dim = other_dim + self.ground_dim
    other_dim = other_dim + self.compass_dim
    other_dim = other_dim + self.target_dim
    self.other_input_dim = other_dim

    self.time = 1
    self.epsilon = 1.0
    self.avgloss_log_file = self.avgloss_log + "avg_loss.log"

    # The literal string 'None' means "no model given"; any other value
    # starts with a lower exploration rate.
    model_given = self.model != 'None'
    if model_given:
        self.policy_frozen = False
        self.epsilon = 0.5

    self.q_net = QNet(self.use_gpu, self.actions, self.cnn_input_dim,
                      self.feature_dim, self.agent_count,
                      self.other_input_dim, self.model)
def __init__(self, g_list, test_g_list, env):
    """Set up the networks, optimizer, epsilon settings and sampling order.

    Args:
        g_list: training graphs.
        test_g_list: evaluation graphs; ``None`` means reuse ``g_list``.
        env: environment object stored on the agent.
    """
    self.g_list = g_list
    self.test_g_list = g_list if test_g_list is None else test_g_list
    self.env = env

    # Online network first, then the snapshot ("old") network.
    self.net = QNet()
    self.old_net = QNet()
    self.optimizer = optim.Adam(self.net.parameters(),
                                lr=cmd_args.learning_rate)

    use_cuda = cmd_args.ctx == 'gpu'
    if use_cuda:
        self.net = self.net.cuda()
        self.old_net = self.old_net.cuda()

    # Epsilon settings.
    self.eps_start = 1.0
    self.eps_end = 1.0
    self.eps_step = 10000
    # NOTE(review): presumably the number of initial simulation iterations
    # used to fill memory before learning; nothing here reads it, so confirm.
    self.burn_in = 100

    self.step = 0
    self.best_eval = None
    self.pos = 0

    # Shuffled visiting order over the training graphs.
    shuffled = list(range(len(g_list)))
    random.shuffle(shuffled)
    self.sample_idxes = shuffled

    self.take_snapshot()
def agent_init(self, **options):
    """Configure the agent, obtain the CNN feature extractor and build the Q-network.

    The feature extractor is loaded from the pickle file named by
    ``self.cnn_feature_extractor`` when that file exists; otherwise it is
    built with ``CnnFeatureExtractor`` and pickled to that path for reuse.
    Depth data is not part of the Q-network input in this variant.

    Args:
        **options: must contain ``'use_gpu'``.

    Raises:
        KeyError: if ``'use_gpu'`` is missing.
    """
    self.use_gpu = options['use_gpu']
    # Image features only; the depth dimension is deliberately left out.
    self.q_net_input_dim = self.image_feature_dim * self.image_feature_count

    if os.path.exists(self.cnn_feature_extractor):
        # Trailing comma kept: it suppresses the newline under a Python 2
        # print statement and is a harmless no-op on Python 3.
        print("loading... " + self.cnn_feature_extractor),
        # Pickle data is binary: text mode fails on Python 3.
        # NOTE(review): pickle executes arbitrary code on load; only use a
        # cache file that this program wrote itself.
        with open(self.cnn_feature_extractor, 'rb') as f:
            self.feature_extractor = pickle.load(f)
        print("done")
    else:
        self.feature_extractor = CnnFeatureExtractor(
            self.use_gpu, self.model, self.model_type, self.image_feature_dim)
        with open(self.cnn_feature_extractor, 'wb') as f:
            pickle.dump(self.feature_extractor, f)
        print("pickle.dump finished")

    self.time = 0
    self.epsilon = 1.0  # Initial exploration rate
    self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)
def agent_init(self, **options):
    """Configure the agent, optionally resuming from a saved model.

    Loads (or builds and caches) the CNN feature extractor, creates the
    Q-network, derives the current exploration rate from the resumed time
    step, and loads saved weights when testing or when ``model_num > 0``.
    Any error is reported with a traceback and terminates the process with
    a non-zero exit status.

    Args:
        **options: must contain ``'image_count'``, ``'depth_image_dim'``,
            ``'use_gpu'``, ``'test'``, ``'folder'`` and ``'model_num'``.
    """
    try:
        self.image_count = options['image_count']
        self.depth_image_dim = options['depth_image_dim']
        self.use_gpu = options['use_gpu']
        self.test = options['test']
        self.folder = options["folder"]  # used by save_model, hence stored on self
        model_num = options['model_num']

        self.q_net_input_dim = (self.image_feature_dim * self.image_count
                                + self.depth_image_dim * self.image_count)

        if os.path.exists(self.cnn_feature_extractor):
            # Trailing comma kept: it suppresses the newline under a
            # Python 2 print statement and is a no-op on Python 3.
            print("loading... " + self.cnn_feature_extractor),
            # Pickle data is binary: text mode fails on Python 3.
            # NOTE(review): pickle executes arbitrary code on load; only use
            # a cache file that this program wrote itself.
            with open(self.cnn_feature_extractor, 'rb') as f:
                self.feature_extractor = pickle.load(f)
            print("done")
        else:
            self.feature_extractor = CnnFeatureExtractor(
                self.use_gpu, self.model, self.model_type,
                self.image_feature_dim)
            with open(self.cnn_feature_extractor, 'wb') as f:
                pickle.dump(self.feature_extractor, f)
            print("pickle.dump finished")

        self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)

        # Offset by one to prevent save and load from happening at the same time.
        self.time = model_num + 1

        if self.test:
            self.epsilon = 0.0
        else:
            # Replay the linear decay for the steps taken beyond the
            # initial exploration phase.
            non_exploration = max(
                self.time - self.q_net.initial_exploration, 0)
            self.epsilon = max(1.0 - non_exploration * self.epsilon_delta,
                               self.min_eps)
        # Single formatted argument: prints the same text under both the
        # Python 2 print statement and the Python 3 print function.
        print("epsilon =  %s" % self.epsilon)

        if self.test or model_num > 0:
            self.q_net.load_model(self.folder, model_num)
    except Exception:
        import traceback
        import sys
        traceback.print_exc()
        # Non-zero status so callers can tell initialisation failed.
        sys.exit(1)
def agent_init(self, **options):
    """Configure the agent, obtain the CNN feature extractor and build the Q-network.

    The feature extractor is loaded from the pickle file named by
    ``self.cnn_feature_extractor`` when that file exists; otherwise it is
    built with ``CnnFeatureExtractor`` and pickled to that path for reuse.

    Args:
        **options: must contain ``'use_gpu'`` and ``'pad_states_dim'``.

    Raises:
        KeyError: if a required option is missing.
    """
    self.use_gpu = options['use_gpu']
    self.pad_state_dim = options['pad_states_dim']
    # Q-network input = one image feature vector followed by the pad state.
    self.q_net_input_dim = self.image_feature_dim + self.pad_state_dim

    if os.path.exists(self.cnn_feature_extractor):
        # Trailing comma kept: it suppresses the newline under a Python 2
        # print statement and is a harmless no-op on Python 3.
        print("loading... " + self.cnn_feature_extractor),
        # The cache is written with 'wb', so it must be read back in binary
        # mode too (text mode fails on Python 3).
        # NOTE(review): pickle executes arbitrary code on load; only use a
        # cache file that this program wrote itself.
        with open(self.cnn_feature_extractor, 'rb') as f:
            self.feature_extractor = pickle.load(f)
    else:
        print("pickle.dump start")
        self.feature_extractor = CnnFeatureExtractor(
            self.use_gpu, self.model, self.model_type, self.image_feature_dim)
        with open(self.cnn_feature_extractor, 'wb') as f:
            pickle.dump(self.feature_extractor, f)
        print("pickle.dump finished")

    self.time = 0
    self.epsilon = 1.0  # Initial exploration rate
    self.q_net = QNet(self.use_gpu, self.num_of_action_type, self.num_of_pad,
                      self.q_net_input_dim)
def __init__(self, obs_dims, act_dim, lr=1e-3, gamma=0.99,
             replay_buffer_size=10000, batch_size=64, epsilon_min=0.01,
             epsilon_dec=5e-5, target_update_frequency=64):
    """Create the replay buffer, the online/target Q-networks and the optimizer.

    Args:
        obs_dims: observation dimensions, passed to ``ReplayBuffer`` and ``QNet``.
        act_dim: number of actions, passed to ``QNet``.
        lr: Adam learning rate for the online network.
        gamma: discount factor.
        replay_buffer_size: capacity passed to ``ReplayBuffer``.
        batch_size: number of transitions sampled per learning step.
        epsilon_min: lower bound stored for the exploration rate.
        epsilon_dec: amount stored for per-step epsilon decay.
        target_update_frequency: stored interval for target-network syncs.
    """
    self.buffer = ReplayBuffer(replay_buffer_size, obs_dims)
    self.batch_size = batch_size

    self.q_eval = QNet(obs_dims, act_dim)
    self.q_target = QNet(obs_dims, act_dim)
    # Start the target network from the same weights as the online network;
    # two independently initialised networks would give unrelated targets.
    self.q_target.load_state_dict(self.q_eval.state_dict())

    self.obs_dims = obs_dims
    self.act_dim = act_dim

    self.learn_ctr = 0
    self.target_update_frequency = target_update_frequency

    self.gamma = gamma
    self.epsilon = 1
    self.epsilon_min = epsilon_min
    self.epsilon_dec = epsilon_dec

    # Only the online network is optimised.
    self.optimizer = torch.optim.Adam(self.q_eval.parameters(), lr=lr)
    self.loss_fn = torch.nn.MSELoss()
def agent_init(self, **options):
    """Configure the agent, obtain the CNN feature extractor and build the Q-network.

    The feature extractor is loaded from the pickle file named by
    ``self.cnn_feature_extractor`` when that file exists; otherwise it is
    built with ``CnnFeatureExtractor`` and pickled to that path for reuse.

    Args:
        **options: must contain ``'use_gpu'`` and ``'depth_image_dim'``.

    Raises:
        KeyError: if a required option is missing.
    """
    self.use_gpu = options['use_gpu']
    self.depth_image_dim = options['depth_image_dim']
    # Q-network input = all image feature vectors followed by the depth values.
    self.q_net_input_dim = (self.image_feature_dim * self.image_feature_count
                            + self.depth_image_dim)

    if os.path.exists(self.cnn_feature_extractor):
        # Trailing comma kept: it suppresses the newline under a Python 2
        # print statement and is a harmless no-op on Python 3.
        print("loading... " + self.cnn_feature_extractor),
        # Pickle data is binary: text mode fails on Python 3.
        # NOTE(review): pickle executes arbitrary code on load; only use a
        # cache file that this program wrote itself.
        with open(self.cnn_feature_extractor, 'rb') as f:
            self.feature_extractor = pickle.load(f)
        print("done")
    else:
        self.feature_extractor = CnnFeatureExtractor(
            self.use_gpu, self.model, self.model_type, self.image_feature_dim)
        with open(self.cnn_feature_extractor, 'wb') as f:
            pickle.dump(self.feature_extractor, f)
        print("pickle.dump finished")

    self.time = 0
    self.epsilon = 1.0  # Initial exploration rate
    self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)
def agent_init(self, **options):
    """Configure the agent, obtain the CNN feature extractor and build the Q-network.

    The feature extractor is loaded from the pickle file named by
    ``self.cnn_feature_extractor`` when that file exists; otherwise it is
    built with ``CnnFeatureExtractor`` and pickled to that path for reuse.

    Args:
        **options: must contain ``'use_gpu'`` and ``'pad_states_dim'``.

    Raises:
        KeyError: if a required option is missing.
    """
    self.use_gpu = options['use_gpu']
    self.pad_state_dim = options['pad_states_dim']
    # Q-network input = one image feature vector followed by the pad state.
    self.q_net_input_dim = self.image_feature_dim + self.pad_state_dim

    if os.path.exists(self.cnn_feature_extractor):
        # Trailing comma kept: it suppresses the newline under a Python 2
        # print statement and is a harmless no-op on Python 3.
        print("loading... " + self.cnn_feature_extractor),
        # The cache is written with 'wb', so it must be read back in binary
        # mode too (text mode fails on Python 3).
        # NOTE(review): pickle executes arbitrary code on load; only use a
        # cache file that this program wrote itself.
        with open(self.cnn_feature_extractor, 'rb') as f:
            self.feature_extractor = pickle.load(f)
    else:
        print("pickle.dump start")
        self.feature_extractor = CnnFeatureExtractor(
            self.use_gpu, self.model, self.model_type, self.image_feature_dim)
        with open(self.cnn_feature_extractor, 'wb') as f:
            pickle.dump(self.feature_extractor, f)
        print("pickle.dump finished")

    self.time = 0
    self.epsilon = 1.0  # Initial exploration rate
    self.q_net = QNet(self.use_gpu, self.num_of_action_type, self.num_of_pad,
                      self.q_net_input_dim)
class CnnDqnAgent(object):
    """DQN agent that drives ``agent_count`` agents through one shared ``QNet``.

    Each agent's state is a short history (``q_net.hist_size`` steps) of
    128x128 RGB images plus a vector of other sensor readings.
    """

    policy_frozen = False               # False: learning on; otherwise evaluation only
    epsilon_delta = 1.0 / (6 * 10**4)   # epsilon decrease per step once training starts
    min_eps = 0.1                       # lower bound for epsilon
    actions = [0, 1, 2, 3, 4, 5, 6, 7]
    image_feature_dim = 14 * 14
    image_feature_count = 32
    image_dim = 128 * 128
    avgloss_log = '/home/ohk/Documents/playground/Assets/log/'

    def _observation_to_state_cnn(self, observation):
        """Stack ``observation["image"]`` into one array with one entry per image."""
        return np.r_[[np.r_[image] for image in observation["image"]]]

    def _observation_to_state_other(self, observation):
        """Concatenate ir, compass and target readings per agent and stack them."""
        # change in another network structure
        rows = [
            np.r_[observation["ir"][i],
                  observation["compass"][i],
                  observation["target"][i]]
            for i in range(len(observation["ir"]))
        ]
        return np.r_[rows]

    def _reshape_for_cnn(self, state, hist_size, x, y):
        """Turn per-agent image histories into channel-first float32 input.

        Args:
            state: indexable as ``state[agent][step]``, each entry an
                image with channels on the last axis.
            hist_size: number of history steps to merge (1, 2 or 4).
            x, y: unused; the output is always 128x128.

        Returns:
            float32 array of shape ``(agent_count, 3 * hist_size, 128, 128)``.
            For any other ``hist_size`` the array is left as zeros.
        """
        state_ = np.zeros((self.agent_count, 3 * hist_size, 128, 128),
                          dtype=np.float32)
        if hist_size in (1, 2, 4):
            for i in range(self.agent_count):
                frames = [state[i][step] for step in range(hist_size)]
                # Join the frames along the channel axis, then move
                # channels to the front.
                state_[i] = np.concatenate(frames, axis=-1).transpose(2, 0, 1)
        return state_

    def agent_init(self, **options):
        """Read the run options, derive the input sizes and build the Q-network.

        Args:
            **options: must contain ``'use_gpu'``, ``'agent_count'``,
                ``'rgb_image_count'``, ``'depth_image_dim'``, ``'ir_dim'``,
                ``'ground_dim'``, ``'compass_dim'``, ``'target_dim'`` and
                ``'model'``.

        Raises:
            KeyError: if a required option is missing.
        """
        self.use_gpu = options['use_gpu']
        self.agent_count = options['agent_count']
        self.image_count = options['rgb_image_count']
        self.depth_image_dim = options['depth_image_dim']
        self.ir_idm = options['ir_dim']
        self.ground_dim = options['ground_dim']
        self.compass_dim = options['compass_dim']
        self.target_dim = options['target_dim']
        self.model = options['model']

        self.cnn_input_dim = self.image_dim * self.image_count
        self.feature_dim = self.image_feature_dim * self.image_feature_count
        self.other_input_dim = (self.depth_image_dim + self.ir_idm
                                + self.ground_dim + self.compass_dim
                                + self.target_dim)

        self.time = 1
        self.epsilon = 1.0
        self.avgloss_log_file = self.avgloss_log + "avg_loss.log"

        # The literal string 'None' means "no model given"; any other value
        # starts with a lower exploration rate.
        if self.model != 'None':
            self.policy_frozen = False
            self.epsilon = 0.5

        self.q_net = QNet(self.use_gpu, self.actions, self.cnn_input_dim,
                          self.feature_dim, self.agent_count,
                          self.other_input_dim, self.model)

    def _network_inputs(self):
        """Return the current state as float32 network inputs scaled by 1/255."""
        hist_size = self.q_net.hist_size
        state_cnn_ = self._reshape_for_cnn(self.state_cnn, hist_size, 128, 128)
        state_cnn_ /= 255.0
        state_other_ = np.asanyarray(
            self.state_other.reshape(self.agent_count,
                                     hist_size * self.other_input_dim),
            dtype=np.float32)
        state_other_ /= 255.0
        if self.use_gpu >= 0:
            state_cnn_ = cuda.to_gpu(state_cnn_)
            state_other_ = cuda.to_gpu(state_other_)
        return state_cnn_, state_other_

    def agent_start(self, observation, reward):
        """Begin an episode: reset the state history and choose the first action.

        Returns:
            ``(action, q_now)`` as returned by ``q_net.e_greedy``.
        """
        obs_cnn_array = self._observation_to_state_cnn(observation)
        obs_other_array = self._observation_to_state_other(observation)
        hist_size = self.q_net.hist_size

        # Initialize State: zero history with the first observation in the
        # last slot.
        self.state_cnn = np.zeros(
            (self.agent_count, hist_size, 128, 128, 3), dtype=np.uint8)
        self.state_other = np.zeros(
            (self.agent_count, hist_size, self.other_input_dim),
            dtype=np.uint8)
        for i in range(self.agent_count):
            self.state_cnn[i][hist_size - 1] = obs_cnn_array[i]
            self.state_other[i][hist_size - 1] = obs_other_array[i]

        state_cnn_, state_other_ = self._network_inputs()

        # With a frozen policy the stored epsilon is used unchanged.
        eps = self.epsilon
        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration <= self.time:
                self.epsilon -= self.epsilon_delta
                if self.epsilon < self.min_eps:
                    self.epsilon = self.min_eps
                eps = self.epsilon
                print(("\naTraining Now. Time step : %d Epsilon : %.6f"
                       % (self.time, eps)))
            else:  # Initial Exploration Phase
                eps = 1.0
                print(("\naInitial Exploration S : %d/%d Epsilon : %.6f"
                       % (self.time, self.q_net.initial_exploration, eps)))

        # Generate an Action e-greedy. Pass the epsilon that was just
        # computed and reported (not self.epsilon) so the first step of an
        # episode follows the same schedule as agent_step.
        action, q_now = self.q_net.e_greedy(state_cnn_, state_other_, eps,
                                            reward)

        # Update for next step
        self.last_action = action.copy()
        self.last_state_cnn = self.state_cnn.copy()
        self.last_state_other = self.state_other.copy()

        del state_cnn_, state_other_, obs_cnn_array, obs_other_array
        gc.collect()

        self.time += 1
        return action, q_now

    def agent_step(self, reward, observation):
        """Append the new observation to the history and choose an action.

        Returns:
            ``(action, eps, q_now)`` where ``eps`` is the exploration rate used.
        """
        obs_cnn_array = self._observation_to_state_cnn(observation)
        obs_other_array = self._observation_to_state_other(observation)

        # Compose State: drop the oldest step, append the newest.
        if self.q_net.hist_size in (1, 2, 4):
            has_other = obs_other_array.size != 0
            for i in range(self.agent_count):
                self.state_cnn[i] = np.concatenate(
                    (self.state_cnn[i][1:], obs_cnn_array[i][np.newaxis]))
                if has_other:
                    self.state_other[i] = np.concatenate(
                        (self.state_other[i][1:],
                         obs_other_array[i][np.newaxis]))
        else:
            # Unsupported history length: report once per agent and keep
            # the previous state.
            for _ in range(self.agent_count):
                print("self.DQN.hist_size err")

        state_cnn_, state_other_ = self._network_inputs()

        # Exploration decays along the time sequence
        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration <= self.time:
                self.epsilon -= self.epsilon_delta
                if self.epsilon < self.min_eps:
                    self.epsilon = self.min_eps
                eps = self.epsilon
                print(("\nbTraining Now. Time step : %d Epsilon : %.6f"
                       % (self.time, eps)))
            else:  # Initial Exploration Phase
                eps = 1.0
                print(("\nInitial Exploration : %d/%d Epsilon : %.6f"
                       % (self.time, self.q_net.initial_exploration, eps)))
        else:  # Evaluation
            eps = 0.05
            print(("\nPolicy is Frozen. Time step : %d Epsilon : %.6f"
                   % (self.time, eps)))

        # Generate an Action by e-greedy action selection
        action, q_now = self.q_net.e_greedy(state_cnn_, state_other_, eps,
                                            reward)

        del state_cnn_, state_other_, obs_cnn_array, obs_other_array
        gc.collect()

        return action, eps, q_now

    def agent_step_update(self, reward, action, eps):
        """Store the last transition, run experience replay and advance time.

        Does nothing when the policy is frozen.
        """
        if self.policy_frozen is False:  # Learning ON/OFF
            # Learning Phase
            self.q_net.stock_experience(
                self.time, self.last_state_cnn, self.last_state_other,
                self.last_action, reward, self.state_cnn, self.state_other,
                False)
            self.q_net.experience_replay(self.time)

            # Remember the current step for the next transition.
            self.last_action = action.copy()
            self.last_state_cnn = self.state_cnn.copy()
            self.last_state_other = self.state_other.copy()
            self.time += 1

    def agent_end(self, reward):
        """Finish an episode: log rewards, learn from the terminal transition
        and append the average episode loss to ``self.avgloss_log_file``.

        Args:
            reward: per-agent rewards, indexable by agent number.
        """
        # Episode Terminated
        print(('episode finished. Time step : %d' % self.time))

        print(("agent"), end=' ')
        for i in range(self.agent_count):
            print(("[%02d] ( )reward(%06.2f)" % (i, reward[i])), end=' ')
            if i % 5 == 4:  # line break after every fifth agent
                print(("\n "), end=' ')

        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            # Terminal transition: the last state doubles as the next state.
            self.q_net.stock_experience(
                self.time, self.last_state_cnn, self.last_state_other,
                self.last_action, reward, self.last_state_cnn,
                self.last_state_other, True)
            self.q_net.experience_replay(self.time)

        avg_episode_loss = 0
        if self.q_net.time_of_episode != 0:
            avg_episode_loss = (self.q_net.loss_per_episode
                                / self.q_net.time_of_episode)
        self.q_net.loss_per_episode = 0
        self.q_net.time_of_episode = 0

        with open(self.avgloss_log_file, 'a') as the_file:
            the_file.write(str(self.time) + "," + str(avg_episode_loss) + "\n")

        # Time count (advanced even when the policy is frozen)
        self.time += 1
class CnnDqnAgent(object):
    """DQN agent whose state is built from CNN image features plus depth data."""

    policy_frozen = False            # False: learning on; otherwise evaluation only
    epsilon_delta = 1.0 / 10 ** 4.4  # epsilon decrease per step once training starts
    min_eps = 0.1                    # lower bound for epsilon
    actions = [0, 1, 2]
    cnn_feature_extractor = 'alexnet_feature_extractor.pickle'  # cache file path
    model = 'bvlc_alexnet.caffemodel'
    model_type = 'alexnet'
    image_feature_dim = 256 * 6 * 6
    image_feature_count = 1          # number of camera images (1 or 4 supported)

    def _observation_to_featurevec(self, observation):
        """Concatenate the image features and depth values of an observation.

        Returns:
            A 1-D array: all image feature vectors followed by all depth
            entries. For an unsupported ``image_feature_count`` a message is
            printed and ``None`` is returned.
        """
        # TODO clean
        if self.image_feature_count == 1:
            return np.r_[self.feature_extractor.feature(observation["image"][0]),
                         observation["depth"][0]]
        elif self.image_feature_count == 4:
            return np.r_[self.feature_extractor.feature(observation["image"][0]),
                         self.feature_extractor.feature(observation["image"][1]),
                         self.feature_extractor.feature(observation["image"][2]),
                         self.feature_extractor.feature(observation["image"][3]),
                         observation["depth"][0],
                         observation["depth"][1],
                         observation["depth"][2],
                         observation["depth"][3]]
        else:
            print("not supported: number of camera")

    def agent_init(self, **options):
        """Configure the agent, obtain the feature extractor and build the Q-network.

        The feature extractor is loaded from ``self.cnn_feature_extractor``
        when that file exists; otherwise it is built and pickled there.

        Args:
            **options: must contain ``'use_gpu'`` and ``'depth_image_dim'``.

        Raises:
            KeyError: if a required option is missing.
        """
        self.use_gpu = options['use_gpu']
        self.depth_image_dim = options['depth_image_dim']
        self.q_net_input_dim = (self.image_feature_dim * self.image_feature_count
                                + self.depth_image_dim)

        if os.path.exists(self.cnn_feature_extractor):
            # Trailing comma kept: it suppresses the newline under a
            # Python 2 print statement and is a no-op on Python 3.
            print("loading... " + self.cnn_feature_extractor),
            # Pickle data is binary: text mode fails on Python 3.
            # NOTE(review): pickle executes arbitrary code on load; only use
            # a cache file that this program wrote itself.
            with open(self.cnn_feature_extractor, 'rb') as f:
                self.feature_extractor = pickle.load(f)
            print("done")
        else:
            self.feature_extractor = CnnFeatureExtractor(
                self.use_gpu, self.model, self.model_type,
                self.image_feature_dim)
            with open(self.cnn_feature_extractor, 'wb') as f:
                pickle.dump(self.feature_extractor, f)
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)

    def agent_start(self, observation):
        """Begin an episode: reset the state history and choose the first action."""
        obs_array = self._observation_to_featurevec(observation)

        # Initialize State
        self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim),
                              dtype=np.uint8)
        self.state[0] = obs_array
        state_ = np.asanyarray(
            self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
            dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Generate an Action e-greedy
        action, q_now = self.q_net.e_greedy(state_, self.epsilon)
        return_action = action

        # Update for next step
        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state.copy()
        self.last_observation = obs_array

        return return_action

    def agent_step(self, reward, observation):
        """Append the new observation to the history and choose an action.

        Returns:
            ``(action, eps, q_now, obs_array)``.
        """
        obs_array = self._observation_to_featurevec(observation)

        # Compose State : 4-step sequential observation
        if self.q_net.hist_size == 4:
            self.state = np.asanyarray(
                [self.state[1], self.state[2], self.state[3], obs_array],
                dtype=np.uint8)
        elif self.q_net.hist_size == 2:
            self.state = np.asanyarray([self.state[1], obs_array],
                                       dtype=np.uint8)
        elif self.q_net.hist_size == 1:
            self.state = np.asanyarray([obs_array], dtype=np.uint8)
        else:
            print("self.DQN.hist_size err")

        state_ = np.asanyarray(
            self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
            dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Exploration decays along the time sequence
        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration < self.time:
                self.epsilon -= self.epsilon_delta
                if self.epsilon < self.min_eps:
                    self.epsilon = self.min_eps
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print("Initial Exploration : %d/%d steps"
                      % (self.time, self.q_net.initial_exploration)),
                eps = 1.0
        else:  # Evaluation
            print("Policy is Frozen")
            eps = 0.05

        # Generate an Action by e-greedy action selection
        action, q_now = self.q_net.e_greedy(state_, eps)

        return action, eps, q_now, obs_array

    def agent_step_update(self, reward, action, eps, q_now, obs_array):
        """Learn from the last transition, log the step and advance time."""
        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state,
                                        self.last_action, reward, self.state,
                                        False)
            self.q_net.experience_replay(self.time)

        # Target model update
        if (self.q_net.initial_exploration < self.time
                and np.mod(self.time, self.q_net.target_model_update_freq) == 0):
            print("Model Updated")
            self.q_net.target_model_update()

        # Simple text based visualization
        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        print('Step:%d Action:%d Reward:%.1f Epsilon:%.6f Q_max:%3f' % (
            self.time, self.q_net.action_to_index(action), reward, eps, q_max))

        # Updates for next step
        self.last_observation = obs_array

        if self.policy_frozen is False:
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            self.time += 1

    def agent_end(self, reward):
        """Finish an episode and learn from the terminal transition."""
        # Episode Terminated
        print('episode finished. Reward:%.1f / Epsilon:%.6f'
              % (reward, self.epsilon))

        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            # Terminal transition: the last state doubles as the next state.
            self.q_net.stock_experience(self.time, self.last_state,
                                        self.last_action, reward,
                                        self.last_state, True)
            self.q_net.experience_replay(self.time)

        # Target model update
        if (self.q_net.initial_exploration < self.time
                and np.mod(self.time, self.q_net.target_model_update_freq) == 0):
            print("Model Updated")
            self.q_net.target_model_update()

        # Time count
        if self.policy_frozen is False:
            self.time += 1
class DqnAgent():
    """DQN agent with an online network, a target network and a replay buffer."""

    def __init__(self, obs_dims, act_dim, lr=1e-3, gamma=0.99,
                 replay_buffer_size=10000, batch_size=64, epsilon_min=0.01,
                 epsilon_dec=5e-5, target_update_frequency=64):
        """Create the replay buffer, both Q-networks and the optimizer.

        Args:
            obs_dims: observation dimensions, passed to ``ReplayBuffer`` and ``QNet``.
            act_dim: number of discrete actions.
            lr: Adam learning rate for the online network.
            gamma: discount factor.
            replay_buffer_size: capacity passed to ``ReplayBuffer``.
            batch_size: number of transitions sampled per learning step.
            epsilon_min: lower bound for the exploration rate.
            epsilon_dec: epsilon decrease applied after each learning step.
            target_update_frequency: learning steps between target syncs.
        """
        self.buffer = ReplayBuffer(replay_buffer_size, obs_dims)
        self.batch_size = batch_size

        self.q_eval = QNet(obs_dims, act_dim)
        self.q_target = QNet(obs_dims, act_dim)
        # Start the target network from the online network's weights.
        self.q_target.load_state_dict(self.q_eval.state_dict())

        self.obs_dims = obs_dims
        self.act_dim = act_dim

        self.learn_ctr = 0
        self.target_update_frequency = target_update_frequency

        self.gamma = gamma
        self.epsilon = 1
        self.epsilon_min = epsilon_min
        self.epsilon_dec = epsilon_dec

        # Only the online network is optimised.
        self.optimizer = torch.optim.Adam(self.q_eval.parameters(), lr=lr)
        self.loss_fn = torch.nn.MSELoss()

    def update_target(self):
        """Copy the online weights into the target network every
        ``target_update_frequency`` learning steps."""
        if self.learn_ctr % self.target_update_frequency == 0:
            self.q_target.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        """Lower epsilon by ``epsilon_dec`` without going below ``epsilon_min``."""
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min,
                               self.epsilon - self.epsilon_dec)

    def choose_action(self, obs):
        """Return a random action with probability epsilon, else the greedy one."""
        if np.random.sample() < self.epsilon:
            return np.random.randint(self.act_dim)
        obs = torch.tensor(np.expand_dims(obs, axis=0), dtype=torch.float)
        return torch.argmax(self.q_eval(obs)).item()

    def store_transition(self, obs, act, rew, _obs, done):
        """Push one transition into the replay buffer."""
        self.buffer.push(obs, act, rew, _obs, done)

    def sample_replay_buffer(self):
        """Sample ``batch_size`` transitions from the replay buffer."""
        return self.buffer.sample(self.batch_size)

    def learn(self):
        """Run one gradient step on a sampled batch, then update the target
        network (on schedule) and decay epsilon."""
        self.optimizer.zero_grad()

        obs, act, rew, _obs, done = self.sample_replay_buffer()
        obs = torch.tensor(obs, dtype=torch.float)
        act = torch.tensor(act, dtype=torch.long)
        # Rewards and done flags take part in float arithmetic; an integer
        # cast would truncate fractional rewards.
        rew = torch.tensor(rew, dtype=torch.float)
        _obs = torch.tensor(_obs, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)
        idxs = torch.arange(self.batch_size, dtype=torch.long)

        # Q-value of the action actually taken in each sampled transition.
        q_pred = self.q_eval(obs)[idxs, act]
        # The bootstrap target is a constant: no gradient through q_target.
        with torch.no_grad():
            q_next = self.q_target(_obs).max(dim=1)[0]
            q_target = rew + (1 - done) * self.gamma * q_next

        loss = self.loss_fn(q_pred, q_target)
        loss.backward()
        self.optimizer.step()

        # Count the step so the target sync really happens only every
        # ``target_update_frequency`` steps.
        self.learn_ctr += 1
        self.update_target()
        self.decrement_epsilon()
class Agent(object):
    """Q-learning agent that edits graphs through an environment object.

    NOTE(review): the original indentation of this class was lost; the
    nesting below (especially in run_simulation) is reconstructed and
    should be confirmed against the author's version.
    """

    def __init__(self, g_list, test_g_list, env):
        """Set up the networks, optimizer, epsilon settings and sampling order.

        Args:
            g_list: training graphs.
            test_g_list: evaluation graphs; ``None`` means reuse ``g_list``.
            env: environment object stored on the agent.
        """
        self.g_list = g_list
        if test_g_list is None:
            self.test_g_list = g_list
        else:
            self.test_g_list = test_g_list
        self.env = env
        # Online network and the snapshot ("old") network.
        self.net = QNet()
        self.old_net = QNet()
        self.optimizer = optim.Adam(self.net.parameters(), lr=cmd_args.learning_rate)
        if cmd_args.ctx == 'gpu':
            self.net = self.net.cuda()
            self.old_net = self.old_net.cuda()
        # eps_start equals eps_end, so the epsilon computed in make_actions
        # is constant at 1.0.
        self.eps_start = 1.0
        self.eps_end = 1.0
        self.eps_step = 10000
        # NOTE(review): presumably the number of initial simulation
        # iterations used to fill memory; nothing in this class reads it.
        self.burn_in = 100
        self.step = 0
        self.best_eval = None
        self.pos = 0
        # Shuffled visiting order over the training graphs.
        self.sample_idxes = list(range(len(g_list)))
        random.shuffle(self.sample_idxes)
        self.take_snapshot()

    def take_snapshot(self):
        """Copy the online network's weights into ``old_net``."""
        self.old_net.load_state_dict(self.net.state_dict())

    # type = 0 for add, 1 for subtract
    def make_actions(self, greedy=True, _type=0):
        """Query the network for actions on the environment's current state.

        Args:
            greedy: unused; the network is always called with
                ``greedy_acts=True``.
            _type: action type forwarded to the network (0 add, 1 subtract).

        Returns:
            ``(actions, q_vals)`` where ``q_vals[i]`` is the entry of
            ``q_arrs[i]`` selected by ``actions[i]``.
        """
        # Linearly interpolated epsilon; stored on self but not used for
        # action selection in this method.
        self.eps = self.eps_end + max(
            0., (self.eps_start - self.eps_end)
            * (self.eps_step - max(0., self.step)) / self.eps_step)
        cur_state = self.env.getStateRef()
        actions, q_arrs = self.net(cur_state, None, greedy_acts=True, _type=_type)
        q_vals = []
        for i in range(len(q_arrs)):
            tmp = q_arrs[i].numpy()
            tmp = tmp[actions[i]][0]
            q_vals.append(tmp)
        return actions, q_vals

    def run_simulation(self):
        """Run one episode of GLOBAL_EPISODE_STEPS steps with online updates.

        Returns:
            List of per-step average rewards (only for steps where
            ``env.first_nodes`` was set).
        """
        # NOTE(review): ``g_list`` here (and below) is a module-level name,
        # not ``self.g_list`` - confirm that this is intended.
        self.env.setup(g_list)
        avg_rewards = []
        t_a, t_s = 0, 0  # unused
        for asdf in range(GLOBAL_EPISODE_STEPS):
            if asdf % 2 == 0:
                assert self.env.first_nodes == None
            for i in range(len(self.g_list)):
                g = self.g_list[i].to_networkx()
                # Nodes that appear in at least one edge.
                con_nodes = list(set(list(sum(g.edges, ()))))
                # Attach every isolated node (ids 0..19) to a random node.
                for j in range(20):
                    if (j not in con_nodes):
                        rand_num = np.random.randint(0, 20)
                        g.add_edge(j, rand_num)
                        self.env.added_edges.append((j, rand_num))
                        self.g_list[i] = S2VGraph(g, label=self.g_list[i].label)
            # Action type per step follows the pattern 0, 0, 1, 1, ...
            action_type = (asdf % 4) // 2
            # get Actions
            list_at, _ = self.make_actions(_type=action_type)
            # save State
            list_st = self.env.cloneState()  # unused
            cur_state = self.env.getStateRef()
            _, predicted_Q = self.net(cur_state, None, greedy_acts=False, _type=action_type)
            # get Rewards
            if self.env.first_nodes is not None:
                rewards = self.env.get_rewards(list_at, _type=action_type)
                avg_rewards.append(sum(rewards) / len(rewards))
            else:
                rewards = [0] * len(g_list)
            # Update graph to get S'
            self.env.step(list_at, _type=action_type)
            # get next state
            # NOTE(review): ``env`` is a module-level name, not ``self.env``.
            if env.isTerminal():
                s_prime = None
            else:
                s_prime = self.env.cloneState()
            # get S'and A' values
            # NOTE(review): the bare except silently skips the update for
            # this step on any error, including KeyboardInterrupt.
            try:
                sprime_at, q_primes = self.make_actions(_type=action_type)
            except:
                continue
            # Calculate Q(S', A') (no discount factor is applied)
            actual_Q = torch.Tensor(rewards) + torch.Tensor(q_primes)
            # Pass loss to network
            loss = F.mse_loss(predicted_Q, actual_Q)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        return avg_rewards

    def train(self):
        """Run GLOBAL_NUM_STEPS simulations, then plot and save the 4-point
        moving average of the collected average rewards to ``Results.png``."""
        # set up progress bar
        pbar = tqdm(range(GLOBAL_NUM_STEPS), unit='steps')
        avgs = []
        # for each iteration (the loop variable is stored as self.step,
        # which make_actions reads)
        for self.step in pbar:
            # run simulation
            # side effects?
            avgs += self.run_simulation()
            # print("tmp: ", tmp)
            # avg_reward_step.append(sum(tmp)/len(tmp))
            # plt.plot(tmp)
            # plt.show()
            # plt.savefig('test.png')
        print("avgs: ", avgs)
        mov_avg = np.convolve(np.array(avgs), np.ones(4), 'valid') / 4
        print("mov avg: ", list(mov_avg))
        print(type(mov_avg))
        print(mov_avg.shape)
        plt.clf()
        plt.plot(list(mov_avg))
        plt.title('running average of average rewards')
        plt.savefig("Results.png")
        plt.show()
class CnnDqnAgent(object):
    """DQN agent over CNN image features plus depth data.

    ``q_net.e_greedy`` additionally returns a degree-of-interest value,
    which this agent passes through to its callers.
    """

    def __init__(self):
        super(CnnDqnAgent, self).__init__()
        self.policy_frozen = False            # False: learning on
        self.epsilon_delta = 1.0 / 10**4.4    # epsilon decrease per step
        self.min_eps = 0.1                    # lower bound for epsilon
        self.actions = [0, 1, 2]
        self.cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
        self.model = 'bvlc_alexnet.caffemodel'
        self.model_type = 'alexnet'
        self.image_feature_dim = 256 * 6 * 6
        self.image_feature_count = 1
        self.prediction_update_tick = 0       # steps since last prediction update

    def _observation_to_featurevec(self, observation):
        """Concatenate the image features and the depth entries of an observation."""
        feature_image = [
            self.feature_extractor(observation["image"][i])
            for i in range(self.image_feature_count)
        ]
        # observation["depth"] must be a list: it is appended to the list
        # of feature vectors before concatenation.
        return np.concatenate(feature_image + observation["depth"])

    def agent_init(self, **options):
        """Configure the agent, obtain the feature extractor and build the Q-network.

        The feature extractor is loaded from ``self.cnn_feature_extractor``
        when that file exists; otherwise it is built and pickled there.

        Args:
            **options: must contain ``'use_gpu'`` and ``'depth_image_dim'``.

        Raises:
            KeyError: if a required option is missing.
        """
        self.use_gpu = options['use_gpu']
        self.depth_image_dim = options['depth_image_dim']
        self.q_net_input_dim = (self.image_feature_dim * self.image_feature_count
                                + self.depth_image_dim)

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor)
            # NOTE(review): pickle executes arbitrary code on load; only use
            # a cache file that this program wrote itself.
            with open(self.cnn_feature_extractor, 'rb') as f:
                self.feature_extractor = pickle.load(f)
            print("done")
        else:
            print('there is no chainer alexnet model file ',
                  self.cnn_feature_extractor)
            print('making chainer model from ', self.model)
            print('this process take a tens of minutes.')
            self.feature_extractor = CnnFeatureExtractor(
                self.use_gpu, self.model, self.model_type,
                self.image_feature_dim)
            # Context manager guarantees the cache file is flushed and closed.
            with open(self.cnn_feature_extractor, 'wb') as f:
                pickle.dump(self.feature_extractor, f)
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)

    def agent_start(self, observation):
        """Begin an episode: reset the state history and choose the first action.

        Returns:
            ``(action, deg_intereset)``.
        """
        # Initialize State: newest observation lives in row 0.
        self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim),
                              dtype=np.float32)
        new_feature_vec = self._observation_to_featurevec(observation)
        self.state[0, :] = new_feature_vec

        # Generate an Action e-greedy
        state_ = np.expand_dims(self.state, 0)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_, device=self.use_gpu)
        action, _, deg_intereset = self.q_net.e_greedy(state_, self.epsilon)
        return_action = action

        # Update for next step
        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state.copy()
        self.last_observation = new_feature_vec

        return return_action, deg_intereset

    def agent_step(self, reward, observation):
        """Push the new observation into the history and choose an action.

        Returns:
            ``(action, eps, q_now, new_feature_vec, deg_intereset)``.
        """
        new_feature_vec = self._observation_to_featurevec(observation)

        # Shift the history down by one row BEFORE writing the new row 0.
        # The shifted rows are copied first: a plain slice is a view, and
        # writing row 0 through it would duplicate the new observation and
        # lose the previous one.
        self.state[1:, :] = self.state[:-1, :].copy()
        self.state[0, :] = new_feature_vec

        state_ = np.expand_dims(self.state, 0)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_, device=self.use_gpu)

        # Exploration decays along the time sequence
        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration < self.time:
                self.epsilon -= self.epsilon_delta
                if self.epsilon < self.min_eps:
                    self.epsilon = self.min_eps
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print("Initial Exploration : %d/%d steps"
                      % (self.time, self.q_net.initial_exploration)),
                eps = 1.0
        else:  # Evaluation
            print("Policy is Frozen")
            eps = 0.05

        # Generate an Action by e-greedy action selection
        action, q_now, deg_intereset = self.q_net.e_greedy(state_, eps)

        return action, eps, q_now, new_feature_vec, deg_intereset

    def agent_step_update(self, reward, action, eps, q_now, new_feature_vec,
                          deg_intereset):
        """Learn from the last transition, log the step and advance time."""
        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state,
                                        self.last_action, reward, self.state,
                                        False)
            self.q_net.experience_replay(self.time)

            # Refresh the prediction model every 10 learning steps.
            self.prediction_update_tick += 1
            if self.prediction_update_tick >= 10:
                self.prediction_update_tick = 0
                print('prediction update')
                self.q_net.prediction_update()

        # Target model update
        if (self.q_net.initial_exploration < self.time
                and np.mod(self.time, self.q_net.target_model_update_freq) == 0):
            print("Model Updated")
            self.q_net.target_model_update()

        # Simple text based visualization
        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        print(
            'Step:%d Action:%d Reward:%.1f Epsilon:%.6f Q_max:%3f def_interest:%3f'
            % (self.time, self.q_net.action_to_index(action), reward, eps,
               q_max, deg_intereset))

        # Updates for next step
        self.last_observation = new_feature_vec

        if self.policy_frozen is False:
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            self.time += 1

    def agent_end(self, reward):
        """Finish an episode and learn from the terminal transition."""
        # Episode Terminated
        print('episode finished. Reward:%.1f / Epsilon:%.6f'
              % (reward, self.epsilon))

        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            # Terminal transition: the last state doubles as the next state.
            self.q_net.stock_experience(self.time, self.last_state,
                                        self.last_action, reward,
                                        self.last_state, True)
            self.q_net.experience_replay(self.time)

        # Target model update
        if (self.q_net.initial_exploration < self.time
                and np.mod(self.time, self.q_net.target_model_update_freq) == 0):
            print("Model Updated")
            self.q_net.target_model_update()

        # Time count
        if self.policy_frozen is False:
            self.time += 1
class CnnDqnAgent(object):
    """DQN agent whose observations are CNN features of the camera images.

    Intended call order (inferred from the state each method reads):
    ``agent_init`` once, then per episode ``agent_start``, repeated
    ``agent_step`` / ``agent_step_update`` pairs, and ``agent_end``.
    """

    policy_frozen = False  # True switches learning off (evaluation only)
    # Amount subtracted from epsilon on every step once the initial
    # exploration phase is over (about 3.98e-5).
    epsilon_delta = 1.0 / 10 ** 4.4
    min_eps = 0.1  # lower bound of the decayed exploration rate
    actions = [0, 1, 2]
    cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
    model = 'bvlc_alexnet.caffemodel'
    model_type = 'alexnet'
    image_feature_dim = 256 * 6 * 6
    image_feature_count = 1
    # Class-level default kept for backward compatibility; agent_init()
    # replaces it with a per-instance deque so agents do not share history.
    actions_evaluate = deque(maxlen=4)

    def _observation_to_featurevec(self, observation):
        """Concatenate the CNN features of the camera images.

        Args:
            observation: mapping whose ``"image"`` entry is a sequence of
                camera images.

        Returns:
            The concatenated feature vector, or ``None`` (after printing a
            message) when ``image_feature_count`` is neither 1 nor 4.
        """
        if self.image_feature_count not in (1, 4):
            print("not supported: number of camera")
            return None
        images = observation["image"]
        features = [self.feature_extractor.feature(images[i])
                    for i in range(self.image_feature_count)]
        return np.r_[tuple(features)]

    def _load_or_build_feature_extractor(self):
        """Return the cached feature extractor, building and caching it if absent."""
        path = self.cnn_feature_extractor
        if os.path.exists(path):
            # Trailing comma: Python 2 newline suppression, harmless on 3.
            print("loading... " + path),
            # Pickles are binary data: open in 'rb' and close the handle.
            # NOTE: unpickling runs arbitrary code; only load trusted files.
            with open(path, 'rb') as cache:
                extractor = pickle.load(cache)
            print("done")
        else:
            extractor = CnnFeatureExtractor(
                self.use_gpu, self.model, self.model_type,
                self.image_feature_dim)
            with open(path, 'wb') as cache:
                pickle.dump(extractor, cache)
            print("pickle.dump finished")
        return extractor

    def _network_input(self):
        """Return ``self.state`` as a (1, hist_size, input_dim) float32 batch."""
        batch = np.asanyarray(
            self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
            dtype=np.float32)
        if self.use_gpu >= 0:
            batch = cuda.to_gpu(batch)
        return batch

    def _exploration_rate(self):
        """Return epsilon for this step, decaying ``self.epsilon`` when learning."""
        if self.policy_frozen is not False:  # evaluation
            print("Policy is Frozen")
            return 0.05
        if self.q_net.initial_exploration < self.time:
            self.epsilon = max(self.epsilon - self.epsilon_delta, self.min_eps)
            return self.epsilon
        # Initial exploration phase: act fully at random.
        print("Initial Exploration : %d/%d steps"
              % (self.time, self.q_net.initial_exploration)),
        return 1.0

    def _update_target_if_due(self):
        """Sync the target network every ``target_model_update_freq`` steps."""
        if (self.q_net.initial_exploration < self.time
                and np.mod(self.time, self.q_net.target_model_update_freq) == 0):
            print("Model Updated")
            self.q_net.target_model_update()

    def _penalize_oscillation(self, reward):
        """Subtract 0.5 from ``q_net.d[2]`` for the last four experiences.

        Applies only when the four most recent actions are 1, 0, 1 followed
        by the index of action 2, and ``reward`` is at least 1.0.
        ``q_net.d[2]`` is presumably the stored-reward column of the replay
        memory -- confirm against QNet.
        """
        recent = list(self.actions_evaluate)
        if len(recent) != 4:
            return
        if recent[-1] != self.actions.index(2) or not reward >= 1.0:
            return
        # The original compared against ``([1, 0, 1] or [1, 0, 1])``, which
        # is just ``[1, 0, 1]``.
        if recent[:3] != [1, 0, 1]:
            return
        # A plain int: the previous np.int8 cast wrapped any index above 127
        # and therefore penalised unrelated experiences.
        index = int(self.q_net.data_index)
        for offset in range(1, len(recent) + 1):
            self.q_net.d[2][index - offset] -= 0.5

    def agent_init(self, **options):
        """Configure the agent.

        Args:
            **options: must contain ``use_gpu`` (GPU device id; negative
                values keep the data on the CPU).
        """
        self.use_gpu = options['use_gpu']
        self.q_net_input_dim = self.image_feature_dim * self.image_feature_count
        self.feature_extractor = self._load_or_build_feature_extractor()
        self.actions_evaluate = deque(maxlen=4)

        self.time = 0
        self.epsilon = 1.0  # initial exploration rate
        self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)

    def agent_start(self, observation):
        """Begin an episode and return the first action."""
        obs_array = self._observation_to_featurevec(observation)

        # Fresh history with the first observation in slot 0.
        self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim),
                              dtype=np.uint8)
        self.state[0] = obs_array

        action, q_now = self.q_net.e_greedy(self._network_input(), self.epsilon)
        # Debug output, identical to the former ``print action, type(action)``.
        print("%s %s" % (action, type(action)))

        # Remember what the next learning step needs.
        self.last_action = copy.deepcopy(action)
        self.last_state = self.state.copy()
        self.last_observation = obs_array
        return action

    def agent_step(self, reward, observation):
        """Advance the state history and choose the next action.

        Returns:
            ``(action, eps, q_now, obs_array)``, to be passed on to
            ``agent_step_update`` together with the reward.
        """
        obs_array = self._observation_to_featurevec(observation)

        # Drop the oldest entry and append the new observation. Works for any
        # hist_size >= 1 (previously only 1, 2 and 4 were handled).
        if self.q_net.hist_size < 1:
            print("self.DQN.hist_size err")
        else:
            history = list(self.state[1:]) + [obs_array]
            self.state = np.asanyarray(history, dtype=np.uint8)

        state_ = self._network_input()
        eps = self._exploration_rate()

        action, q_now = self.q_net.e_greedy(state_, eps)
        return action, eps, q_now, obs_array

    def agent_step_update(self, reward, action, eps, q_now, obs_array):
        """Store the transition, learn from it and log the step."""
        learning = self.policy_frozen is False
        if learning:
            self.q_net.stock_experience(self.time, self.last_state,
                                        self.last_action, reward,
                                        self.state, False)
            print("------------------- Real Index%d" % self.q_net.data_index)
            self.actions_evaluate.append(self.last_action)
            self._penalize_oscillation(reward)
            self.q_net.experience_replay(self.time)

        self._update_target_if_due()

        # Simple text based visualisation.
        q_values = q_now.get() if self.use_gpu >= 0 else q_now
        q_max = np.max(q_values)
        print('Step:%d  Action:%d  Reward:%.1f  Epsilon:%.6f  Q_max:%3f' % (
            self.time, self.q_net.action_to_index(action), reward, eps, q_max))

        # Updates for the next step.
        self.last_observation = obs_array
        if learning:
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            self.time += 1

    def agent_end(self, reward):
        """Finish the episode, storing the terminal transition when learning."""
        print('episode finished. Reward:%.1f / Epsilon:%.6f'
              % (reward, self.epsilon))

        learning = self.policy_frozen is False
        if learning:
            self.q_net.stock_experience(self.time, self.last_state,
                                        self.last_action, reward,
                                        self.last_state, True)
            self.q_net.experience_replay(self.time)

        self._update_target_if_due()

        if learning:
            self.time += 1
class CnnDqnAgent(object):
    """DQN agent using CNN image features plus depth, with an interest signal.

    Intended call order (inferred from the state each method reads):
    ``agent_init`` once, then per episode ``agent_start``, repeated
    ``agent_step`` / ``agent_step_update`` pairs, and ``agent_end``.
    """

    policy_frozen = False  # True switches learning off (evaluation only)
    # Amount subtracted from epsilon per step after initial exploration.
    epsilon_delta = 1.0 / 10**4.4
    min_eps = 0.1  # lower bound of the decayed exploration rate
    actions = [0, 1, 2]
    cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
    model = 'bvlc_alexnet.caffemodel'
    model_type = 'alexnet'
    image_feature_dim = 256 * 6 * 6
    image_feature_count = 1

    def _observation_to_featurevec(self, observation):
        """Concatenate image features followed by the depth readings.

        Args:
            observation: mapping with ``"image"`` and ``"depth"`` sequences.

        Returns:
            One vector holding the features of every image and then every
            depth entry, or ``None`` (after printing a message) when
            ``image_feature_count`` is neither 1 nor 4.
        """
        count = self.image_feature_count
        if count not in (1, 4):
            print("not supported: number of camera")
            return None
        parts = [self.feature_extractor.feature(observation["image"][i])
                 for i in range(count)]
        parts.extend(observation["depth"][i] for i in range(count))
        return np.r_[tuple(parts)]

    def _load_or_build_feature_extractor(self):
        """Return the cached feature extractor, building and caching it if absent."""
        path = self.cnn_feature_extractor
        if os.path.exists(path):
            # Trailing comma: Python 2 newline suppression, harmless on 3.
            print("loading... " + path),
            # Pickles are binary data: open in 'rb' and close the handle.
            # NOTE: unpickling runs arbitrary code; only load trusted files.
            with open(path, 'rb') as cache:
                extractor = pickle.load(cache)
            print("done")
        else:
            extractor = CnnFeatureExtractor(
                self.use_gpu, self.model, self.model_type,
                self.image_feature_dim)
            with open(path, 'wb') as cache:
                pickle.dump(extractor, cache)
            print("pickle.dump finished")
        return extractor

    def _as_network_input(self, row):
        """Return one history row as a (1, input_dim) float32 batch."""
        batch = np.asanyarray(row.reshape(1, self.q_net_input_dim),
                              dtype=np.float32)
        if self.use_gpu >= 0:
            batch = cuda.to_gpu(batch)
        return batch

    def _exploration_rate(self):
        """Return epsilon for this step, decaying ``self.epsilon`` when learning."""
        if self.policy_frozen is not False:  # evaluation
            print("Policy is Frozen")
            return 0.05
        if self.q_net.initial_exploration < self.time:
            self.epsilon = max(self.epsilon - self.epsilon_delta, self.min_eps)
            return self.epsilon
        # Initial exploration phase: act fully at random.
        print("Initial Exploration : %d/%d steps"
              % (self.time, self.q_net.initial_exploration)),
        return 1.0

    def _update_target_if_due(self):
        """Sync the target network every ``target_model_update_freq`` steps."""
        if (self.q_net.initial_exploration < self.time
                and np.mod(self.time, self.q_net.target_model_update_freq) == 0):
            print("Model Updated")
            self.q_net.target_model_update()

    def agent_init(self, **options):
        """Configure the agent.

        Args:
            **options: must contain ``use_gpu`` (GPU device id; negative
                values keep the data on the CPU) and ``depth_image_dim``.
        """
        self.use_gpu = options['use_gpu']
        self.depth_image_dim = options['depth_image_dim']

        # NOTE(review): depth_image_dim is added once even when four depth
        # images are concatenated -- confirm it is the total depth size.
        self.q_net_input_dim = (self.image_feature_dim * self.image_feature_count
                                + self.depth_image_dim)
        self.feature_extractor = self._load_or_build_feature_extractor()

        self.time = 0
        self.epsilon = 1.0  # initial exploration rate
        self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)

    def agent_start(self, observation):
        """Begin an episode and return the first action."""
        obs_array = self._observation_to_featurevec(observation)

        # Fresh history. The newest observation lives in the LAST row, which
        # is the row agent_step() feeds to the network.
        self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim),
                              dtype=np.float32)
        self.state[-1] = obs_array
        state_ = self._as_network_input(self.state[-1])

        # Reset the recurrent models at the episode boundary.
        self.q_net.action_model.reset()
        self.q_net.scene_model.reset()

        action, q_now = self.q_net.e_greedy(state_, self.epsilon)

        # Remember what the next learning step needs.
        self.last_action = copy.deepcopy(action)
        self.last_state = self.state.copy()
        self.last_observation = obs_array
        return action

    def agent_step(self, reward, observation):
        """Advance the state history and choose the next action.

        Returns:
            ``(action, eps, q_now, obs_array, interest)``.
        """
        obs_array = self._observation_to_featurevec(observation)
        hist_size = self.q_net.hist_size

        # Append the observation as a new row and keep the newest hist_size
        # rows. The earlier ``np.append(self.state, obs_array)`` discarded
        # its result, so the history never changed.
        history = np.vstack((self.state, obs_array))
        self.state = np.asanyarray(history[len(history) - hist_size:],
                                   dtype=np.float32)

        state_ = self._as_network_input(self.state[hist_size - 1])
        eps = self._exploration_rate()

        # NOTE(review): with hist_size == 1 the index below is -1, i.e. the
        # current observation again -- confirm that is intended.
        last_state_ = self._as_network_input(self.state[hist_size - 2])

        action, q_now, interest = self.q_net.e_greedy_with_interest(
            state_, eps, last_state_)
        print("interest is %f" % interest)

        return action, eps, q_now, obs_array, interest

    def agent_step_update(self, reward, action, eps, q_now, obs_array):
        """Store the transition, learn from it and log the step."""
        learning = self.policy_frozen is False
        if learning:
            self.q_net.stock_experience(self.time, self.last_state,
                                        self.last_action, reward,
                                        self.state, False)
            self.q_net.experience_replay(self.time)

        self._update_target_if_due()

        # Simple text based visualisation.
        q_values = q_now.get() if self.use_gpu >= 0 else q_now
        q_max = np.max(q_values)
        print('Step:%d  Action:%d  Reward:%.1f  Epsilon:%.6f  Q_max:%3f'
              % (self.time, self.q_net.action_to_index(action), reward, eps,
                 q_max))

        # Updates for the next step.
        self.last_observation = obs_array
        if learning:
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            self.time += 1

    def agent_end(self, reward):
        """Finish the episode, storing the terminal transition when learning."""
        print('episode finished. Reward:%.1f / Epsilon:%.6f'
              % (reward, self.epsilon))

        learning = self.policy_frozen is False
        if learning:
            self.q_net.stock_experience(self.time, self.last_state,
                                        self.last_action, reward,
                                        self.last_state, True)
            self.q_net.experience_replay(self.time)

        self._update_target_if_due()

        if learning:
            self.time += 1
def agent_init(self):
    """Build this agent's Q-network from attributes already set on ``self``.

    Reads ``use_gpu``, ``q_net_input_dim`` and ``agent_id`` and stores the
    resulting network as ``self.q_net``.
    """
    network = QNet(
        self.use_gpu,
        self.q_net_input_dim,
        self.agent_id,
    )
    self.q_net = network
class CnnDqnAgent(object):
    """DQN agent using CNN image features plus pad states as observation.

    Intended call order (inferred from the state each method reads):
    ``agent_init`` once, then per episode ``agent_start``, repeated
    ``agent_step`` / ``agent_step_update`` pairs, and ``agent_end``.
    """

    policy_frozen = False  # set to True to stop learning and only run
    # Amount subtracted from epsilon per step after initial exploration.
    epsilon_delta = 1.0 / 10**4.4
    min_eps = 0.1  # lower bound of the decayed exploration rate

    # press, up, down, left, right, none
    num_of_action_type = 6
    num_of_pad = 5

    cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
    model = 'bvlc_alexnet.caffemodel'
    model_type = 'alexnet'
    image_feature_dim = 256 * 6 * 6

    def _load_or_build_feature_extractor(self):
        """Return the cached feature extractor, building and caching it if absent."""
        path = self.cnn_feature_extractor
        if os.path.exists(path):
            # Trailing comma: Python 2 newline suppression, harmless on 3.
            print("loading... " + path),
            # The cache is written with 'wb', so it must be read with 'rb';
            # the handle is now closed as well.
            # NOTE: unpickling runs arbitrary code; only load trusted files.
            with open(path, 'rb') as cache:
                return pickle.load(cache)

        print("pickle.dump start")
        extractor = CnnFeatureExtractor(
            self.use_gpu, self.model, self.model_type, self.image_feature_dim)
        with open(path, 'wb') as cache:
            pickle.dump(extractor, cache)
        print("pickle.dump finished")
        return extractor

    def _observation_to_featurevec(self, observation):
        """Join the image's CNN features with the pad states into one vector."""
        return np.r_[self.feature_extractor.feature(observation["image"]),
                     observation["pad_states"]]

    def _network_input(self):
        """Return ``self.state`` as a (1, hist_size, input_dim) float32 batch."""
        batch = np.asanyarray(
            self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
            dtype=np.float32)
        if self.use_gpu >= 0:
            batch = cuda.to_gpu(batch)
        return batch

    def _exploration_rate(self):
        """Return epsilon for this step, decaying ``self.epsilon`` when learning."""
        if self.policy_frozen is not False:  # evaluation
            print("Policy is Frozen")
            return 0.05
        if self.q_net.initial_exploration < self.time:
            self.epsilon = max(self.epsilon - self.epsilon_delta, self.min_eps)
            return self.epsilon
        # Initial exploration phase: act fully at random.
        print("Initial Exploration : %d/%d steps"
              % (self.time, self.q_net.initial_exploration)),
        return 1.0

    def _update_target_if_due(self):
        """Sync the target network every ``target_model_update_freq`` steps."""
        if (self.q_net.initial_exploration < self.time
                and np.mod(self.time, self.q_net.target_model_update_freq) == 0):
            print("Model Updated")
            self.q_net.target_model_update()

    def agent_init(self, **options):
        """Configure the agent.

        Args:
            **options: must contain ``use_gpu`` (GPU device id; negative
                values keep the data on the CPU) and ``pad_states_dim``.
        """
        self.use_gpu = options['use_gpu']
        self.pad_state_dim = options['pad_states_dim']

        self.q_net_input_dim = self.image_feature_dim + self.pad_state_dim
        self.feature_extractor = self._load_or_build_feature_extractor()

        self.time = 0
        self.epsilon = 1.0  # initial exploration rate
        self.q_net = QNet(self.use_gpu, self.num_of_action_type,
                          self.num_of_pad, self.q_net_input_dim)

    def agent_start(self, observation):
        """Begin an episode and return the first action."""
        obs_array = self._observation_to_featurevec(observation)

        # Fresh history with the first observation in slot 0.
        self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim),
                              dtype=np.uint8)
        self.state[0] = obs_array

        action, q_now = self.q_net.e_greedy(self._network_input(), self.epsilon)

        # Remember what the next learning step needs.
        self.last_action = copy.deepcopy(action)
        self.last_state = self.state.copy()
        self.last_observation = obs_array
        return action

    def agent_step(self, reward, observation):
        """Advance the state history and choose the next action.

        Returns:
            ``(action, eps, q_now, obs_array)``.
        """
        obs_array = self._observation_to_featurevec(observation)

        # Drop the oldest entry and append the new observation. Works for any
        # hist_size >= 1 (previously only 1, 2 and 4 were handled).
        if self.q_net.hist_size < 1:
            print("self.DQN.hist_size err")
        else:
            history = list(self.state[1:]) + [obs_array]
            self.state = np.asanyarray(history, dtype=np.uint8)

        state_ = self._network_input()
        eps = self._exploration_rate()

        action, q_now = self.q_net.e_greedy(state_, eps)
        return action, eps, q_now, obs_array

    def agent_step_update(self, reward, action, eps, q_now, obs_array):
        """Store the transition, learn from it and log the step."""
        learning = self.policy_frozen is False
        if learning:
            self.q_net.stock_experience(self.time, self.last_state,
                                        self.last_action, reward,
                                        self.state, False)
            self.q_net.experience_replay(self.time)

        self._update_target_if_due()

        # Simple text based visualisation.
        q_values = q_now.get() if self.use_gpu >= 0 else q_now
        q_max = np.max(q_values)
        print('Step:%d  Reward:%f  Epsilon:%.6f  Q_max:%3f'
              % (self.time, reward, eps, q_max))
        print('Action: {0}'.format(action))

        # Updates for the next step.
        self.last_observation = obs_array
        if learning:
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            self.time += 1

    def agent_end(self, reward):
        """Finish the episode, storing the terminal transition when learning."""
        print('episode finished. Reward:%.1f / Epsilon:%.6f'
              % (reward, self.epsilon))

        learning = self.policy_frozen is False
        if learning:
            self.q_net.stock_experience(self.time, self.last_state,
                                        self.last_action, reward,
                                        self.last_state, True)
            self.q_net.experience_replay(self.time)

        self._update_target_if_due()

        if learning:
            self.time += 1
class CnnDqnAgent(object):
    """DQN agent with periodic model saving and a test (no-learning) mode.

    Every public method prints the traceback and exits the process when an
    exception escapes its body. Intended call order (inferred from the state
    each method reads): ``agent_init`` once, then per episode
    ``agent_start``, repeated ``agent_step`` / ``agent_step_update`` pairs,
    and ``agent_end``.
    """

    epsilon_delta = 1.0 / 10**4.4  # amount epsilon decreases per step
    min_eps = 0.1  # minimum value of epsilon
    actions = range(3)

    cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
    model = 'bvlc_alexnet.caffemodel'
    model_type = 'alexnet'
    image_feature_dim = 256 * 6 * 6
    image_count = 1

    @staticmethod
    def _abort():
        """Print the active exception's traceback and terminate the process."""
        import sys
        import traceback
        traceback.print_exc()
        # NOTE(review): exits with status 0 even though this is a failure
        # path; kept as in the original.
        sys.exit()

    def _observation_to_featurevec(self, observation):
        """Concatenate image features followed by the depth readings.

        Args:
            observation: mapping with ``"image"`` and ``"depth"`` sequences.

        Returns:
            One vector holding the features of every image and then every
            depth entry, or ``None`` (after printing a message) when
            ``image_count`` is neither 1 nor 4.
        """
        count = self.image_count
        if count not in (1, 4):
            print("not supported: number of camera")
            return None
        parts = [self.feature_extractor.feature(observation["image"][i])
                 for i in range(count)]
        parts.extend(observation["depth"][i] for i in range(count))
        return np.r_[tuple(parts)]

    def _load_or_build_feature_extractor(self):
        """Return the cached feature extractor, building and caching it if absent."""
        path = self.cnn_feature_extractor
        if os.path.exists(path):
            # Trailing comma: Python 2 newline suppression, harmless on 3.
            print("loading... " + path),
            # Pickles are binary data: open in 'rb' and close the handle.
            # NOTE: unpickling runs arbitrary code; only load trusted files.
            with open(path, 'rb') as cache:
                extractor = pickle.load(cache)
            print("done")
        else:
            extractor = CnnFeatureExtractor(
                self.use_gpu, self.model, self.model_type,
                self.image_feature_dim)
            with open(path, 'wb') as cache:
                pickle.dump(extractor, cache)
            print("pickle.dump finished")
        return extractor

    def _network_input(self):
        """Return ``self.state`` as a (1, hist_size, input_dim) float32 batch.

        The Q-function is fed arrays shaped
        (n_samples, hist_size, q_net_input_dim).
        """
        batch = np.asanyarray(
            self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
            dtype=np.float32)
        if self.use_gpu >= 0:
            batch = cuda.to_gpu(batch)
        return batch

    def _update_target_if_due(self):
        """Sync the target network every ``target_model_update_freq`` steps."""
        if (self.q_net.initial_exploration < self.time
                and np.mod(self.time, self.q_net.target_model_update_freq) == 0):
            print("Model Updated")
            self.q_net.target_model_update()

    def _save_model_if_due(self):
        """Save the model every ``save_model_freq`` steps after exploration."""
        if (self.q_net.initial_exploration < self.time
                and np.mod(self.time, self.q_net.save_model_freq) == 0):
            print("------------------Save Model------------------")
            self.q_net.save_model(self.folder, self.time)

    def agent_init(self, **options):
        """Configure the agent and optionally restore a saved model.

        Args:
            **options: must contain ``image_count``, ``depth_image_dim``,
                ``use_gpu``, ``test``, ``folder`` and ``model_num``.
        """
        try:
            self.image_count = options['image_count']
            self.depth_image_dim = options['depth_image_dim']
            self.use_gpu = options['use_gpu']
            self.test = options['test']
            self.folder = options["folder"]  # kept on self: save_model needs it
            model_num = options['model_num']

            self.q_net_input_dim = (self.image_feature_dim * self.image_count
                                    + self.depth_image_dim * self.image_count)
            self.feature_extractor = self._load_or_build_feature_extractor()
            self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)

            # +1 so a save is not triggered on the very step that was loaded.
            self.time = model_num + 1
            if self.test:
                self.epsilon = 0.0
            else:
                # Resume the epsilon schedule where the saved model left off.
                non_exploration = max(
                    self.time - self.q_net.initial_exploration, 0)
                self.epsilon = max(1.0 - non_exploration * self.epsilon_delta,
                                   self.min_eps)
            # Same text as the former ``print "epsilon = ", self.epsilon``.
            print("epsilon =  %s" % self.epsilon)

            if self.test or model_num > 0:
                self.q_net.load_model(self.folder, model_num)
        except Exception:
            self._abort()

    # Action-selection / state-update method.
    def agent_start(self, observation):
        """Begin an episode and return the first action."""
        try:
            obs_array = self._observation_to_featurevec(observation)

            # Fresh history with the first observation in slot 0.
            self.state = np.zeros(
                (self.q_net.hist_size, self.q_net_input_dim), dtype=np.uint8)
            self.state[0] = obs_array

            action, q_now = self.q_net.e_greedy(self._network_input(),
                                                self.epsilon)

            # Remember what the next learning step needs.
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            return action
        except Exception:
            self._abort()

    # Action-selection / state-update method.
    def agent_step(self, observation):
        """Advance the state history and choose the next action.

        Returns:
            ``(action, eps, q_now, obs_array)``.
        """
        try:
            obs_array = self._observation_to_featurevec(observation)

            # Drop the oldest entry and append the new observation. Works for
            # any hist_size >= 1 (previously only 1, 2 and 4 were handled).
            if self.q_net.hist_size < 1:
                print("self.DQN.hist_size err")
            else:
                history = list(self.state[1:]) + [obs_array]
                self.state = np.asanyarray(history, dtype=np.uint8)

            state_ = self._network_input()

            if self.test is False:  # learning
                if self.q_net.initial_exploration < self.time:
                    self.epsilon = max(self.epsilon - self.epsilon_delta,
                                       self.min_eps)
                    eps = self.epsilon
                else:
                    # Act at random during the initial exploration steps.
                    print("Initial Exploration : %d/%d steps"
                          % (self.time, self.q_net.initial_exploration)),
                    eps = 1.0
            else:  # evaluation
                print("Policy is Frozen")
                eps = 0.0

            action, q_now = self.q_net.e_greedy(state_, eps)
            return action, eps, q_now, obs_array
        except Exception:
            self._abort()

    # Learning method.
    def agent_step_update(self, reward, action, eps, q_now):
        """Store the transition, learn from it, log and periodically save."""
        try:
            learning = self.test is False
            if learning:
                self.q_net.stock_experience(self.time, self.last_state,
                                            self.last_action, reward,
                                            self.state, False)
                self.q_net.experience_replay(self.time)

            self._update_target_if_due()

            # Simple text based visualisation.
            q_values = q_now.get() if self.use_gpu >= 0 else q_now
            q_max = np.max(q_values)
            print('Step:%d  Action:%d  Reward:%.1f  Epsilon:%.6f  Q_max:%3f'
                  % (self.time, self.q_net.action_to_index(action), reward,
                     eps, q_max))

            # NOTE(review): saving and the time count are treated as part of
            # the learning branch, mirroring agent_end -- confirm.
            if learning:
                self.last_action = copy.deepcopy(action)
                self.last_state = self.state.copy()
                self._save_model_if_due()
                self.time += 1
        except Exception:
            self._abort()

    # Learning method.
    def agent_end(self, reward):
        """Finish the episode, storing the terminal transition when learning."""
        try:
            print('episode finished. Reward:%.1f / Epsilon:%.6f'
                  % (reward, self.epsilon))

            learning = self.test is False
            if learning:
                self.q_net.stock_experience(self.time, self.last_state,
                                            self.last_action, reward,
                                            self.last_state, True)
                self.q_net.experience_replay(self.time)

            self._update_target_if_due()

            if learning:
                # Previously called save_model(self.time, self.velocity):
                # ``velocity`` does not exist on this class and the argument
                # order differed from agent_step_update.
                self._save_model_if_due()
                self.time += 1
        except Exception:
            self._abort()