import copy

import chainer.functions as functions
import chainer.links as links
from chainer import Chain


class ChainerDQNclass:
    STATE_FRAMES = 4  # number of frames to store in the state

    def __init__(self):
        self.num_of_actions = 4

        print("Initializing DQN...")
        print("Model Building")

        # Convolutional Q-network; the 3136-unit Linear layer matches the
        # flattened conv3 output for 84x84 input frames (64 * 7 * 7).
        self.model = Chain(
            l1=links.Convolution2D(self.STATE_FRAMES, 32, ksize=8, stride=4,
                                   nobias=False, wscale=0.01),
            l2=links.Convolution2D(32, 64, ksize=4, stride=2,
                                   nobias=False, wscale=0.01),
            l3=links.Convolution2D(64, 64, ksize=3, stride=1,
                                   nobias=False, wscale=0.01),
            l4=links.Linear(3136, 512, wscale=0.01),
            q_value=links.Linear(512, self.num_of_actions)).to_gpu()

        # Separate target network, refreshed by copying the online model
        self.model_target = copy.deepcopy(self.model)

        # self.optimizer = optimizers.Adam(alpha=1e-6)
        # self.optimizer.use_cleargrads()
        # self.optimizer.setup(self.model)

    def Q_func(self, state):
        h1 = functions.relu(self.model.l1(state))  # scale inputs in [0.0, 1.0]
        h2 = functions.relu(self.model.l2(h1))
        h3 = functions.relu(self.model.l3(h2))
        h4 = functions.relu(self.model.l4(h3))
        Q = self.model.q_value(h4)
        return Q

    def Q_func_target(self, state):
        h1 = functions.relu(self.model_target.l1(state))  # scale inputs in [0.0, 1.0]
        h2 = functions.relu(self.model_target.l2(h1))
        h3 = functions.relu(self.model_target.l3(h2))
        h4 = functions.relu(self.model_target.l4(h3))
        Q = self.model_target.q_value(h4)
        return Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)
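# ---------------------------------------------------------------------------
# Usage sketch (not part of the original source): a minimal forward pass
# through ChainerDQNclass, assuming a CUDA-capable GPU (the constructor calls
# to_gpu()) and 84x84 preprocessed frames, which is what the 3136-unit Linear
# layer implies. The random input below is purely illustrative.
# ---------------------------------------------------------------------------
import numpy as np
from chainer import Variable, cuda

dqn = ChainerDQNclass()
frames = np.random.rand(1, ChainerDQNclass.STATE_FRAMES, 84, 84).astype(np.float32)
q = dqn.Q_func(Variable(cuda.to_gpu(frames)))
action = int(np.argmax(cuda.to_cpu(q.data)))
print("greedy action: {}".format(action))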
import copy

import numpy as np

import chainer.functions as F
import chainer.initializers as I
import chainer.links as L
from chainer import Chain, Variable, cuda, optimizers

# app_logger is assumed to be the project's logging helper (not shown here)


class QNet:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**3  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6
    hist_size = 1  # original: 4

    def __init__(self, use_gpu, enable_controller, dim, epsilon, epsilon_delta, min_eps):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim
        self.epsilon = epsilon
        self.epsilon_delta = epsilon_delta
        self.min_eps = min_eps
        self.time = 0

        app_logger.info("Initializing Q-Network...")

        hidden_dim = 256
        self.model = Chain(
            l4=L.Linear(self.dim * self.hist_size, hidden_dim,
                        initialW=I.HeNormal()),
            q_value=L.Linear(hidden_dim, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, hidden_dim),
                                               dtype=np.float32)))
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95,
                                                  momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model)

        # History Data : D=[s, a, r, s_dash, end_episode_flag]
        self.d = [
            np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
            np.zeros(self.data_size, dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.int8),
            np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=bool)
        ]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in range(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))
        q = self.model.q_value(h4)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))
        q = self.model_target.q_value(h4)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            app_logger.info(" Random")
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            app_logger.info("#Greedy")
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)

    def start(self, feature):
        self.state = np.zeros((self.hist_size, self.dim), dtype=np.uint8)
        self.state[0] = feature

        state_ = np.asanyarray(self.state.reshape(1, self.hist_size, self.dim),
                               dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Generate an Action e-greedy
        action, q_now = self.e_greedy(state_, self.epsilon)
        return_action = action

        return return_action

    def update_model(self, replayed_experience):
        if replayed_experience[0]:
            self.optimizer.target.cleargrads()
            loss, _ = self.forward(replayed_experience[1], replayed_experience[2],
                                   replayed_experience[3], replayed_experience[4],
                                   replayed_experience[5])
            loss.backward()
            self.optimizer.update()

        # Target model update
        if replayed_experience[0] and np.mod(self.time, self.target_model_update_freq) == 0:
            app_logger.info("Model Updated")
            self.target_model_update()

        self.time += 1
        app_logger.info("step: {}".format(self.time))

    def step(self, features):
        if self.hist_size == 4:
            self.state = np.asanyarray(
                [self.state[1], self.state[2], self.state[3], features],
                dtype=np.uint8)
        elif self.hist_size == 2:
            self.state = np.asanyarray([self.state[1], features], dtype=np.uint8)
        elif self.hist_size == 1:
            self.state = np.asanyarray([features], dtype=np.uint8)
        else:
            app_logger.error("self.DQN.hist_size err")

        state_ = np.asanyarray(self.state.reshape(1, self.hist_size, self.dim),
                               dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Exploration decays along the time sequence
        if self.initial_exploration < self.time:
            self.epsilon -= self.epsilon_delta
            if self.epsilon < self.min_eps:
                self.epsilon = self.min_eps
            eps = self.epsilon
        else:  # Initial Exploration Phase
            app_logger.info("Initial Exploration : {}/{} steps".format(
                self.time, self.initial_exploration))
            eps = 1.0

        # Generate an Action by e-greedy action selection
        action, q_now = self.e_greedy(state_, eps)

        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        return action, eps, q_max
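# ---------------------------------------------------------------------------
# Usage sketch (not part of the original source): driving this QNet from an
# agent loop on the CPU (use_gpu = -1), assuming the project's app_logger is
# available. The feature dimensionality, action set, and epsilon schedule
# below are placeholder values; the list handed to update_model follows the
# [do_update, s, a, r, s_dash, episode_end] layout that forward() expects.
# ---------------------------------------------------------------------------
dim = 100                                   # assumed feature vector length
qnet = QNet(use_gpu=-1, enable_controller=[0, 1, 2], dim=dim,
            epsilon=1.0, epsilon_delta=1e-5, min_eps=0.1)

action = qnet.start(np.zeros(dim, dtype=np.uint8))            # first observation
action, eps, q_max = qnet.step(np.ones(dim, dtype=np.uint8))  # subsequent steps
qnet.update_model([False])  # no replay batch yet; only advances the step counter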
import pickle

import numpy as np

import chainer.functions as F
import chainer.links as L
from chainer import Chain, Variable, cuda, optimizers


class QNet:
    # Hyper-Parameters
    gamma = 0.95  # Discount factor
    timestep_per_episode = 5000
    initial_exploration = timestep_per_episode * 1  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    hist_size = 2  # original: 4
    data_index = 0
    data_flag = False
    loss_log = '../playground/Assets/log/'

    def __init__(self, use_gpu, enable_controller, cnn_input_dim, feature_dim,
                 agent_count, other_input_dim, model):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.cnn_input_dim = cnn_input_dim
        self.feature_dim = feature_dim
        self.agent_count = agent_count
        self.other_input_dim = other_input_dim
        self.data_size = self.timestep_per_episode
        self.loss_log_file = self.loss_log + "loss.log"
        self.loss_per_episode = 0
        self.time_of_episode = 0

        print("Initializing Q-Network...")

        if model == 'None':
            self.model = Chain(
                conv1=L.Convolution2D(3 * self.hist_size, 32, 4, stride=2),
                bn1=L.BatchNormalization(32),
                conv2=L.Convolution2D(32, 32, 4, stride=2),
                bn2=L.BatchNormalization(32),
                conv3=L.Convolution2D(32, 32, 4, stride=2),
                bn3=L.BatchNormalization(32),
                # conv4=L.Convolution2D(64, 64, 4, stride=2),
                # bn4=L.BatchNormalization(64),
                l1=L.Linear(self.feature_dim + self.other_input_dim * self.hist_size, 128),
                l2=L.Linear(128, 128),
                l3=L.Linear(128, 96),
                l4=L.Linear(96, 64),
                q_value=L.Linear(64, self.num_of_actions))
        else:
            with open(model, 'rb') as i:
                self.model = pickle.load(i)
            self.data_size = 0

        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.optimizer = optimizers.RMSpropGraves()
        self.optimizer.setup(self.model)

        # History Data : D=[s_cnn, s_other, a, r, end_episode_flag]
        self.d = [
            np.zeros((self.agent_count, self.data_size, self.hist_size, 128, 128, 3),
                     dtype=np.uint8),
            np.zeros((self.agent_count, self.data_size, self.hist_size, self.other_input_dim),
                     dtype=np.uint8),
            np.zeros((self.agent_count, self.data_size), dtype=np.uint8),
            np.zeros((self.agent_count, self.data_size, 1), dtype=np.float32),
            np.zeros((self.agent_count, self.data_size, 1), dtype=bool)
        ]

    def _reshape_for_cnn(self, state, batch_size, hist_size, x, y):
        # Stack the hist_size frames channel-wise: (batch, 3*hist_size, 128, 128)
        state_ = np.zeros((batch_size, 3 * hist_size, 128, 128), dtype=np.float32)

        for i in range(batch_size):
            if self.hist_size == 1:
                state_[i] = state[i][0].transpose(2, 0, 1)
            elif self.hist_size == 2:
                state_[i] = np.c_[state[i][0], state[i][1]].transpose(2, 0, 1)
            elif self.hist_size == 4:
                state_[i] = np.c_[state[i][0], state[i][1],
                                  state[i][2], state[i][3]].transpose(2, 0, 1)

        return state_

    def forward(self, state_cnn, state_other, action, reward,
                state_cnn_dash, state_other_dash, episode_end):
        num_of_batch = state_cnn.shape[0]
        s_cnn = Variable(state_cnn)
        s_oth = Variable(state_other)
        s_cnn_dash = Variable(state_cnn_dash)
        s_oth_dash = Variable(state_other_dash)

        q = self.q_func(s_cnn, s_oth)  # Get Q-value

        max_q_dash_ = self.q_func(s_cnn_dash, s_oth_dash)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, max_q_dash_.data.get()))
        else:
            tmp = list(map(np.max, max_q_dash_.data))
        max_q_dash = np.asanyarray(tmp, dtype=np.float32)

        if self.use_gpu >= 0:
            target = np.array(q.data.get(), dtype=np.float32)
        else:
            target = np.array(q.data, dtype=np.float32)

        for i in range(num_of_batch):
            tmp_ = reward[i] + (1 - episode_end[i]) * self.gamma * max_q_dash[i]
            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        if self.use_gpu >= 0:
            loss = F.mean_squared_error(Variable(cuda.to_gpu(target)), q)
        else:
            loss = F.mean_squared_error(Variable(target), q)

        return loss, q

    def stock_experience(self, time, state_cnn, state_other, action,
                         reward, state_cnn_dash, state_other_dash,
                         episode_end_flag):
        for i in range(self.agent_count):
            self.d[0][i][self.data_index] = state_cnn[i].copy()
            self.d[1][i][self.data_index] = state_other[i].copy()
            self.d[2][i][self.data_index] = action[i].copy()
            self.d[3][i][self.data_index] = reward[i].copy()
            self.d[4][i][self.data_index] = episode_end_flag

        self.data_index += 1

        if self.data_index >= self.data_size:
            self.data_index -= self.data_size
            self.data_flag = True

    def experience_replay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            replayRobotIndex = np.random.randint(0, self.agent_count, self.replay_size)
            if not self.data_flag:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, self.data_index, self.replay_size)
            else:
                replay_index = np.random.randint(0, self.data_size, self.replay_size)

            s_cnn_replay = np.ndarray(shape=(self.replay_size, self.hist_size, 128, 128, 3),
                                      dtype=np.float32)
            s_oth_replay = np.ndarray(shape=(self.replay_size, self.hist_size,
                                             self.other_input_dim), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_cnn_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size, 128, 128, 3),
                                           dtype=np.float32)
            s_oth_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size,
                                                  self.other_input_dim), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=bool)

            for i in range(self.replay_size):
                s_cnn_replay[i] = np.asarray(
                    self.d[0][replayRobotIndex[i]][replay_index[i]], dtype=np.float32)
                s_oth_replay[i] = np.asarray(
                    self.d[1][replayRobotIndex[i]][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.d[2][replayRobotIndex[i]][replay_index[i]]
                r_replay[i] = self.d[3][replayRobotIndex[i]][replay_index[i]]
                if replay_index[i] + 1 >= self.data_size:
                    # the successor state wraps around the circular buffer
                    s_cnn_dash_replay[i] = np.array(
                        self.d[0][replayRobotIndex[i]][replay_index[i] + 1 - self.data_size],
                        dtype=np.float32)
                    s_oth_dash_replay[i] = np.array(
                        self.d[1][replayRobotIndex[i]][replay_index[i] + 1 - self.data_size],
                        dtype=np.float32)
                else:
                    s_cnn_dash_replay[i] = np.array(
                        self.d[0][replayRobotIndex[i]][replay_index[i] + 1], dtype=np.float32)
                    s_oth_dash_replay[i] = np.array(
                        self.d[1][replayRobotIndex[i]][replay_index[i] + 1], dtype=np.float32)
                episode_end_replay[i] = self.d[4][replayRobotIndex[i]][replay_index[i]]

            s_cnn_replay = self._reshape_for_cnn(s_cnn_replay, self.replay_size,
                                                 self.hist_size, 128, 128)
            s_cnn_dash_replay = self._reshape_for_cnn(s_cnn_dash_replay, self.replay_size,
                                                      self.hist_size, 128, 128)

            s_cnn_replay /= 255.0
            s_oth_replay /= 255.0
            s_cnn_dash_replay /= 255.0
            s_oth_dash_replay /= 255.0

            if self.use_gpu >= 0:
                s_cnn_replay = cuda.to_gpu(s_cnn_replay)
                s_oth_replay = cuda.to_gpu(s_oth_replay)
                s_cnn_dash_replay = cuda.to_gpu(s_cnn_dash_replay)
                s_oth_dash_replay = cuda.to_gpu(s_oth_dash_replay)

            # Gradient-based update
            loss, _ = self.forward(s_cnn_replay, s_oth_replay, a_replay, r_replay,
                                   s_cnn_dash_replay, s_oth_dash_replay,
                                   episode_end_replay)

            send_loss = loss.data
            with open(self.loss_log_file, 'a') as the_file:
                the_file.write(str(time) + "," + str(send_loss) + "\n")

            self.loss_per_episode += loss.data
            self.time_of_episode += 1

            self.model.zerograds()
            loss.backward()
            self.optimizer.update()

    def q_func(self, state_cnn, state_other):
        if self.use_gpu >= 0:
            num_of_batch = state_cnn.data.get().shape[0]
        else:
            num_of_batch = state_cnn.data.shape[0]

        h1 = F.tanh(self.model.bn1(self.model.conv1(state_cnn)))
        h2 = F.tanh(self.model.bn2(self.model.conv2(h1)))
        h3 = F.tanh(self.model.bn3(self.model.conv3(h2)))
        # h4 = F.tanh(self.model.bn4(self.model.conv4(h3)))
        # h5 = F.tanh(self.model.bn5(self.model.conv5(h4)))

        # Concatenate the flattened conv features with the non-image inputs
        h4_ = F.concat(
            (F.reshape(h3, (num_of_batch, self.feature_dim)),
             F.reshape(state_other, (num_of_batch, self.other_input_dim * self.hist_size))),
            axis=1)

        h6 = F.relu(self.model.l1(h4_))
        h7 = F.relu(self.model.l2(h6))
        h8 = F.relu(self.model.l3(h7))
        h9 = F.relu(self.model.l4(h8))
        q = self.model.q_value(h9)
        return q

    def e_greedy(self, state_cnn, state_other, epsilon, reward):
        s_cnn = Variable(state_cnn)
        s_oth = Variable(state_other)

        q = self.q_func(s_cnn, s_oth)
        q = q.data
        if self.use_gpu >= 0:
            q_ = q.get()
        else:
            q_ = q

        index_action = np.zeros((self.agent_count), dtype=np.uint8)
        print("agent", end=' ')
        for i in range(self.agent_count):
            if np.random.rand() < epsilon:
                index_action[i] = np.random.randint(0, self.num_of_actions)
                print("[%02d] Random(%2d)reward(%06.2f)" % (i, index_action[i], reward[i]),
                      end=' ')
            else:
                index_action[i] = np.argmax(q_[i])
                print("[%02d]!Greedy(%2d)reward(%06.2f)" % (i, index_action[i], reward[i]),
                      end=' ')
            if i % 5 == 4:
                print("\n ", end=' ')
        del q_

        return self.index_to_action(index_action), q

    def index_to_action(self, index_of_action):
        index = np.zeros((self.agent_count), dtype=np.uint8)
        for i in range(self.agent_count):
            index[i] = self.enable_controller[index_of_action[i]]
        return index

    def action_to_index(self, action):
        return self.enable_controller.index(action)
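# ---------------------------------------------------------------------------
# Usage sketch (not part of the original source): a rough shape check for the
# multi-agent QNet above, assuming 2 agents, 128x128x3 frames, hist_size = 2,
# and CPU execution (use_gpu = -1). With the three stride-2 convolutions,
# 128x128 inputs shrink to 32 x 14 x 14, so feature_dim must be 32 * 14 * 14.
# The other_input_dim and epsilon values are arbitrary placeholders.
# ---------------------------------------------------------------------------
agent_count = 2
other_input_dim = 10
qnet = QNet(use_gpu=-1, enable_controller=[0, 1, 2, 3],
            cnn_input_dim=128, feature_dim=32 * 14 * 14,
            agent_count=agent_count, other_input_dim=other_input_dim,
            model='None')

state_cnn = np.zeros((agent_count, qnet.hist_size, 128, 128, 3), dtype=np.float32)
state_oth = np.zeros((agent_count, qnet.hist_size, other_input_dim), dtype=np.float32)
reward = np.zeros(agent_count, dtype=np.float32)

# e_greedy expects the CNN input already reshaped to (batch, 3*hist_size, 128, 128)
cnn_in = qnet._reshape_for_cnn(state_cnn, agent_count, qnet.hist_size, 128, 128)
actions, q = qnet.e_greedy(cnn_in, state_oth, epsilon=0.1, reward=reward)
print("\nactions per agent:", actions)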