import copy
import csv
import os
import random
from collections import deque

import numpy as np
import tensorflow as tf

# Peer, DeepQNetwork and delete_row are assumed to be provided by sibling
# modules of this repository.


def create_function_nn(sim_config):
    """Restore a trained Q-network to use as a fixed (inference-only) policy."""
    function_config = dict()
    function_path = sim_config['Function_path']
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        function_path)
    with open(path + '/agent_config.csv', 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            function_config[row[0]] = row[1]

    # The restored network is inference-only, so exploration and
    # replay-memory parameters are disabled.
    function_config['min_epsilon'] = 0
    function_config['epsilon_decaying_states'] = 0
    function_config['min_D_size'] = 0

    # The input shape depends on which transfer histories form the state.
    input_data = function_config['input_data']
    num_history = int(function_config['agent_history'])
    num_peer = int(sim_config['num_peer'])
    if input_data == 'upload' or input_data == 'download':
        width = num_history
        height = num_peer - 1
    elif input_data == 'upload_and_download':
        width = num_history
        height = (num_peer - 1) * 2
    function_config['width'] = width
    function_config['height'] = height

    neural_network = DeepQNetwork(width, height,
                                  int(function_config['fc2_outputs']),
                                  function_config)
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())
    neural_network.restore_parameters(
        sess,
        os.path.join(
            path, 'model', 'train_network', 'train_network-' +
            str(sim_config['Function_restore_checkpoint'])))
    return neural_network, sess, function_config
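# Usage sketch (illustrative; the config keys and values below are inferred
# from the lookups above, not prescribed): `agent_config.csv` holds key,value
# rows such as
#
#   agent_history,10
#   input_data,upload_and_download
#   fc2_outputs,30
#
# and a caller restores the frozen policy roughly like this:
#
#   network, sess, config = create_function_nn(sim_config)
#   # network.q_values(...) can then be evaluated for action selection;
#   # no training ops are built for this network.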
class DQNAgent(Peer):
    def __init__(self,
                 sim_config,
                 agent_config,
                 ID=0,
                 strategy='Agent',
                 training_flag=True):
        # parameters
        super().__init__(sim_config, ID, strategy)
        self.num_history = int(agent_config['agent_history'])
        self.training_flag = training_flag

        ## Action set: unchoke one other peer ([i]) or unchoke nobody ([]).
        self.enable_actions = list([i] for i in range(self.num_peer)
                                   if i != self.ID)
        self.enable_actions.insert(0, [])
        self.num_actions = self.num_peer

        ## Hyperparameters (see training_config.py for details)
        self.minibatch_size = int(agent_config['minibatch_size'])
        self.learning_rate = float(agent_config['learning_rate'])
        self.discount_factor = float(agent_config['discount_factor'])
        self.max_D_size = int(agent_config['max_D_size'])
        self.min_D_size = int(agent_config['min_D_size'])
        self.network_update_frequency = int(
            agent_config['network_update_frequency'])
        self.epsilon_decaying_states = int(
            agent_config['epsilon_decaying_states'])
        self.min_epsilon = float(agent_config['min_epsilon'])
        self.reward_config = str(agent_config['reward_config'])
        self.momentum = float(agent_config['momentum'])
        self.opt_epsilon = float(agent_config['opt_epsilon'])

        # replay memory
        self.D = deque(maxlen=self.max_D_size)

        # variables
        self.current_loss = 0.0
        self.current_Q_max = 0.0
        self.num_total_states = 0
        self.Q_max = 0
        self.action_t = []
        self.reward_t = [0]
        self.action_t_past = 0

        # model
        self.graph = tf.Graph()
        with self.graph.as_default():
            # input layer: the state is a window of per-peer transfer
            # histories, so its shape depends on which histories are used
            self.input_data = agent_config['input_data']
            if self.input_data == 'upload' or self.input_data == 'download':
                self.width = self.num_history
                self.height = self.num_peer - 1
            elif self.input_data == 'upload_and_download':
                self.width = self.num_history
                self.height = (self.num_peer - 1) * 2
            self.num_channels = int(agent_config['num_channels'])

            # train network
            self.tf_train_input = tf.placeholder(
                tf.float32,
                shape=(self.minibatch_size, self.height, self.width,
                       self.num_channels))
            self.tf_train_target = tf.placeholder(
                tf.float32, shape=(self.minibatch_size, self.num_actions))
            self.tf_filter_input = tf.placeholder(
                tf.float32, shape=(self.minibatch_size, self.num_actions))
            self.train_network = DeepQNetwork(self.width, self.height,
                                              self.num_actions, agent_config)
            # Alternative (2013-style DQN without a separate target network):
            #self.train_q_values = self.train_network.q_values(self.tf_train_input)

            # target network: a second copy of the Q-network whose weights
            # are synchronized periodically (2015 Nature-style DQN)
            self.tf_target_input = tf.placeholder(
                tf.float32,
                shape=(self.minibatch_size, self.height, self.width,
                       self.num_channels))
            self.target_network = DeepQNetwork(self.width, self.height,
                                               self.num_actions, agent_config)
            self.target_q_values = self.target_network.q_values(
                self.tf_target_input)

            # placeholder for action selection (a single state, batch size 1)
            self.tf_action_selection_input = tf.placeholder(
                tf.float32,
                shape=(1, self.height, self.width, self.num_channels))
            self.action_q_values = self.train_network.q_values(
                self.tf_action_selection_input)

            # loss function
            self.loss = self.train_network.clipped_loss(
                self.tf_train_input, self.tf_train_target,
                self.tf_filter_input)

            # optimizer
            self.optimizer = tf.train.RMSPropOptimizer(
                self.learning_rate,
                momentum=self.momentum,
                epsilon=self.opt_epsilon,
                name='RMSProp')
            self.training = self.optimizer.minimize(self.loss,
                                                    name='training')

            self.sess = tf.InteractiveSession()
            self.sess.run(tf.global_variables_initializer())
            self.update_target_network()

    def update_target_network(self):
        self.train_network.copy_network_to(self.target_network, self.sess)
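    # Assumed interface of DeepQNetwork (defined elsewhere in this repo);
    # a minimal sketch of what the calls in this class rely on:
    #
    #   q_values(input_ph)                -> Tensor [batch, num_actions]
    #   clipped_loss(x, target, filter)   -> scalar loss Tensor
    #   copy_network_to(other, sess)      -> hard-copies weights into `other`
    #   save_parameters(sess, path, step) -> writes a checkpoint
    #   restore_parameters(sess, path)    -> restores a checkpoint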
    def select_random_action(self, neighbor_leecher_list, unchoke_num):
        unchoke_num = self.calculate_unchoke_num(unchoke_num)
        return random.sample(neighbor_leecher_list, unchoke_num)

    def get_q_max(self):
        if self.num_history > len(self.download_history):
            return 0
        else:
            state = self.get_cur_history()
            state = np.reshape(
                state, [1, self.height, self.width, self.num_channels])
            q_values = self.action_q_values.eval(
                session=self.sess,
                feed_dict={self.tf_action_selection_input: state})[0]
            return np.max(q_values)

    def select_greedy_action(self, state, neighbor_leecher_list):
        q_values = self.action_q_values.eval(
            session=self.sess,
            feed_dict={self.tf_action_selection_input: state})[0]
        # Walk the actions from highest to lowest Q-value and take the first
        # feasible one: either "unchoke nobody" or a current leecher.
        q_index = np.argsort(q_values)[::-1]
        for q in q_index:
            if self.enable_actions[q] == []:
                return []
            elif self.enable_actions[q][0] in neighbor_leecher_list:
                return self.enable_actions[q]

    def upload(self, status_list):
        self.enable_actions = list([i] for i in range(self.num_actions)
                                   if i != self.ID)
        self.enable_actions.insert(0, [])
        epsilon = self.calculate_epsilon()
        neighbor_leecher_list = [
            i for i in range(len(status_list))
            if status_list[i] != 'Seeder' and i != self.ID
        ]
        self.unchoke_probability = 1 - (1 / (len(neighbor_leecher_list) + 1))
        unchoke_num = min(self.num_unchoke, len(neighbor_leecher_list))
        ### no neighboring leechers: nobody to unchoke
        if neighbor_leecher_list == []:
            self.action_t = []
            return self.action_t
        ### not enough history or replay memory yet: act randomly
        elif (len(self.download_history) <= self.num_history
              or not self.has_enough_memory()):
            self.action_t = self.select_random_action(neighbor_leecher_list,
                                                      unchoke_num)
            return self.action_t
        else:
            ### with probability epsilon, fall back to the random strategy
            if random.random() <= epsilon:
                self.action_t = self.select_random_action(
                    neighbor_leecher_list, unchoke_num)
                return self.action_t
            else:
                ### build the state from the most recent history window
                state = self.get_cur_history()
                state = np.reshape(
                    state, [1, self.height, self.width, self.num_channels])
                ### select the action with the highest feasible Q-value
                self.action_t = self.select_greedy_action(
                    state, neighbor_leecher_list)
                return self.action_t

    def store_experience(self, state, action, reward, state_1, terminal):
        self.D.append((state, action, reward, state_1, terminal))

    def experience_replay(self):
        state_minibatch = []
        action_minibatch = []
        reward_minibatch = []
        state_1_minibatch = []
        terminal_minibatch = []

        # sample a random minibatch of transitions (has_enough_memory() is
        # expected to guarantee at least minibatch_size stored transitions)
        minibatch_size = min(len(self.D), self.minibatch_size)
        minibatch_indexes = np.random.randint(0, len(self.D), minibatch_size)
        for j in minibatch_indexes:
            state_j, action_j, reward_j, state_j_1, terminal = self.D[j]
            state_minibatch.append(state_j)
            action_minibatch.append(action_j)
            reward_minibatch.append(reward_j)
            state_1_minibatch.append(state_j_1)
            terminal_minibatch.append(terminal)

        state_minibatch = np.reshape(state_minibatch, [
            self.minibatch_size, self.height, self.width, self.num_channels
        ])
        state_1_minibatch = np.reshape(state_1_minibatch, [
            self.minibatch_size, self.height, self.width, self.num_channels
        ])

        ### Q-values of the successor states from the target network (these
        ### become the training targets). Evaluating self.train_q_values here
        ### instead would skip the target network, as in the 2013 DQN.
        target_qs = self.target_q_values.eval(
            session=self.sess,
            feed_dict={self.tf_target_input: state_1_minibatch})

        target = np.zeros(shape=(self.minibatch_size, self.num_actions),
                          dtype=np.float32)
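        # `target` holds the Bellman backup for the taken action a_j; all
        # other entries stay 0 and are masked out by `q_value_filter` below:
        #     y_j = r_j                                        if terminal
        #     y_j = r_j + gamma * max_a' Q_target(s_{j+1}, a')  otherwise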
        q_value_filter = np.zeros(shape=(self.minibatch_size,
                                         self.num_actions),
                                  dtype=np.float32)
        for i in range(self.minibatch_size):
            terminal = terminal_minibatch[i]
            action_index = action_minibatch[i]
            reward = reward_minibatch[i]
            # standard DQN backup: discount future value by gamma
            target[i][action_index] = (
                reward if terminal else
                reward + self.discount_factor * np.max(target_qs[i]))
            q_value_filter[i][action_index] = 1.0

        _, self.current_loss = self.sess.run(
            [self.training, self.loss],
            feed_dict={
                self.tf_train_input: state_minibatch,
                self.tf_train_target: target,
                self.tf_filter_input: q_value_filter
            })

    def load_model(self, file_path):
        self.train_network.restore_parameters(self.sess, file_path)

    ### save the CNN models (train and target networks)
    def save_model(self, save_path, num_episode):
        if not os.path.exists(
                os.path.join(save_path, 'model', 'train_network')):
            os.makedirs(os.path.join(save_path, 'model', 'train_network'))
        if not os.path.exists(
                os.path.join(save_path, 'model', 'target_network')):
            os.makedirs(os.path.join(save_path, 'model', 'target_network'))
        self.train_network.save_parameters(
            self.sess, save_path + '/model/train_network/train_network',
            num_episode)
        self.target_network.save_parameters(
            self.sess, save_path + '/model/target_network/train_network',
            num_episode)

    def get_cur_history(self):
        len_his = len(self.download_history)
        cur_up = copy.deepcopy(self.upload_history[len_his -
                                                   self.num_history:])
        cur_up = delete_row(cur_up, self.ID)
        cur_down = copy.deepcopy(self.download_history[len_his -
                                                       self.num_history:])
        cur_down = delete_row(cur_down, self.ID)
        if self.input_data == 'upload':
            return np.transpose(cur_up)
        elif self.input_data == 'download':
            return np.transpose(cur_down)
        elif self.input_data == 'upload_and_download':
            up_down = np.r_[cur_up, cur_down]
            up_down = up_down.transpose()
            if np.shape(up_down) == (self.num_peer - 1, self.num_history * 2):
                up_down = up_down.reshape((self.num_peer - 1) * 2,
                                          self.num_history)
            else:
                up_down = None
            return up_down

    def calculate_epsilon(self):
        # linearly anneal epsilon from 1.0 down to min_epsilon over
        # epsilon_decaying_states states (0 disables annealing)
        if self.epsilon_decaying_states == 0:
            return self.min_epsilon
        else:
            return max(
                self.min_epsilon,
                1.0 - (self.num_total_states / self.epsilon_decaying_states))

    def has_enough_memory(self):
        return len(self.D) >= self.min_D_size

    def next_step(self):
        if self.num_history < len(
                self.download_history) and self.training_flag:
            self.num_total_states += 1
            # state, action and reward at time t
            state_t = self.state_t_past
            action_t = self.action_t_past
            state_t_1 = self.get_cur_history()
            # save the current state and action for use in the next step
            self.state_t_past = state_t_1
            self.action_t_past = self.action_t

            self.reward_t = []
            if self.reward_config == 'all_download':
                self.reward_t.append(sum(self.current_download))
            elif self.reward_config == 'each_download':
                for a_t in action_t:
                    self.reward_t.append(self.current_download[a_t])
            elif self.reward_config == 'each_download_penalty':
                for a_t in action_t:
                    each_download = self.current_download[a_t]
                    if a_t == self.ID:
                        self.reward_t.append(0)
                    elif each_download >= 1:
                        self.reward_t.append(1)
                    else:
                        self.reward_t.append(-1)
            reward_t = self.reward_t

            # advance the peer state
            super().next_step()
            terminal = self.is_seeder()

            ## store the transition in the replay memory
            self.store_experience(state_t,
                                  [self.enable_actions.index(action_t)],
                                  reward_t, state_t_1, terminal)

            ## start learning once enough transitions have accumulated
            if self.has_enough_memory():
                if (self.num_total_states %
                        self.network_update_frequency) == 0:
                    print('-----update_network------')
                    self.update_target_network()
                self.experience_replay()
        else:
            # warm-up: not enough history yet, so only record the current
            # state and action for the next step
            self.state_t_past = self.get_cur_history()
            self.action_t_past = self.action_t
            super().next_step()
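    # Attributes used above but inherited from Peer (defined elsewhere in
    # this repo): self.num_peer, self.num_unchoke, self.ID,
    # self.upload_history, self.download_history, self.current_download,
    # plus the methods self.calculate_unchoke_num() and self.is_seeder().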
    ### The object is not re-created between episodes; call this method to
    ### move on to the next episode.
    def reset_history(self):
        # pieces held (default 0)
        self.have_piece = 0
        # upload amounts to each neighboring peer in the current step
        # (in practice probably only ever 0 or 1)
        self.current_upload = [0 for i in range(self.num_peer)]
        # download amounts from each neighboring peer in the current step
        self.current_download = [0 for i in range(self.num_peer)]
        # history of uploads to neighboring peers
        self.upload_history = [[0 for i in range(self.num_peer)]]
        # history of downloads from neighboring peers; seeded with all 1s so
        # the initial behavior is effectively random (seeding with 0s and
        # setting only the oldest entry to 1 would also work)
        self.download_history = [[1 for i in range(self.num_peer)]]
        self.strategy = 'Agent'
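# Usage sketch (hypothetical driver, kept as a comment because the simulator
# classes that supply `status_list` and exchange pieces live elsewhere in
# this repo): one training episode would drive the agent roughly like this.
#
#   agent = DQNAgent(sim_config, agent_config, ID=0)
#   for step in range(max_steps):
#       actions = agent.upload(status_list)  # choose whom to unchoke
#       ...                                  # simulator exchanges pieces
#       agent.next_step()                    # store transition, maybe learn
#   agent.save_model(save_path, num_episode)
#   agent.reset_history()                    # reuse the same object next episode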